===================================================================
@@ -0,0 +1,23 @@
+void
+f (int i, float j, int k)
+{
+#pragma acc parallel num_gangs (i) num_workers (i) vector_length (i)
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc parallel num_gangs (j) /* { dg-error "'num_gangs' expression must be integral" } */
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc parallel num_workers (j) /* { dg-error "'num_workers' expression must be integral" } */
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc parallel vector_length (j) /* { dg-error "'vector_length' expression must be integral" } */
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+}
===================================================================
@@ -0,0 +1,70 @@
+void
+f (int i, int j, int k)
+{
+#pragma acc kernels
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels
+#pragma acc loop gang (num: 10)
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 10)
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10)
+ for (i = 0; i < 20; ++i)
+ ;
+
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10, *) /* { dg-error "duplicate operand to clause" } */
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10, static: *) /* { dg-error "duplicate 'num' argument" } */
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels
+#pragma acc loop worker (static: 234) /* { dg-error "expected 'num' before" } */
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels
+#pragma acc loop worker (num: 234)
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels
+#pragma acc loop worker (num: 234, num: 12) /* { dg-error "duplicate operand to clause" } */
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels
+#pragma acc loop vector /* { dg-error "gang, worker and vector must occur in this order in a loop nest" } */
+ for (i = 0; i < 20; ++i)
+#pragma acc loop worker
+ for (j = 0; j < 25; ++j)
+ ;
+
+#pragma acc kernels
+#pragma acc loop worker (length: 20) /* { dg-error "expected 'num' before 'length'" } */
+ for (i = 0; i < 20; ++i)
+#pragma acc loop vector (length: 10)
+ for (j = 0; j < 25; ++j)
+ ;
+
+#pragma acc kernels
+#pragma acc loop worker
+ for (i = 0; i < 20; ++i)
+#pragma acc loop vector
+ for (j = 0; j < 25; ++j)
+ ;
+}
===================================================================
@@ -0,0 +1,43 @@
+void
+f (int i, int j, int k)
+{
+#pragma acc kernels num_gangs (10) /* { dg-error "'num_gangs' is not valid" } */
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels num_workers (10) /* { dg-error "'num_workers' is not valid" } */
+#pragma acc loop worker
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc kernels vector_length (10) /* { dg-error "'vector_length' is not valid" } */
+#pragma acc loop vector
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc parallel num_gangs (10) num_workers (20) vector_length (32)
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc parallel num_gangs (i) num_workers (j) vector_length (k)
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc parallel num_gangs (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc parallel num_workers (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+
+#pragma acc parallel vector_length (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+ for (i = 0; i < 20; ++i)
+ ;
+}
===================================================================
@@ -54,7 +54,7 @@ main (int argc, char **argv)
#pragma acc enter data copyin (a[0:N]) async
#pragma acc enter data copyin (b[0:N]) async wait
#pragma acc enter data copyin (N) async wait
-#pragma acc parallel async wait
+#pragma acc parallel async wait present (a[0:N]) present (b[0:N]) present (N)
#pragma acc loop
for (i = 0; i < N; i++)
b[i] = a[i];
===================================================================
@@ -9,48 +9,16 @@
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
CUstream stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "./subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
- abort ();
- }
-
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
- r = cuModuleLoad (&module, "subr.ptx");
- if (r != CUDA_SUCCESS)
- {
fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
@@ -62,20 +30,6 @@ main (int argc, char **argv)
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
-
- dtime = 200.0;
-
- dticks = (unsigned long) (dtime * clkrate);
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
stream = (CUstream) acc_get_cuda_stream (0);
if (stream != NULL)
abort ();
@@ -90,7 +44,7 @@ main (int argc, char **argv)
if (!acc_set_cuda_stream (0, stream))
abort ();
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -97,25 +51,15 @@ main (int argc, char **argv)
abort ();
}
- if (acc_async_test (0) != 0)
- {
- fprintf (stderr, "asynchronous operation not running\n");
- abort ();
- }
+ if (acc_async_test (0) == 1)
+ fprintf (stderr, "expected asynchronous operation to be running\n");
- sleep (1);
+ acc_wait_all ();
- if (acc_async_test (0) != 1)
- {
- fprintf (stderr, "found asynchronous operation still running\n");
- abort ();
- }
+ if (acc_async_test (0) == 0)
+ fprintf (stderr, "expected asynchronous operation to be running\n");
- acc_unmap_data (a);
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
exit (0);
===================================================================
@@ -1,6 +1,7 @@
/* { dg-do run { target openacc_nvidia_accel_selected } } */
/* { dg-additional-options "-lcuda" } */
+#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -10,47 +11,17 @@
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
- const int N = 10;
+ const int N = 3;
int i;
CUstream streams[N];
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t diff;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
- abort ();
- }
-
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
@@ -65,20 +36,6 @@ main (int argc, char **argv)
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
-
- dtime = 200.0;
-
- dticks = (unsigned long) (dtime * clkrate);
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
for (i = 0; i < N; i++)
{
streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -96,9 +53,29 @@ main (int argc, char **argv)
abort ();
}
+ gettimeofday (&tv1, NULL);
+
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[0], NULL, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+ abort ();
+ }
+
+ r = cuCtxSynchronize ();
+ if (r != CUDA_SUCCESS)
+ {
+ fprintf (stderr, "cuCtxLaunch failed: %d\n", r);
+ abort ();
+ }
+
+ gettimeofday (&tv2, NULL);
+
+ diff = tv2.tv_sec - tv1.tv_sec;
+
for (i = 0; i < N; i++)
{
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,7 +89,7 @@ main (int argc, char **argv)
}
}
- sleep ((int) (dtime / 1000.0f) + 1);
+ sleep ((diff + 1) * N);
for (i = 0; i < N; i++)
{
@@ -123,11 +100,7 @@ main (int argc, char **argv)
}
}
- acc_unmap_data (a);
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
exit (0);
===================================================================
@@ -9,45 +9,13 @@
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
CUstream stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
- abort ();
- }
-
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
@@ -62,20 +30,6 @@ main (int argc, char **argv)
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
-
- dtime = 200.0;
-
- dticks = (unsigned long) (dtime * clkrate);
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
if (r != CUDA_SUCCESS)
{
@@ -85,7 +39,7 @@ main (int argc, char **argv)
acc_set_cuda_stream (0, stream);
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -99,7 +53,7 @@ main (int argc, char **argv)
abort ();
}
- sleep ((int) (dtime / 1000.0f) + 1);
+ sleep (1);
if (acc_async_test (1) != 1)
{
@@ -107,11 +61,6 @@ main (int argc, char **argv)
abort ();
}
- acc_unmap_data (a);
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
return 0;
===================================================================
@@ -10,45 +10,13 @@
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
CUstream stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
- abort ();
- }
-
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
@@ -63,20 +31,6 @@ main (int argc, char **argv)
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
-
- dtime = 200.0;
-
- dticks = (unsigned long) (dtime * clkrate);
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
if (r != CUDA_SUCCESS)
{
@@ -87,7 +41,7 @@ main (int argc, char **argv)
if (!acc_set_cuda_stream (0, stream))
abort ();
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -100,7 +54,12 @@ main (int argc, char **argv)
abort ();
}
- sleep ((int) (dtime / 1000.f) + 1);
+ r = cuCtxSynchronize ();
+ if (r != CUDA_SUCCESS)
+ {
+ fprintf (stderr, "cuCtxSynchronize () failed: %d\n", r);
+ abort ();
+ }
if (acc_async_test_all () != 1)
{
@@ -108,11 +67,6 @@ main (int argc, char **argv)
abort ();
}
- acc_unmap_data (a);
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
exit (0);
===================================================================
@@ -1,6 +1,7 @@
/* { dg-do run { target openacc_nvidia_accel_selected } } */
/* { dg-additional-options "-lcuda" } */
+#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
@@ -10,47 +11,15 @@
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
- const int N = 10;
+ const int N = 6;
int i;
CUstream streams[N];
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
- abort ();
- }
-
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
@@ -65,20 +34,6 @@ main (int argc, char **argv)
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
-
- dtime = 200.0;
-
- dticks = (unsigned long) (dtime * clkrate);
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
for (i = 0; i < N; i++)
{
streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -98,13 +53,12 @@ main (int argc, char **argv)
for (i = 0; i < N; i++)
{
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
-
}
if (acc_async_test_all () != 0)
@@ -113,7 +67,12 @@ main (int argc, char **argv)
abort ();
}
- sleep ((int) (dtime / 1000.0f) + 1);
+ r = cuCtxSynchronize ();
+ if (r != CUDA_SUCCESS)
+ {
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+ abort ();
+ }
if (acc_async_test_all () != 1)
{
@@ -121,11 +80,6 @@ main (int argc, char **argv)
abort ();
}
- acc_unmap_data (a);
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
exit (0);
===================================================================
@@ -5,78 +5,54 @@
#include <stdlib.h>
#include <openacc.h>
#include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
CUstream stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float atime, dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t t1, t2;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+ fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
+ r = cuModuleGetFunction (&delay, module, "delay");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+ fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
abort ();
}
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
+ gettimeofday (&tv1, NULL);
- r = cuModuleLoad (&module, "subr.ptx");
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
- r = cuModuleGetFunction (&delay, module, "delay");
+ r = cuCtxSynchronize ();
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
+ gettimeofday (&tv2, NULL);
- dtime = 200.0;
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- dticks = (unsigned long) (dtime * clkrate);
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
stream = (CUstream) acc_get_cuda_stream (0);
if (stream != NULL)
abort ();
@@ -91,11 +67,9 @@ main (int argc, char **argv)
if (!acc_set_cuda_stream (0, stream))
abort ();
- init_timers (1);
+ gettimeofday (&tv1, NULL);
- start_timer (0);
-
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -104,33 +78,30 @@ main (int argc, char **argv)
acc_wait (0);
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (atime < dtime)
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long 1\n");
abort ();
}
- start_timer (0);
+ gettimeofday (&tv1, NULL);
acc_wait (0);
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (0.010 < atime)
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ if (t2 > 1000)
{
- fprintf (stderr, "actual time too long\n");
+ fprintf (stderr, "too long 2\n");
abort ();
}
- acc_unmap_data (a);
-
- fini_timers ();
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
exit (0);
===================================================================
@@ -6,79 +6,56 @@
#include <stdlib.h>
#include <openacc.h>
#include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
- int N;
+ const int N = 2;
int i;
CUstream stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float atime, dtime, hitime, lotime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t t1, t2;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+ fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
+ r = cuModuleGetFunction (&delay, module, "delay");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+ fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
abort ();
}
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
+ gettimeofday (&tv1, NULL);
- r = cuModuleLoad (&module, "subr.ptx");
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
- r = cuModuleGetFunction (&delay, module, "delay");
+ r = cuCtxSynchronize ();
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
+ gettimeofday (&tv2, NULL);
- dtime = 200.0;
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- dticks = (unsigned long) (dtime * clkrate);
-
- N = nprocs;
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
stream = (CUstream) acc_get_cuda_stream (0);
if (stream != NULL)
abort ();
@@ -93,16 +70,11 @@ main (int argc, char **argv)
if (!acc_set_cuda_stream (0, stream))
abort ();
- init_timers (1);
+ gettimeofday (&tv1, NULL);
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
- start_timer (0);
-
for (i = 0; i < N; i++)
{
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,27 +84,18 @@ main (int argc, char **argv)
acc_wait (0);
}
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- hitime = dtime * N;
- hitime += hitime * 0.02;
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- lotime = dtime * N;
- lotime -= lotime * 0.02;
+ t1 *= N;
- if (atime > hitime || atime < lotime)
+ if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long\n");
abort ();
}
- acc_unmap_data (a);
-
- fini_timers ();
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
exit (0);
===================================================================
@@ -6,79 +6,56 @@
#include <unistd.h>
#include <openacc.h>
#include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
- int N;
+ const int N = 2;
int i;
CUstream *streams;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float atime, dtime, hitime, lotime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t t1, t2;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+ fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
+ r = cuModuleGetFunction (&delay, module, "delay");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+ fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
abort ();
}
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
+ gettimeofday (&tv1, NULL);
- r = cuModuleLoad (&module, "subr.ptx");
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
- r = cuModuleGetFunction (&delay, module, "delay");
+ r = cuCtxSynchronize ();
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
+ gettimeofday (&tv2, NULL);
- dtime = 200.0;
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- dticks = (unsigned long) (dtime * clkrate);
-
- N = nprocs;
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
streams = (CUstream *) malloc (N * sizeof (void *));
for (i = 0; i < N; i++)
@@ -98,16 +75,11 @@ main (int argc, char **argv)
abort ();
}
- init_timers (1);
+ gettimeofday (&tv1, NULL);
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
- start_timer (0);
-
for (i = 0; i < N; i++)
{
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -117,27 +89,19 @@ main (int argc, char **argv)
acc_wait (i);
}
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- hitime = dtime * N;
- hitime += hitime * 0.02;
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- lotime = dtime * N;
- lotime -= lotime * 0.02;
+ t1 *= N;
- if (atime > hitime || atime < lotime)
+ if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long\n");
abort ();
}
- acc_unmap_data (a);
-
- fini_timers ();
-
free (streams);
- free (a);
- acc_free (d_a);
acc_shutdown (acc_device_nvidia);
===================================================================
@@ -6,78 +6,54 @@
#include <unistd.h>
#include <openacc.h>
#include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
CUstream stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float atime, dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t t1, t2;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+ fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
+ r = cuModuleGetFunction (&delay, module, "delay");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+ fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
abort ();
}
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
+ gettimeofday (&tv1, NULL);
- r = cuModuleLoad (&module, "subr.ptx");
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
- r = cuModuleGetFunction (&delay, module, "delay");
+ r = cuCtxSynchronize();
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
+ gettimeofday (&tv2, NULL);
- dtime = 200.0;
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- dticks = (unsigned long) (dtime * clkrate);
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
if (r != CUDA_SUCCESS)
{
@@ -87,11 +63,9 @@ main (int argc, char **argv)
acc_set_cuda_stream (0, stream);
- init_timers (1);
+ gettimeofday (&tv1, NULL);
- start_timer (0);
-
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -101,33 +75,30 @@ main (int argc, char **argv)
fprintf (stderr, "CheCKpOInT\n");
acc_wait (1);
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (atime < dtime)
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ if (t2 > t1)
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long 1\n");
abort ();
}
- start_timer (0);
+ gettimeofday (&tv1, NULL);
acc_wait (1);
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (0.010 < atime)
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ if (t2 > 1000)
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long 2\n");
abort ();
}
- acc_unmap_data (a);
-
- fini_timers ();
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
return 0;
===================================================================
@@ -6,78 +6,54 @@
#include <unistd.h>
#include <openacc.h>
#include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
CUstream stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float atime, dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t t1, t2;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+ fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
+ r = cuModuleGetFunction (&delay, module, "delay");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+ fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
abort ();
}
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
+ gettimeofday (&tv1, NULL);
- r = cuModuleLoad (&module, "subr.ptx");
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
- r = cuModuleGetFunction (&delay, module, "delay");
+ r = cuCtxSynchronize ();
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
+ gettimeofday (&tv2, NULL);
- dtime = 200.0;
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- dticks = (unsigned long) (dtime * clkrate);
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
stream = (CUstream) acc_get_cuda_stream (0);
if (stream != NULL)
abort ();
@@ -92,11 +68,9 @@ main (int argc, char **argv)
if (!acc_set_cuda_stream (0, stream))
abort ();
- init_timers (1);
+ gettimeofday (&tv1, NULL);
- start_timer (0);
-
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -105,33 +79,30 @@ main (int argc, char **argv)
acc_wait_all ();
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (atime < dtime)
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ if (t2 > (t1 + (t1 * 0.10)))
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long 1\n");
abort ();
}
- start_timer (0);
+ gettimeofday (&tv1, NULL);
acc_wait_all ();
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (0.010 < atime)
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ if (t2 > 1000)
{
- fprintf (stderr, "actual time too long\n");
+ fprintf (stderr, "too long 2\n");
abort ();
}
- acc_unmap_data (a);
-
- fini_timers ();
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
exit (0);
===================================================================
@@ -6,81 +6,56 @@
#include <unistd.h>
#include <openacc.h>
#include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
- int N;
+ const int N = 2;
int i;
CUstream stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float atime, dtime, hitime, lotime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t t1, t2;
- devnum = 2;
-
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+ fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
+ r = cuModuleGetFunction (&delay, module, "delay");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+ fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
abort ();
}
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
+ gettimeofday (&tv1, NULL);
- r = cuModuleLoad (&module, "subr.ptx");
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
- r = cuModuleGetFunction (&delay, module, "delay");
+ r = cuCtxSynchronize ();
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
+ gettimeofday (&tv2, NULL);
- dtime = 200.0;
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- dticks = (unsigned long) (dtime * clkrate);
-
- N = nprocs;
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
if (r != CUDA_SUCCESS)
{
@@ -105,16 +80,11 @@ main (int argc, char **argv)
if (!acc_set_cuda_stream (0, stream))
abort ();
- init_timers (1);
+ gettimeofday (&tv1, NULL);
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
- start_timer (0);
-
for (i = 0; i < N; i++)
{
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -132,7 +102,7 @@ main (int argc, char **argv)
acc_wait (1);
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
if (acc_async_test (0) != 1)
abort ();
@@ -140,25 +110,16 @@ main (int argc, char **argv)
if (acc_async_test (1) != 1)
abort ();
- hitime = dtime * N;
- hitime += hitime * 0.02;
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- lotime = dtime * N;
- lotime -= lotime * 0.02;
+ t1 *= N;
- if (atime > hitime || atime < lotime)
+ if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long\n");
abort ();
}
- acc_unmap_data (a);
-
- fini_timers ();
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
exit (0);
===================================================================
@@ -6,98 +6,70 @@
#include <unistd.h>
#include <openacc.h>
#include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
CUstream stream;
- int N;
+ const int N = 2;
int i;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float atime, dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t t1, t2;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+ fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
+ r = cuModuleGetFunction (&delay, module, "delay");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+ fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
abort ();
}
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
+ gettimeofday (&tv1, NULL);
- r = cuModuleLoad (&module, "subr.ptx");
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
- r = cuModuleGetFunction (&delay, module, "delay");
+ r = cuCtxSynchronize();
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
+ gettimeofday (&tv2, NULL);
- dtime = 200.0;
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- dticks = (unsigned long) (dtime * clkrate);
-
- N = nprocs;
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuStreamCreate failed: %d\n", r);
- abort ();
- }
+ {
+ fprintf (stderr, "cuStreamCreate failed: %d\n", r);
+ abort ();
+ }
acc_set_cuda_stream (1, stream);
- init_timers (1);
+ gettimeofday (&tv1, NULL);
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
- start_timer (0);
-
for (i = 0; i < N; i++)
{
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -110,21 +82,18 @@ main (int argc, char **argv)
acc_wait (1);
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (atime < dtime)
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ t1 *= N;
+
+ if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long\n");
abort ();
}
- acc_unmap_data (a);
-
- fini_timers ();
-
- free (a);
- acc_free (d_a);
-
acc_shutdown (acc_device_nvidia);
return 0;
===================================================================
@@ -6,79 +6,56 @@
#include <unistd.h>
#include <openacc.h>
#include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay;
CUmodule module;
CUresult r;
- int N;
+ const int N = 2;
int i;
CUstream *streams, stream;
- unsigned long *a, *d_a, dticks;
- int nbytes;
- float atime, dtime;
- void *kargs[2];
- int clkrate;
- int devnum, nprocs;
+ struct timeval tv1, tv2;
+ time_t t1, t2;
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
+ r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+ fprintf (stderr, "cuModuleLoad failed: %d\n", r);
abort ();
}
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
+ r = cuModuleGetFunction (&delay, module, "delay");
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+ fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
abort ();
}
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
+ gettimeofday (&tv1, NULL);
- r = cuModuleLoad (&module, "subr.ptx");
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleLoad failed: %d\n", r);
- abort ();
+ fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+ abort ();
}
- r = cuModuleGetFunction (&delay, module, "delay");
+ r = cuCtxSynchronize ();
if (r != CUDA_SUCCESS)
{
- fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
- abort ();
+ fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+ abort ();
}
- nbytes = nprocs * sizeof (unsigned long);
+ gettimeofday (&tv2, NULL);
- dtime = 500.0;
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- dticks = (unsigned long) (dtime * clkrate);
-
- N = nprocs;
-
- a = (unsigned long *) malloc (nbytes);
- d_a = (unsigned long *) acc_malloc (nbytes);
-
- acc_map_data (a, d_a, nbytes);
-
streams = (CUstream *) malloc (N * sizeof (void *));
for (i = 0; i < N; i++)
@@ -98,11 +75,6 @@ main (int argc, char **argv)
abort ();
}
- init_timers (1);
-
- kargs[0] = (void *) &d_a;
- kargs[1] = (void *) &dticks;
-
stream = (CUstream) acc_get_cuda_stream (N);
if (stream != NULL)
abort ();
@@ -117,11 +89,11 @@ main (int argc, char **argv)
if (!acc_set_cuda_stream (N, stream))
abort ();
- start_timer (0);
+ gettimeofday (&tv1, NULL);
for (i = 0; i < N; i++)
{
- r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+ r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
if (r != CUDA_SUCCESS)
{
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -129,6 +101,10 @@ main (int argc, char **argv)
}
}
+ gettimeofday (&tv2, NULL);
+
+ t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
acc_wait_all_async (N);
for (i = 0; i <= N; i++)
@@ -145,15 +121,13 @@ main (int argc, char **argv)
abort ();
}
- atime = stop_timer (0);
-
- if (atime < dtime)
+ if ((t1 * N) < t2)
{
- fprintf (stderr, "actual time < delay time\n");
+ fprintf (stderr, "too long 1\n");
abort ();
}
- start_timer (0);
+ gettimeofday (&tv1, NULL);
stream = (CUstream) acc_get_cuda_stream (N + 1);
if (stream != NULL)
@@ -173,35 +147,33 @@ main (int argc, char **argv)
acc_wait (N + 1);
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (0.10 < atime)
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ if (t1 > 1000)
{
- fprintf (stderr, "actual time too long\n");
+ fprintf (stderr, "too long 2\n");
abort ();
}
- start_timer (0);
+ gettimeofday (&tv1, NULL);
acc_wait_all_async (N);
acc_wait (N);
- atime = stop_timer (0);
+ gettimeofday (&tv2, NULL);
- if (0.10 < atime)
+ t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+ if (t1 > 1000)
{
- fprintf (stderr, "actual time too long\n");
+ fprintf (stderr, "too long 3\n");
abort ();
}
- acc_unmap_data (a);
-
- fini_timers ();
-
free (streams);
- free (a);
- acc_free (d_a);
acc_shutdown (acc_device_nvidia);
===================================================================
@@ -10,46 +10,18 @@
int
main (int argc, char **argv)
{
- CUdevice dev;
CUfunction delay2;
CUmodule module;
CUresult r;
- int N;
+ const int N = 32;
int i;
CUstream *streams;
- unsigned long **a, **d_a, *tid, ticks;
+ unsigned long **a, **d_a, *tid;
int nbytes;
- void *kargs[3];
- int clkrate;
- int devnum, nprocs;
+ void *kargs[2];
acc_init (acc_device_nvidia);
- devnum = acc_get_device_num (acc_device_nvidia);
-
- r = cuDeviceGet (&dev, devnum);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGet failed: %d\n", r);
- abort ();
- }
-
- r =
- cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
- r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
- abort ();
- }
-
r = cuModuleLoad (&module, "subr.ptx");
if (r != CUDA_SUCCESS)
{
@@ -66,10 +38,6 @@ main (int argc, char **argv)
nbytes = sizeof (int);
- ticks = (unsigned long) (200.0 * clkrate);
-
- N = nprocs;
-
streams = (CUstream *) malloc (N * sizeof (void *));
a = (unsigned long **) malloc (N * sizeof (unsigned long *));
@@ -103,8 +71,7 @@ main (int argc, char **argv)
for (i = 0; i < N; i++)
{
kargs[0] = (void *) &d_a[i];
- kargs[1] = (void *) &ticks;
- kargs[2] = (void *) &tid[i];
+ kargs[1] = (void *) &tid[i];
r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
if (r != CUDA_SUCCESS)
@@ -112,8 +79,6 @@ main (int argc, char **argv)
fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
abort ();
}
-
- ticks = (unsigned long) (50.0 * clkrate);
}
acc_wait_all_async (0);
===================================================================
@@ -0,0 +1,19 @@
+/* { dg-do run { target { ! openacc_nvidia_accel_configured } } } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+int
+main (void)
+{
+ fprintf (stderr, "CheCKpOInT\n");
+ acc_init (acc_device_nvidia);
+
+ acc_shutdown (acc_device_nvidia);
+
+ return 0;
+}
+
+/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
+/* { dg-output "device type nvidia not supported" } */
+/* { dg-shouldfail "" } */
===================================================================
@@ -1,46 +1,24 @@
-#if ACC_DEVICE_TYPE_nvidia
-
#pragma acc routine nohost
-static int clock (void)
-{
- int thetime;
-
- asm __volatile__ ("mov.u32 %0, %%clock;" : "=r"(thetime));
-
- return thetime;
-}
-
-#endif
-
void
-delay (unsigned long *d_o, unsigned long delay)
+delay ()
{
- int start, ticks;
+ int i, sum;
+ const int N = 500000;
- start = clock ();
-
- ticks = 0;
-
- while (ticks < delay)
- ticks = clock () - start;
-
- return;
+ for (i = 0; i < N; i++)
+ sum = sum + 1;
}
+#pragma acc routine nohost
void
-delay2 (unsigned long *d_o, unsigned long delay, unsigned long tid)
+delay2 (unsigned long *d_o, unsigned long tid)
{
- int start, ticks;
+ int i, sum;
+ const int N = 500000;
- start = clock ();
+ for (i = 0; i < N; i++)
+ sum = sum + 1;
- ticks = 0;
-
- while (ticks < delay)
- ticks = clock () - start;
-
d_o[0] = tid;
-
- return;
}
===================================================================
@@ -1,148 +1,90 @@
-// BEGIN PREAMBLE
- .version 3.1
- .target sm_30
+ .version 3.1
+ .target sm_30
.address_size 64
-// END PREAMBLE
-// BEGIN FUNCTION DEF: clock
-.func (.param.u32 %out_retval)clock
-{
-.reg.u32 %retval;
- .reg.u64 %hr10;
- .reg.u32 %r22;
- .reg.u32 %r23;
- .reg.u32 %r24;
- .local.align 8 .b8 %frame[8];
- // #APP
-// 7 "subr.c" 1
- mov.u32 %r24, %clock;
-// 0 "" 2
- // #NO_APP
- st.local.u32 [%frame], %r24;
- ld.local.u32 %r22, [%frame];
- mov.u32 %r23, %r22;
- mov.u32 %retval, %r23;
- st.param.u32 [%out_retval], %retval;
- ret;
- }
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay
-.visible .entry delay(.param.u64 %in_ar1, .param.u64 %in_ar2)
-{
- .reg.u64 %ar1;
- .reg.u64 %ar2;
- .reg.u64 %hr10;
- .reg.u64 %r22;
- .reg.u32 %r23;
- .reg.u64 %r24;
- .reg.u64 %r25;
- .reg.u32 %r26;
- .reg.u32 %r27;
- .reg.u32 %r28;
- .reg.u32 %r29;
- .reg.u32 %r30;
- .reg.u64 %r31;
- .reg.pred %r32;
- .local.align 8 .b8 %frame[24];
- ld.param.u64 %ar1, [%in_ar1];
- ld.param.u64 %ar2, [%in_ar2];
- mov.u64 %r24, %ar1;
- st.u64 [%frame+8], %r24;
- mov.u64 %r25, %ar2;
- st.local.u64 [%frame+16], %r25;
+ .visible .entry delay
{
- .param.u32 %retval_in;
- {
- call (%retval_in), clock;
- }
- ld.param.u32 %r26, [%retval_in];
-}
- st.local.u32 [%frame+4], %r26;
- mov.u32 %r27, 0;
- st.local.u32 [%frame], %r27;
- bra $L4;
-$L5:
- {
- .param.u32 %retval_in;
- {
- call (%retval_in), clock;
- }
- ld.param.u32 %r28, [%retval_in];
-}
- mov.u32 %r23, %r28;
- ld.local.u32 %r30, [%frame+4];
- sub.u32 %r29, %r23, %r30;
- st.local.u32 [%frame], %r29;
-$L4:
- ld.local.s32 %r22, [%frame];
- ld.local.u64 %r31, [%frame+16];
- setp.lo.u64 %r32,%r22,%r31;
- @%r32 bra $L5;
+ .reg .u64 %hr10;
+ .reg .u32 %r22;
+ .reg .u32 %r23;
+ .reg .u32 %r24;
+ .reg .u32 %r25;
+ .reg .u32 %r26;
+ .reg .u32 %r27;
+ .reg .u32 %r28;
+ .reg .u32 %r29;
+ .reg .pred %r30;
+ .reg .u64 %frame;
+ .local .align 8 .b8 %farray[16];
+ cvta.local.u64 %frame,%farray;
+ mov.u32 %r22,500000;
+ st.u32 [%frame+8],%r22;
+ mov.u32 %r23,0;
+ st.u32 [%frame],%r23;
+ bra $L2;
+ $L3:
+ ld.u32 %r25,[%frame+4];
+ add.u32 %r24,%r25,1;
+ st.u32 [%frame+4],%r24;
+ ld.u32 %r27,[%frame];
+ add.u32 %r26,%r27,1;
+ st.u32 [%frame],%r26;
+ $L2:
+ ld.u32 %r28,[%frame];
+ ld.u32 %r29,[%frame+8];
+ setp.lt.s32 %r30,%r28,%r29;
+ @%r30
+ bra $L3;
ret;
}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay2
-.visible .entry delay2(.param.u64 %in_ar1, .param.u64 %in_ar2, .param.u64 %in_ar3)
-{
- .reg.u64 %ar1;
- .reg.u64 %ar2;
- .reg.u64 %ar3;
- .reg.u64 %hr10;
- .reg.u64 %r22;
- .reg.u32 %r23;
- .reg.u64 %r24;
- .reg.u64 %r25;
- .reg.u64 %r26;
- .reg.u32 %r27;
- .reg.u32 %r28;
- .reg.u32 %r29;
- .reg.u32 %r30;
- .reg.u32 %r31;
- .reg.u64 %r32;
- .reg.pred %r33;
- .reg.u64 %r34;
- .reg.u64 %r35;
- .local.align 8 .b8 %frame[32];
- ld.param.u64 %ar1, [%in_ar1];
- ld.param.u64 %ar2, [%in_ar2];
- ld.param.u64 %ar3, [%in_ar3];
- mov.u64 %r24, %ar1;
- st.local.u64 [%frame+8], %r24;
- mov.u64 %r25, %ar2;
- st.local.u64 [%frame+16], %r25;
- mov.u64 %r26, %ar3;
- st.local.u64 [%frame+24], %r26;
+
+ .visible .entry delay2 (.param .u64 %in_ar1, .param .u64 %in_ar2)
{
- .param.u32 %retval_in;
- {
- call (%retval_in), clock;
- }
- ld.param.u32 %r27, [%retval_in];
-}
- st.local.u32 [%frame+4], %r27;
- mov.u32 %r28, 0;
- st.local.u32 [%frame], %r28;
- bra $L8;
-$L9:
- {
- .param.u32 %retval_in;
- {
- call (%retval_in), clock;
- }
- ld.param.u32 %r29, [%retval_in];
-}
- mov.u32 %r23, %r29;
- ld.local.u32 %r31, [%frame+4];
- sub.u32 %r30, %r23, %r31;
- st.local.u32 [%frame], %r30;
-$L8:
- ld.local.s32 %r22, [%frame];
- ld.local.u64 %r32, [%frame+16];
- setp.lo.u64 %r33,%r22,%r32;
- @%r33 bra $L9;
- ld.local.u64 %r34, [%frame+8];
- ld.local.u64 %r35, [%frame+24];
- st.u64 [%r34], %r35;
+ .reg .u64 %ar1;
+ .reg .u64 %ar2;
+ .reg .u64 %hr10;
+ .reg .u64 %r22;
+ .reg .u64 %r23;
+ .reg .u32 %r24;
+ .reg .u32 %r25;
+ .reg .u32 %r26;
+ .reg .u32 %r27;
+ .reg .u32 %r28;
+ .reg .u32 %r29;
+ .reg .u32 %r30;
+ .reg .u32 %r31;
+ .reg .pred %r32;
+ .reg .u64 %r33;
+ .reg .u64 %r34;
+ .reg .u64 %frame;
+ .local .align 8 .b8 %farray[32];
+ cvta.local.u64 %frame,%farray;
+ ld.param.u64 %ar1,[%in_ar1];
+ ld.param.u64 %ar2,[%in_ar2];
+ mov.u64 %r22,%ar1;
+ st.u64 [%frame+16],%r22;
+ mov.u64 %r23,%ar2;
+ st.u64 [%frame+24],%r23;
+ mov.u32 %r24,500000;
+ st.u32 [%frame+8],%r24;
+ mov.u32 %r25,0;
+ st.u32 [%frame],%r25;
+ bra $L5;
+ $L6:
+ ld.u32 %r27,[%frame+4];
+ add.u32 %r26,%r27,1;
+ st.u32 [%frame+4],%r26;
+ ld.u32 %r29,[%frame];
+ add.u32 %r28,%r29,1;
+ st.u32 [%frame],%r28;
+ $L5:
+ ld.u32 %r30,[%frame];
+ ld.u32 %r31,[%frame+8];
+ setp.lt.s32 %r32,%r30,%r31;
+ @%r32
+ bra $L6;
+ ld.u64 %r33,[%frame+16];
+ ld.u64 %r34,[%frame+24];
+ st.u64 [%r33],%r34;
ret;
}
-// END FUNCTION DEF
===================================================================
@@ -1,103 +0,0 @@
-
-#include <stdio.h>
-#include <cuda.h>
-
-static int _Tnum_timers;
-static CUevent *_Tstart_events, *_Tstop_events;
-static CUstream _Tstream;
-
-void
-init_timers (int ntimers)
-{
- int i;
- CUresult r;
-
- _Tnum_timers = ntimers;
-
- _Tstart_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
- _Tstop_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-
- r = cuStreamCreate (&_Tstream, CU_STREAM_DEFAULT);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuStreamCreate failed: %d\n", r);
- abort ();
- }
-
- for (i = 0; i < _Tnum_timers; i++)
- {
- r = cuEventCreate (&_Tstart_events[i], CU_EVENT_DEFAULT);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuEventCreate failed: %d\n", r);
- abort ();
- }
-
- r = cuEventCreate (&_Tstop_events[i], CU_EVENT_DEFAULT);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuEventCreate failed: %d\n", r);
- abort ();
- }
- }
-}
-
-void
-fini_timers (void)
-{
- int i;
-
- for (i = 0; i < _Tnum_timers; i++)
- {
- cuEventDestroy (_Tstart_events[i]);
- cuEventDestroy (_Tstop_events[i]);
- }
-
- cuStreamDestroy (_Tstream);
-
- free (_Tstart_events);
- free (_Tstop_events);
-}
-
-void
-start_timer (int timer)
-{
- CUresult r;
-
- r = cuEventRecord (_Tstart_events[timer], _Tstream);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuEventRecord failed: %d\n", r);
- abort ();
- }
-}
-
-float
-stop_timer (int timer)
-{
- CUresult r;
- float etime;
-
- r = cuEventRecord (_Tstop_events[timer], _Tstream);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuEventRecord failed: %d\n", r);
- abort ();
- }
-
- r = cuEventSynchronize (_Tstop_events[timer]);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuEventSynchronize failed: %d\n", r);
- abort ();
- }
-
- r = cuEventElapsedTime (&etime, _Tstart_events[timer], _Tstop_events[timer]);
- if (r != CUDA_SUCCESS)
- {
- fprintf (stderr, "cuEventElapsedTime failed: %d\n", r);
- abort ();
- }
-
- return etime;
-}
===================================================================
@@ -17,7 +17,7 @@ program asyncwait
!$acc enter data copyin (a(1:N)) copyin (b(1:N)) copyin (N) async
- !$acc parallel async wait
+ !$acc parallel async wait present (a(1:N)) present (b(1:N)) present (N)
do i = 1, N
b(i) = a(i)
end do
@@ -36,7 +36,7 @@ program asyncwait
!$acc enter data copyin (a(1:N)) copyin (b(1:N)) async (1)
- !$acc parallel async (1) wait (1)
+ !$acc parallel async (1) wait (1) present (a(1:N), b(1:N), N)
do i = 1, N
b(i) = a(i)
end do
@@ -55,21 +55,22 @@ program asyncwait
c(:) = 0.0
d(:) = 0.0
- !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N))
+ !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+ !$acc& create (d(1:N))
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), N)
do i = 1, N
b(i) = (a(i) * a(i) * a(i)) / a(i)
end do
!$acc end parallel
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), N)
do i = 1, N
c(i) = (a(i) * 4) / a(i)
end do
!$acc end parallel
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N), N)
do i = 1, N
d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
end do
@@ -76,7 +77,8 @@ program asyncwait
!$acc end parallel
!$acc wait (1)
- !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N))
+ !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+ !$acc& copyout (d(1:N))
do i = 1, N
if (a(i) .ne. 3.0) STOP 5
@@ -91,27 +93,32 @@ program asyncwait
d(:) = 0.0
e(:) = 0.0
- !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N)) copyin (e(1:N))
+ !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+ !$acc& create (d(1:N)) copyin (e(1:N))
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+ !$acc& present (e(1:N), N)
do i = 1, N
b(i) = (a(i) * a(i) * a(i)) / a(i)
end do
!$acc end parallel
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+ !$acc& present (e(1:N), N)
do i = 1, N
c(i) = (a(i) * 4) / a(i)
end do
!$acc end parallel
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+ !$acc& present (e(1:N), N)
do i = 1, N
d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
end do
!$acc end parallel
- !$acc parallel wait (1) async (1)
+ !$acc parallel wait (1) async (1) present (a(1:N), b(1:N), c(1:N)) &
+ !$acc& present (d(1:N), e(1:N), N)
do i = 1, N
e(i) = a(i) + b(i) + c(i) + d(i)
end do
@@ -118,7 +125,8 @@ program asyncwait
!$acc end parallel
!$acc wait (1)
- !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N)) copyout (e(1:N))
+ !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+ !$acc& copyout (d(1:N)) copyout (e(1:N))
!$acc exit data delete (N)
do i = 1, N
===================================================================
@@ -17,7 +17,7 @@ program asyncwait
!$acc enter data copyin (a(1:N)) copyin (b(1:N)) copyin (N) async
- !$acc parallel async wait
+ !$acc parallel async wait present (a(1:N), b(1:N), N)
!$acc loop
do i = 1, N
b(i) = a(i)
@@ -37,7 +37,7 @@ program asyncwait
!$acc update device (a(1:N), b(1:N)) async (1)
- !$acc parallel async (1) wait (1)
+ !$acc parallel async (1) wait (1) present (a(1:N), b(1:N), N)
!$acc loop
do i = 1, N
b(i) = a(i)
@@ -60,19 +60,19 @@ program asyncwait
!$acc enter data copyin (c(1:N), d(1:N)) async (1)
!$acc update device (a(1:N), b(1:N)) async (1)
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), b(1:N), N)
do i = 1, N
b(i) = (a(i) * a(i) * a(i)) / a(i)
end do
!$acc end parallel
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), c(1:N), N)
do i = 1, N
c(i) = (a(i) * 4) / a(i)
end do
!$acc end parallel
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), d(1:N), N)
do i = 1, N
d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
end do
@@ -98,25 +98,26 @@ program asyncwait
!$acc enter data copyin (e(1:N)) async (1)
!$acc update device (a(1:N), b(1:N), c(1:N), d(1:N)) async (1)
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), b(1:N), N)
do i = 1, N
b(i) = (a(i) * a(i) * a(i)) / a(i)
end do
!$acc end parallel
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), c(1:N), N)
do i = 1, N
c(i) = (a(i) * 4) / a(i)
end do
!$acc end parallel
- !$acc parallel async (1)
+ !$acc parallel async (1) present (a(1:N), d(1:N), N)
do i = 1, N
d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
end do
!$acc end parallel
- !$acc parallel wait (1) async (1)
+ !$acc parallel wait (1) async (1) present (a(1:N), b(1:N), c(1:N)) &
+ !$acc& present (d(1:N), e(1:N), N)
do i = 1, N
e(i) = a(i) + b(i) + c(i) + d(i)
end do