diff mbox

Make the OpenACC C++ acc_on_device wrapper "always inline" (was: [openacc] on_device fix)

Message ID 87h90ba8u8.fsf@euler.schwinge.homeip.net
State New
Headers show

Commit Message

Thomas Schwinge May 23, 2017, 3:31 p.m. UTC
Hi!

On Thu, 29 Oct 2015 17:22:46 -0700, Nathan Sidwell <nathan@acm.org> wrote:
> acc_on_device and it's builtin had a conflict.  The function formally takes an 
> enum argument, but the builtin takes an int -- primarily to avoid the compiler 
> having to generate the enum  type internally.
> 
> This works fine for C,  where the external declaration of the function (in 
> openacc.h) matches up with the builtin, and we optimize the builtin as expected.
> 
> It fails for C++ where the builtin doesn't match the declaration in the header. 
>   We end up with emitting a call to acc_on_device,  which is resolved by 
> libgomp.  Unfortunately that means we fail to optimize.  [...]

> [Nathan's trunk r229562] leaves things unchanged for C --  declare a function with an enum arg. 
>   But for C++ we the extern "C" declaration takes an int -- and therefore 
> matches the builtin.  We insert an inline wrapper that takes an enum argument. 
> Because of C++'s overload resolution both the wrapper and the int-taking 
> declaration can have the same source name.

> --- libgomp/openacc.h	(revision 229535)
> +++ libgomp/openacc.h	(working copy)

> -int acc_on_device (acc_device_t) __GOACC_NOTHROW;
> +#ifdef __cplusplus
> +int acc_on_device (int __arg) __GOACC_NOTHROW;
> +#else
> +int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW;
> +#endif

>  #ifdef __cplusplus
>  }
> +
> +/* Forwarding function with correctly typed arg.  */
> +
> +inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> +{
> +  return acc_on_device ((int) __arg);
> +}
>  #endif

> --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c	(revision 0)
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c	(working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include <openacc.h>
> +
> +int Foo (acc_device_t x)
> +{
> +  return acc_on_device (x);
> +}
> +
> +/* { dg-final { scan-assembler-not "acc_on_device" } } */

As a user, I'd expect that when compiling such code with "-O0" instead of
"-O2", but adding "__attribute__ ((optimize ("O2")))" to "Foo", that I'd
then get "acc_on_device" expanded as a builtin, and no calls to the
"acc_on_device library function.  In C++ that is currently not working,
because the "Forwarding function with correctly typed arg" (cited above)
doesn't "inherit" that "optimize" attribute.  Making that one "always
inline" resolves the problem.  Also I cleaned up and extended testing
some more.  OK for trunk?

commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
Author: Thomas Schwinge <thomas@codesourcery.com>
Date:   Tue May 23 13:21:14 2017 +0200

    Make the OpenACC C++ acc_on_device wrapper "always inline"
    
            libgomp/
            * openacc.h [__cplusplus] (acc_on_device): Mark as "always
            inline".
            * testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
            file; test cases already present...
            * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
            this file.  Update.
            * testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
            file; test cases now present...
            * testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
            this new file.
            * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.
---
 libgomp/openacc.h                                  |  3 +-
 .../libgomp.oacc-c-c++-common/acc-on-device-2.c    | 22 -------------
 .../libgomp.oacc-c-c++-common/acc-on-device.c      | 12 -------
 .../libgomp.oacc-c-c++-common/acc_on_device-1.c    | 38 +++++++++++++---------
 .../libgomp.oacc-c-c++-common/acc_on_device-2.c    | 21 ++++++++++++
 .../libgomp.oacc-c-c++-common/parallel-dims.c      | 14 ++++----
 6 files changed, 52 insertions(+), 58 deletions(-)



Grüße
 Thomas

Comments

Thomas Schwinge May 30, 2017, 12:35 p.m. UTC | #1
Hi!

Ping.

On Tue, 23 May 2017 17:31:11 +0200, I wrote:
> On Thu, 29 Oct 2015 17:22:46 -0700, Nathan Sidwell <nathan@acm.org> wrote:
> > acc_on_device and it's builtin had a conflict.  The function formally takes an 
> > enum argument, but the builtin takes an int -- primarily to avoid the compiler 
> > having to generate the enum  type internally.
> > 
> > This works fine for C,  where the external declaration of the function (in 
> > openacc.h) matches up with the builtin, and we optimize the builtin as expected.
> > 
> > It fails for C++ where the builtin doesn't match the declaration in the header. 
> >   We end up with emitting a call to acc_on_device,  which is resolved by 
> > libgomp.  Unfortunately that means we fail to optimize.  [...]
> 
> > [Nathan's trunk r229562] leaves things unchanged for C --  declare a function with an enum arg. 
> >   But for C++ we the extern "C" declaration takes an int -- and therefore 
> > matches the builtin.  We insert an inline wrapper that takes an enum argument. 
> > Because of C++'s overload resolution both the wrapper and the int-taking 
> > declaration can have the same source name.
> 
> > --- libgomp/openacc.h	(revision 229535)
> > +++ libgomp/openacc.h	(working copy)
> 
> > -int acc_on_device (acc_device_t) __GOACC_NOTHROW;
> > +#ifdef __cplusplus
> > +int acc_on_device (int __arg) __GOACC_NOTHROW;
> > +#else
> > +int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW;
> > +#endif
> 
> >  #ifdef __cplusplus
> >  }
> > +
> > +/* Forwarding function with correctly typed arg.  */
> > +
> > +inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> > +{
> > +  return acc_on_device ((int) __arg);
> > +}
> >  #endif
> 
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c	(revision 0)
> > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c	(working copy)
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O2" } */
> > +
> > +#include <openacc.h>
> > +
> > +int Foo (acc_device_t x)
> > +{
> > +  return acc_on_device (x);
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "acc_on_device" } } */
> 
> As a user, I'd expect that when compiling such code with "-O0" instead of
> "-O2", but adding "__attribute__ ((optimize ("O2")))" to "Foo", that I'd
> then get "acc_on_device" expanded as a builtin, and no calls to the
> "acc_on_device library function.  In C++ that is currently not working,
> because the "Forwarding function with correctly typed arg" (cited above)
> doesn't "inherit" that "optimize" attribute.  Making that one "always
> inline" resolves the problem.  Also I cleaned up and extended testing
> some more.  OK for trunk?
> 
> commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
> Author: Thomas Schwinge <thomas@codesourcery.com>
> Date:   Tue May 23 13:21:14 2017 +0200
> 
>     Make the OpenACC C++ acc_on_device wrapper "always inline"
>     
>             libgomp/
>             * openacc.h [__cplusplus] (acc_on_device): Mark as "always
>             inline".
>             * testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
>             file; test cases already present...
>             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
>             this file.  Update.
>             * testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
>             file; test cases now present...
>             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
>             this new file.
>             * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.
> ---
>  libgomp/openacc.h                                  |  3 +-
>  .../libgomp.oacc-c-c++-common/acc-on-device-2.c    | 22 -------------
>  .../libgomp.oacc-c-c++-common/acc-on-device.c      | 12 -------
>  .../libgomp.oacc-c-c++-common/acc_on_device-1.c    | 38 +++++++++++++---------
>  .../libgomp.oacc-c-c++-common/acc_on_device-2.c    | 21 ++++++++++++
>  .../libgomp.oacc-c-c++-common/parallel-dims.c      | 14 ++++----
>  6 files changed, 52 insertions(+), 58 deletions(-)
> 
> diff --git libgomp/openacc.h libgomp/openacc.h
> index 137e2c1..266f559 100644
> --- libgomp/openacc.h
> +++ libgomp/openacc.h
> @@ -121,7 +121,8 @@ int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW;
>  /* Forwarding function with correctly typed arg.  */
>  
>  #pragma acc routine seq
> -inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> +inline __attribute__ ((__always_inline__)) int
> +acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
>  {
>    return acc_on_device ((int) __arg);
>  }
> diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
> deleted file mode 100644
> index bfcb67d..0000000
> --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
> +++ /dev/null
> @@ -1,22 +0,0 @@
> -/* Test the acc_on_device library function. */
> -/* { dg-additional-options "-fno-builtin-acc_on_device" } */
> -
> -#include <openacc.h>
> -
> -int main ()
> -{
> -  int dev;
> -  
> -#pragma acc parallel copyout (dev)
> -  {
> -    dev = acc_on_device (acc_device_not_host);
> -  }
> -
> -  int expect = 1;
> -  
> -#if  ACC_DEVICE_TYPE_host
> -  expect = 0;
> -#endif
> -  
> -  return dev != expect;
> -}
> diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
> deleted file mode 100644
> index e0d8710..0000000
> --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
> +++ /dev/null
> @@ -1,12 +0,0 @@
> -/* { dg-do compile } */
> -/* We don't expect this to work with optimizations disabled.
> -   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
> -
> -#include <openacc.h>
> -
> -int Foo (acc_device_t x)
> -{
> -  return acc_on_device (x);
> -}
> -
> -/* { dg-final { scan-assembler-not "acc_on_device" } } */
> diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> index 8112745..eb962e4 100644
> --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> @@ -1,6 +1,9 @@
>  /* Disable the acc_on_device builtin; we want to test the libgomp library
>     function.  */
> +/* { dg-additional-options "-DACC_ON_DEVICE=acc_on_device" } */
>  /* { dg-additional-options "-fno-builtin-acc_on_device" } */
> +/* { dg-additional-options "-fdump-rtl-expand" }
> +   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 12 "expand" } } */
>  
>  #include <stdlib.h>
>  #include <openacc.h>
> @@ -11,13 +14,13 @@ main (int argc, char *argv[])
>    /* Host.  */
>  
>    {
> -    if (!acc_on_device (acc_device_none))
> +    if (!ACC_ON_DEVICE (acc_device_none))
>        abort ();
> -    if (!acc_on_device (acc_device_host))
> +    if (!ACC_ON_DEVICE (acc_device_host))
>        abort ();
> -    if (acc_on_device (acc_device_not_host))
> +    if (ACC_ON_DEVICE (acc_device_not_host))
>        abort ();
> -    if (acc_on_device (acc_device_nvidia))
> +    if (ACC_ON_DEVICE (acc_device_nvidia))
>        abort ();
>    }
>  
> @@ -26,39 +29,44 @@ main (int argc, char *argv[])
>  
>  #pragma acc parallel if(0)
>    {
> -    if (!acc_on_device (acc_device_none))
> +    if (!ACC_ON_DEVICE (acc_device_none))
>        abort ();
> -    if (!acc_on_device (acc_device_host))
> +    if (!ACC_ON_DEVICE (acc_device_host))
>        abort ();
> -    if (acc_on_device (acc_device_not_host))
> +    if (ACC_ON_DEVICE (acc_device_not_host))
>        abort ();
> -    if (acc_on_device (acc_device_nvidia))
> +    if (ACC_ON_DEVICE (acc_device_nvidia))
>        abort ();
>    }
>  
>  
> -#if !ACC_DEVICE_TYPE_host
> +  int on_host_p;
> +#if ACC_DEVICE_TYPE_host
> +  on_host_p = 1;
> +#else
> +  on_host_p = 0;
> +#endif
>  
>    /* Offloaded.  */
>  
>  #pragma acc parallel
>    {
> -    if (acc_on_device (acc_device_none))
> +    if (on_host_p != ACC_ON_DEVICE (acc_device_none))
>        abort ();
> -    if (acc_on_device (acc_device_host))
> +    if (on_host_p != ACC_ON_DEVICE (acc_device_host))
>        abort ();
> -    if (!acc_on_device (acc_device_not_host))
> +    if (on_host_p == ACC_ON_DEVICE (acc_device_not_host))
>        abort ();
> +
>  #if ACC_DEVICE_TYPE_nvidia
> -    if (!acc_on_device (acc_device_nvidia))
> +    if (!ACC_ON_DEVICE (acc_device_nvidia))
>        abort ();
>  #else
> -    if (acc_on_device (acc_device_nvidia))
> +    if (ACC_ON_DEVICE (acc_device_nvidia))
>        abort ();
>  #endif
>    }
>  
> -#endif
>  
>    return 0;
>  }
> diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
> new file mode 100644
> index 0000000..c3b3378
> --- /dev/null
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
> @@ -0,0 +1,21 @@
> +/* With the acc_on_device builtin enabled, we don't expect any calls to the
> +   libgomp library function.  */
> +/* { dg-additional-options "-fdump-rtl-expand" }
> +   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 0 "expand" } } */
> +
> +#include <openacc.h>
> +
> +#ifdef __OPTIMIZE__
> +# define ACC_ON_DEVICE acc_on_device
> +#else
> +/* Without optimizations enabled, we're not expecting the acc_on_device builtin
> +   to be used, so use here a "-O2" wrapper.  */
> +#pragma acc routine seq
> +static int __attribute__ ((optimize ("O2")))
> +ACC_ON_DEVICE (acc_device_t arg)
> +{
> +  return acc_on_device (arg);
> +}
> +#endif
> +
> +#include "acc_on_device-1.c"
> diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> index 8308f7c..1c48ab3 100644
> --- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> @@ -4,14 +4,12 @@
>  #include <limits.h>
>  #include <openacc.h>
>  
> -/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
> -   not behaving as expected for -O0.  */
>  #pragma acc routine seq
>  static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
>  {
> -  if (acc_on_device ((int) acc_device_host))
> +  if (acc_on_device (acc_device_host))
>      return 0;
> -  else if (acc_on_device ((int) acc_device_nvidia))
> +  else if (acc_on_device (acc_device_nvidia))
>      {
>        unsigned int r;
>        asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
> @@ -24,9 +22,9 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
>  #pragma acc routine seq
>  static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
>  {
> -  if (acc_on_device ((int) acc_device_host))
> +  if (acc_on_device (acc_device_host))
>      return 0;
> -  else if (acc_on_device ((int) acc_device_nvidia))
> +  else if (acc_on_device (acc_device_nvidia))
>      {
>        unsigned int r;
>        asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
> @@ -39,9 +37,9 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
>  #pragma acc routine seq
>  static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
>  {
> -  if (acc_on_device ((int) acc_device_host))
> +  if (acc_on_device (acc_device_host))
>      return 0;
> -  else if (acc_on_device ((int) acc_device_nvidia))
> +  else if (acc_on_device (acc_device_nvidia))
>      {
>        unsigned int r;
>        asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));


Grüße
 Thomas
Thomas Schwinge June 6, 2017, 6:35 a.m. UTC | #2
Hi!

Ping.

On Tue, 30 May 2017 14:35:29 +0200, I wrote:
> Ping.
> 
> On Tue, 23 May 2017 17:31:11 +0200, I wrote:
> > On Thu, 29 Oct 2015 17:22:46 -0700, Nathan Sidwell <nathan@acm.org> wrote:
> > > acc_on_device and it's builtin had a conflict.  The function formally takes an 
> > > enum argument, but the builtin takes an int -- primarily to avoid the compiler 
> > > having to generate the enum  type internally.
> > > 
> > > This works fine for C,  where the external declaration of the function (in 
> > > openacc.h) matches up with the builtin, and we optimize the builtin as expected.
> > > 
> > > It fails for C++ where the builtin doesn't match the declaration in the header. 
> > >   We end up with emitting a call to acc_on_device,  which is resolved by 
> > > libgomp.  Unfortunately that means we fail to optimize.  [...]
> > 
> > > [Nathan's trunk r229562] leaves things unchanged for C --  declare a function with an enum arg. 
> > >   But for C++ we the extern "C" declaration takes an int -- and therefore 
> > > matches the builtin.  We insert an inline wrapper that takes an enum argument. 
> > > Because of C++'s overload resolution both the wrapper and the int-taking 
> > > declaration can have the same source name.
> > 
> > > --- libgomp/openacc.h	(revision 229535)
> > > +++ libgomp/openacc.h	(working copy)
> > 
> > > -int acc_on_device (acc_device_t) __GOACC_NOTHROW;
> > > +#ifdef __cplusplus
> > > +int acc_on_device (int __arg) __GOACC_NOTHROW;
> > > +#else
> > > +int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW;
> > > +#endif
> > 
> > >  #ifdef __cplusplus
> > >  }
> > > +
> > > +/* Forwarding function with correctly typed arg.  */
> > > +
> > > +inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> > > +{
> > > +  return acc_on_device ((int) __arg);
> > > +}
> > >  #endif
> > 
> > > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c	(revision 0)
> > > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c	(working copy)
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O2" } */
> > > +
> > > +#include <openacc.h>
> > > +
> > > +int Foo (acc_device_t x)
> > > +{
> > > +  return acc_on_device (x);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-not "acc_on_device" } } */
> > 
> > As a user, I'd expect that when compiling such code with "-O0" instead of
> > "-O2", but adding "__attribute__ ((optimize ("O2")))" to "Foo", that I'd
> > then get "acc_on_device" expanded as a builtin, and no calls to the
> > "acc_on_device library function.  In C++ that is currently not working,
> > because the "Forwarding function with correctly typed arg" (cited above)
> > doesn't "inherit" that "optimize" attribute.  Making that one "always
> > inline" resolves the problem.  Also I cleaned up and extended testing
> > some more.  OK for trunk?
> > 
> > commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
> > Author: Thomas Schwinge <thomas@codesourcery.com>
> > Date:   Tue May 23 13:21:14 2017 +0200
> > 
> >     Make the OpenACC C++ acc_on_device wrapper "always inline"
> >     
> >             libgomp/
> >             * openacc.h [__cplusplus] (acc_on_device): Mark as "always
> >             inline".
> >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
> >             file; test cases already present...
> >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
> >             this file.  Update.
> >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
> >             file; test cases now present...
> >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
> >             this new file.
> >             * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.
> > ---
> >  libgomp/openacc.h                                  |  3 +-
> >  .../libgomp.oacc-c-c++-common/acc-on-device-2.c    | 22 -------------
> >  .../libgomp.oacc-c-c++-common/acc-on-device.c      | 12 -------
> >  .../libgomp.oacc-c-c++-common/acc_on_device-1.c    | 38 +++++++++++++---------
> >  .../libgomp.oacc-c-c++-common/acc_on_device-2.c    | 21 ++++++++++++
> >  .../libgomp.oacc-c-c++-common/parallel-dims.c      | 14 ++++----
> >  6 files changed, 52 insertions(+), 58 deletions(-)
> > 
> > diff --git libgomp/openacc.h libgomp/openacc.h
> > index 137e2c1..266f559 100644
> > --- libgomp/openacc.h
> > +++ libgomp/openacc.h
> > @@ -121,7 +121,8 @@ int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW;
> >  /* Forwarding function with correctly typed arg.  */
> >  
> >  #pragma acc routine seq
> > -inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> > +inline __attribute__ ((__always_inline__)) int
> > +acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> >  {
> >    return acc_on_device ((int) __arg);
> >  }
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
> > deleted file mode 100644
> > index bfcb67d..0000000
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
> > +++ /dev/null
> > @@ -1,22 +0,0 @@
> > -/* Test the acc_on_device library function. */
> > -/* { dg-additional-options "-fno-builtin-acc_on_device" } */
> > -
> > -#include <openacc.h>
> > -
> > -int main ()
> > -{
> > -  int dev;
> > -  
> > -#pragma acc parallel copyout (dev)
> > -  {
> > -    dev = acc_on_device (acc_device_not_host);
> > -  }
> > -
> > -  int expect = 1;
> > -  
> > -#if  ACC_DEVICE_TYPE_host
> > -  expect = 0;
> > -#endif
> > -  
> > -  return dev != expect;
> > -}
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
> > deleted file mode 100644
> > index e0d8710..0000000
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
> > +++ /dev/null
> > @@ -1,12 +0,0 @@
> > -/* { dg-do compile } */
> > -/* We don't expect this to work with optimizations disabled.
> > -   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
> > -
> > -#include <openacc.h>
> > -
> > -int Foo (acc_device_t x)
> > -{
> > -  return acc_on_device (x);
> > -}
> > -
> > -/* { dg-final { scan-assembler-not "acc_on_device" } } */
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> > index 8112745..eb962e4 100644
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> > @@ -1,6 +1,9 @@
> >  /* Disable the acc_on_device builtin; we want to test the libgomp library
> >     function.  */
> > +/* { dg-additional-options "-DACC_ON_DEVICE=acc_on_device" } */
> >  /* { dg-additional-options "-fno-builtin-acc_on_device" } */
> > +/* { dg-additional-options "-fdump-rtl-expand" }
> > +   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 12 "expand" } } */
> >  
> >  #include <stdlib.h>
> >  #include <openacc.h>
> > @@ -11,13 +14,13 @@ main (int argc, char *argv[])
> >    /* Host.  */
> >  
> >    {
> > -    if (!acc_on_device (acc_device_none))
> > +    if (!ACC_ON_DEVICE (acc_device_none))
> >        abort ();
> > -    if (!acc_on_device (acc_device_host))
> > +    if (!ACC_ON_DEVICE (acc_device_host))
> >        abort ();
> > -    if (acc_on_device (acc_device_not_host))
> > +    if (ACC_ON_DEVICE (acc_device_not_host))
> >        abort ();
> > -    if (acc_on_device (acc_device_nvidia))
> > +    if (ACC_ON_DEVICE (acc_device_nvidia))
> >        abort ();
> >    }
> >  
> > @@ -26,39 +29,44 @@ main (int argc, char *argv[])
> >  
> >  #pragma acc parallel if(0)
> >    {
> > -    if (!acc_on_device (acc_device_none))
> > +    if (!ACC_ON_DEVICE (acc_device_none))
> >        abort ();
> > -    if (!acc_on_device (acc_device_host))
> > +    if (!ACC_ON_DEVICE (acc_device_host))
> >        abort ();
> > -    if (acc_on_device (acc_device_not_host))
> > +    if (ACC_ON_DEVICE (acc_device_not_host))
> >        abort ();
> > -    if (acc_on_device (acc_device_nvidia))
> > +    if (ACC_ON_DEVICE (acc_device_nvidia))
> >        abort ();
> >    }
> >  
> >  
> > -#if !ACC_DEVICE_TYPE_host
> > +  int on_host_p;
> > +#if ACC_DEVICE_TYPE_host
> > +  on_host_p = 1;
> > +#else
> > +  on_host_p = 0;
> > +#endif
> >  
> >    /* Offloaded.  */
> >  
> >  #pragma acc parallel
> >    {
> > -    if (acc_on_device (acc_device_none))
> > +    if (on_host_p != ACC_ON_DEVICE (acc_device_none))
> >        abort ();
> > -    if (acc_on_device (acc_device_host))
> > +    if (on_host_p != ACC_ON_DEVICE (acc_device_host))
> >        abort ();
> > -    if (!acc_on_device (acc_device_not_host))
> > +    if (on_host_p == ACC_ON_DEVICE (acc_device_not_host))
> >        abort ();
> > +
> >  #if ACC_DEVICE_TYPE_nvidia
> > -    if (!acc_on_device (acc_device_nvidia))
> > +    if (!ACC_ON_DEVICE (acc_device_nvidia))
> >        abort ();
> >  #else
> > -    if (acc_on_device (acc_device_nvidia))
> > +    if (ACC_ON_DEVICE (acc_device_nvidia))
> >        abort ();
> >  #endif
> >    }
> >  
> > -#endif
> >  
> >    return 0;
> >  }
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
> > new file mode 100644
> > index 0000000..c3b3378
> > --- /dev/null
> > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
> > @@ -0,0 +1,21 @@
> > +/* With the acc_on_device builtin enabled, we don't expect any calls to the
> > +   libgomp library function.  */
> > +/* { dg-additional-options "-fdump-rtl-expand" }
> > +   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 0 "expand" } } */
> > +
> > +#include <openacc.h>
> > +
> > +#ifdef __OPTIMIZE__
> > +# define ACC_ON_DEVICE acc_on_device
> > +#else
> > +/* Without optimizations enabled, we're not expecting the acc_on_device builtin
> > +   to be used, so use here a "-O2" wrapper.  */
> > +#pragma acc routine seq
> > +static int __attribute__ ((optimize ("O2")))
> > +ACC_ON_DEVICE (acc_device_t arg)
> > +{
> > +  return acc_on_device (arg);
> > +}
> > +#endif
> > +
> > +#include "acc_on_device-1.c"
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> > index 8308f7c..1c48ab3 100644
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> > @@ -4,14 +4,12 @@
> >  #include <limits.h>
> >  #include <openacc.h>
> >  
> > -/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
> > -   not behaving as expected for -O0.  */
> >  #pragma acc routine seq
> >  static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
> >  {
> > -  if (acc_on_device ((int) acc_device_host))
> > +  if (acc_on_device (acc_device_host))
> >      return 0;
> > -  else if (acc_on_device ((int) acc_device_nvidia))
> > +  else if (acc_on_device (acc_device_nvidia))
> >      {
> >        unsigned int r;
> >        asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
> > @@ -24,9 +22,9 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
> >  #pragma acc routine seq
> >  static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
> >  {
> > -  if (acc_on_device ((int) acc_device_host))
> > +  if (acc_on_device (acc_device_host))
> >      return 0;
> > -  else if (acc_on_device ((int) acc_device_nvidia))
> > +  else if (acc_on_device (acc_device_nvidia))
> >      {
> >        unsigned int r;
> >        asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
> > @@ -39,9 +37,9 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
> >  #pragma acc routine seq
> >  static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
> >  {
> > -  if (acc_on_device ((int) acc_device_host))
> > +  if (acc_on_device (acc_device_host))
> >      return 0;
> > -  else if (acc_on_device ((int) acc_device_nvidia))
> > +  else if (acc_on_device (acc_device_nvidia))
> >      {
> >        unsigned int r;
> >        asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));


Grüße
 Thomas
Jakub Jelinek June 6, 2017, 6:58 a.m. UTC | #3
On Tue, Jun 06, 2017 at 08:35:40AM +0200, Thomas Schwinge wrote:
> > > commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
> > > Author: Thomas Schwinge <thomas@codesourcery.com>
> > > Date:   Tue May 23 13:21:14 2017 +0200
> > > 
> > >     Make the OpenACC C++ acc_on_device wrapper "always inline"
> > >     
> > >             libgomp/
> > >             * openacc.h [__cplusplus] (acc_on_device): Mark as "always
> > >             inline".
> > >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
> > >             file; test cases already present...
> > >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
> > >             this file.  Update.
> > >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
> > >             file; test cases now present...
> > >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
> > >             this new file.
> > >             * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.

I don't like this very much.
Can't you instead just turn the builtin into BT_FN_INT_VAR and diagnose
during folding if it has no or 2+ arguments or if the argument is not type
compatible with int?

	Jakub
Thomas Schwinge June 6, 2017, 11:16 a.m. UTC | #4
Hi Jakub!

On Tue, 6 Jun 2017 08:58:21 +0200, Jakub Jelinek <jakub@redhat.com> wrote:
> On Tue, Jun 06, 2017 at 08:35:40AM +0200, Thomas Schwinge wrote:
> > > > commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
> > > > Author: Thomas Schwinge <thomas@codesourcery.com>
> > > > Date:   Tue May 23 13:21:14 2017 +0200
> > > > 
> > > >     Make the OpenACC C++ acc_on_device wrapper "always inline"
> > > >     
> > > >             libgomp/
> > > >             * openacc.h [__cplusplus] (acc_on_device): Mark as "always
> > > >             inline".
> > > >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
> > > >             file; test cases already present...
> > > >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
> > > >             this file.  Update.
> > > >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
> > > >             file; test cases now present...
> > > >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
> > > >             this new file.
> > > >             * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.
> 
> I don't like this very much.

Thanks for having a look.  Would you please clarify whether "this"
applies to my "always inline" changes and testing additions that you
quoted, or rather to the C++ "acc_on_device" wrapper function as it is
currently present?

> Can't you instead just turn the builtin into BT_FN_INT_VAR and diagnose
> during folding if it has no or 2+ arguments or if the argument is not type
> compatible with int?

Thanks for the suggestion, I'll look into that!

In terms of incremental progress, do you oppose that I commit my existing
patch now, and then rework the builtin in a later patch?


Grüße
 Thomas
Jakub Jelinek June 6, 2017, 11:20 a.m. UTC | #5
On Tue, Jun 06, 2017 at 01:16:03PM +0200, Thomas Schwinge wrote:
> On Tue, 6 Jun 2017 08:58:21 +0200, Jakub Jelinek <jakub@redhat.com> wrote:
> > On Tue, Jun 06, 2017 at 08:35:40AM +0200, Thomas Schwinge wrote:
> > > > > commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
> > > > > Author: Thomas Schwinge <thomas@codesourcery.com>
> > > > > Date:   Tue May 23 13:21:14 2017 +0200
> > > > > 
> > > > >     Make the OpenACC C++ acc_on_device wrapper "always inline"
> > > > >     
> > > > >             libgomp/
> > > > >             * openacc.h [__cplusplus] (acc_on_device): Mark as "always
> > > > >             inline".
> > > > >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
> > > > >             file; test cases already present...
> > > > >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
> > > > >             this file.  Update.
> > > > >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
> > > > >             file; test cases now present...
> > > > >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
> > > > >             this new file.
> > > > >             * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.
> > 
> > I don't like this very much.
> 
> Thanks for having a look.  Would you please clarify whether "this"
> applies to my "always inline" changes and testing additions that you
> quoted, or rather to the C++ "acc_on_device" wrapper function as it is
> currently present?

The C++ acc_on_device wrapper altogether, though of course always inline on
it doesn't sound right either (what if you want to take acc_on_device
address?).

> > Can't you instead just turn the builtin into BT_FN_INT_VAR and diagnose
> > during folding if it has no or 2+ arguments or if the argument is not type
> > compatible with int?
> 
> Thanks for the suggestion, I'll look into that!
> 
> In terms of incremental progress, do you oppose that I commit my existing
> patch now, and then rework the builtin in a later patch?

We are in stage1 and this doesn't seem to be a blocker, I think it is better
to do it right, no need to do it incrementally.

	Jakub
diff mbox

Patch

diff --git libgomp/openacc.h libgomp/openacc.h
index 137e2c1..266f559 100644
--- libgomp/openacc.h
+++ libgomp/openacc.h
@@ -121,7 +121,8 @@  int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW;
 /* Forwarding function with correctly typed arg.  */
 
 #pragma acc routine seq
-inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
+inline __attribute__ ((__always_inline__)) int
+acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
 {
   return acc_on_device ((int) __arg);
 }
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
deleted file mode 100644
index bfcb67d..0000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
+++ /dev/null
@@ -1,22 +0,0 @@ 
-/* Test the acc_on_device library function. */
-/* { dg-additional-options "-fno-builtin-acc_on_device" } */
-
-#include <openacc.h>
-
-int main ()
-{
-  int dev;
-  
-#pragma acc parallel copyout (dev)
-  {
-    dev = acc_on_device (acc_device_not_host);
-  }
-
-  int expect = 1;
-  
-#if  ACC_DEVICE_TYPE_host
-  expect = 0;
-#endif
-  
-  return dev != expect;
-}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
deleted file mode 100644
index e0d8710..0000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
+++ /dev/null
@@ -1,12 +0,0 @@ 
-/* { dg-do compile } */
-/* We don't expect this to work with optimizations disabled.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
-#include <openacc.h>
-
-int Foo (acc_device_t x)
-{
-  return acc_on_device (x);
-}
-
-/* { dg-final { scan-assembler-not "acc_on_device" } } */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
index 8112745..eb962e4 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
@@ -1,6 +1,9 @@ 
 /* Disable the acc_on_device builtin; we want to test the libgomp library
    function.  */
+/* { dg-additional-options "-DACC_ON_DEVICE=acc_on_device" } */
 /* { dg-additional-options "-fno-builtin-acc_on_device" } */
+/* { dg-additional-options "-fdump-rtl-expand" }
+   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 12 "expand" } } */
 
 #include <stdlib.h>
 #include <openacc.h>
@@ -11,13 +14,13 @@  main (int argc, char *argv[])
   /* Host.  */
 
   {
-    if (!acc_on_device (acc_device_none))
+    if (!ACC_ON_DEVICE (acc_device_none))
       abort ();
-    if (!acc_on_device (acc_device_host))
+    if (!ACC_ON_DEVICE (acc_device_host))
       abort ();
-    if (acc_on_device (acc_device_not_host))
+    if (ACC_ON_DEVICE (acc_device_not_host))
       abort ();
-    if (acc_on_device (acc_device_nvidia))
+    if (ACC_ON_DEVICE (acc_device_nvidia))
       abort ();
   }
 
@@ -26,39 +29,44 @@  main (int argc, char *argv[])
 
 #pragma acc parallel if(0)
   {
-    if (!acc_on_device (acc_device_none))
+    if (!ACC_ON_DEVICE (acc_device_none))
       abort ();
-    if (!acc_on_device (acc_device_host))
+    if (!ACC_ON_DEVICE (acc_device_host))
       abort ();
-    if (acc_on_device (acc_device_not_host))
+    if (ACC_ON_DEVICE (acc_device_not_host))
       abort ();
-    if (acc_on_device (acc_device_nvidia))
+    if (ACC_ON_DEVICE (acc_device_nvidia))
       abort ();
   }
 
 
-#if !ACC_DEVICE_TYPE_host
+  int on_host_p;
+#if ACC_DEVICE_TYPE_host
+  on_host_p = 1;
+#else
+  on_host_p = 0;
+#endif
 
   /* Offloaded.  */
 
 #pragma acc parallel
   {
-    if (acc_on_device (acc_device_none))
+    if (on_host_p != ACC_ON_DEVICE (acc_device_none))
       abort ();
-    if (acc_on_device (acc_device_host))
+    if (on_host_p != ACC_ON_DEVICE (acc_device_host))
       abort ();
-    if (!acc_on_device (acc_device_not_host))
+    if (on_host_p == ACC_ON_DEVICE (acc_device_not_host))
       abort ();
+
 #if ACC_DEVICE_TYPE_nvidia
-    if (!acc_on_device (acc_device_nvidia))
+    if (!ACC_ON_DEVICE (acc_device_nvidia))
       abort ();
 #else
-    if (acc_on_device (acc_device_nvidia))
+    if (ACC_ON_DEVICE (acc_device_nvidia))
       abort ();
 #endif
   }
 
-#endif
 
   return 0;
 }
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
new file mode 100644
index 0000000..c3b3378
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
@@ -0,0 +1,21 @@ 
+/* With the acc_on_device builtin enabled, we don't expect any calls to the
+   libgomp library function.  */
+/* { dg-additional-options "-fdump-rtl-expand" }
+   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 0 "expand" } } */
+
+#include <openacc.h>
+
+#ifdef __OPTIMIZE__
+# define ACC_ON_DEVICE acc_on_device
+#else
+/* Without optimizations enabled, we're not expecting the acc_on_device builtin
+   to be used, so use here a "-O2" wrapper.  */
+#pragma acc routine seq
+static int __attribute__ ((optimize ("O2")))
+ACC_ON_DEVICE (acc_device_t arg)
+{
+  return acc_on_device (arg);
+}
+#endif
+
+#include "acc_on_device-1.c"
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index 8308f7c..1c48ab3 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -4,14 +4,12 @@ 
 #include <limits.h>
 #include <openacc.h>
 
-/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
-   not behaving as expected for -O0.  */
 #pragma acc routine seq
 static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
 {
-  if (acc_on_device ((int) acc_device_host))
+  if (acc_on_device (acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device (acc_device_nvidia))
     {
       unsigned int r;
       asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
@@ -24,9 +22,9 @@  static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
 #pragma acc routine seq
 static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
 {
-  if (acc_on_device ((int) acc_device_host))
+  if (acc_on_device (acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device (acc_device_nvidia))
     {
       unsigned int r;
       asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
@@ -39,9 +37,9 @@  static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
 #pragma acc routine seq
 static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
 {
-  if (acc_on_device ((int) acc_device_host))
+  if (acc_on_device (acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device (acc_device_nvidia))
     {
       unsigned int r;
       asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));