diff mbox series

[aarch64,libstdc++] Use shufflevector instead of shuffle in opt_random.h

Message ID c911a45e-5924-4a4b-9b6b-bb3af0cc7ee0@nvidia.com
State New
Headers show
Series [aarch64,libstdc++] Use shufflevector instead of shuffle in opt_random.h | expand

Commit Message

Ricardo Jesus Oct. 9, 2024, 9:41 a.m. UTC
This patch modifies the implementation of the vectorized Mersenne
Twister random number generator to use __builtin_shufflevector instead
of __builtin_shuffle. This makes it (almost) compatible with Clang.

To make the implementation fully compatible with Clang, Clang will need
to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
currently it does not. This looks like an oversight in Clang and so will
be addressed separately.

I see no codegen change with this patch.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Ricardo Jesus <rjj@nvidia.com>

2024-09-05  Ricardo Jesus  <rjj@nvidia.com>

	* config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
	of __builtin_shuffle with __builtin_shufflevector.
	(__aarch64_lsl_128): Move shift amount to a template parameter.
	(__aarch64_lsr_128): Move shift amount to a template parameter.
	(__aarch64_recursion): Update call sites of __aarch64_lsl_128
	and __aarch64_lsr_128.
---
  .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++++++++++--------
  1 file changed, 16 insertions(+), 12 deletions(-)

Comments

Jonathan Wakely Oct. 11, 2024, 3:51 p.m. UTC | #1
On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus <rjj@nvidia.com> wrote:
>
> This patch modifies the implementation of the vectorized Mersenne
> Twister random number generator to use __builtin_shufflevector instead
> of __builtin_shuffle. This makes it (almost) compatible with Clang.
>
> To make the implementation fully compatible with Clang, Clang will need
> to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> currently it does not. This looks like an oversight in Clang and so will
> be addressed separately.
>
> I see no codegen change with this patch.

I'm not qualified to review this myself, but I'd at least like to see
the CI checks passing:
https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7ee0@nvidia.com/
Apparently the patch couldn't be applied.

Please configure your email client (thunderbird?) to not munge the
patch, or attach it rather than sending inline. Or just use
git-send-email :-)


>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Signed-off-by: Ricardo Jesus <rjj@nvidia.com>
>
> 2024-09-05  Ricardo Jesus  <rjj@nvidia.com>
>
>         * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
>         of __builtin_shuffle with __builtin_shufflevector.
>         (__aarch64_lsl_128): Move shift amount to a template parameter.
>         (__aarch64_lsr_128): Move shift amount to a template parameter.
>         (__aarch64_recursion): Update call sites of __aarch64_lsl_128
>         and __aarch64_lsr_128.
> ---
>   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++++++++++--------
>   1 file changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> index 7f756d1572f..7eb816abcd0 100644
> --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> @@ -35,13 +35,13 @@
>   #ifdef __ARM_NEON
>
>   #ifdef __ARM_BIG_ENDIAN
> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> -    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> -     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> +    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> +    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
>   #else
> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> -    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> -     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
> +    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> +    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
>   #endif
>
>   #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>     namespace {
>       // Logical Shift right 128-bits by c * 8 bits
>
> -    __extension__ extern __inline __Uint32x4_t
> +    __extension__
> +    template<int __c>
> +    extern __inline __Uint32x4_t
>       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
> +    __aarch64_lsr_128 (__Uint8x16_t __a)
>       {
>         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
>                                    0, 0, 0, 0, 0, 0, 0, 0};
> @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>
>       // Logical Shift left 128-bits by c * 8 bits
>
> -    __extension__ extern __inline __Uint32x4_t
> +    __extension__
> +    template<int __c>
> +    extern __inline __Uint32x4_t
>       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
> +    __aarch64_lsl_128 (__Uint8x16_t __a)
>       {
>         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
>                                    0, 0, 0, 0, 0, 0, 0, 0};
> @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>                                                __Uint32x4_t __e)
>       {
>         __Uint32x4_t __y = (__b >> __sr1);
> -      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
> +      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
>
>         __Uint32x4_t __v = __d << __sl1;
>
>         __z = __z ^ __a;
>         __z = __z ^ __v;
>
> -      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
> +      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
>
>         __y = __y & __e;
>         __z = __z ^ __x;
> --
> 2.44.0
>
Christophe Lyon Oct. 11, 2024, 6:51 p.m. UTC | #2
On Fri, 11 Oct 2024 at 17:52, Jonathan Wakely <jwakely@redhat.com> wrote:
>
> On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus <rjj@nvidia.com> wrote:
> >
> > This patch modifies the implementation of the vectorized Mersenne
> > Twister random number generator to use __builtin_shufflevector instead
> > of __builtin_shuffle. This makes it (almost) compatible with Clang.
> >
> > To make the implementation fully compatible with Clang, Clang will need
> > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> > currently it does not. This looks like an oversight in Clang and so will
> > be addressed separately.
> >
> > I see no codegen change with this patch.
>
> I'm not qualified to review this myself, but I'd at least like to see
> the CI checks passing:
> https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7ee0@nvidia.com/
> Apparently the patch couldn't be applied.
>
> Please configure your email client (thunderbird?) to not munge the
> patch, or attach it rather than sending inline. Or just use
> git-send-email :-)
>
Hi!

The problem is not with how the patch was sent: patchwork managed to
see it as a patch, and the CI tried to apply it.
The problem is that for some reason, git was not able to apply the
patch to our current baseline.
Unfortunately, we do not go as far as calling 'git am
--show-current-patch=diff' or something else to provide more info in
our CI logs, so we can only guess that something went wrong. Maybe
your patch is based against a too old revision of GCC?

Thanks,

Christophe


>
> >
> > Bootstrapped and tested on aarch64-none-linux-gnu.
> >
> > Signed-off-by: Ricardo Jesus <rjj@nvidia.com>
> >
> > 2024-09-05  Ricardo Jesus  <rjj@nvidia.com>
> >
> >         * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
> >         of __builtin_shuffle with __builtin_shufflevector.
> >         (__aarch64_lsl_128): Move shift amount to a template parameter.
> >         (__aarch64_lsr_128): Move shift amount to a template parameter.
> >         (__aarch64_recursion): Update call sites of __aarch64_lsl_128
> >         and __aarch64_lsr_128.
> > ---
> >   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++++++++++--------
> >   1 file changed, 16 insertions(+), 12 deletions(-)
> >
> > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > index 7f756d1572f..7eb816abcd0 100644
> > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > @@ -35,13 +35,13 @@
> >   #ifdef __ARM_NEON
> >
> >   #ifdef __ARM_BIG_ENDIAN
> > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> > -    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > -     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> > +    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > +    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
> >   #else
> > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> > -    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > -     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
> > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
> > +    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > +    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
> >   #endif
> >
> >   #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> > @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> >     namespace {
> >       // Logical Shift right 128-bits by c * 8 bits
> >
> > -    __extension__ extern __inline __Uint32x4_t
> > +    __extension__
> > +    template<int __c>
> > +    extern __inline __Uint32x4_t
> >       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
> > +    __aarch64_lsr_128 (__Uint8x16_t __a)
> >       {
> >         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> >                                    0, 0, 0, 0, 0, 0, 0, 0};
> > @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> >
> >       // Logical Shift left 128-bits by c * 8 bits
> >
> > -    __extension__ extern __inline __Uint32x4_t
> > +    __extension__
> > +    template<int __c>
> > +    extern __inline __Uint32x4_t
> >       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
> > +    __aarch64_lsl_128 (__Uint8x16_t __a)
> >       {
> >         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> >                                    0, 0, 0, 0, 0, 0, 0, 0};
> > @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> >                                                __Uint32x4_t __e)
> >       {
> >         __Uint32x4_t __y = (__b >> __sr1);
> > -      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
> > +      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
> >
> >         __Uint32x4_t __v = __d << __sl1;
> >
> >         __z = __z ^ __a;
> >         __z = __z ^ __v;
> >
> > -      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
> > +      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
> >
> >         __y = __y & __e;
> >         __z = __z ^ __x;
> > --
> > 2.44.0
> >
>
Jonathan Wakely Oct. 11, 2024, 7 p.m. UTC | #3
On Fri, 11 Oct 2024 at 19:52, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> On Fri, 11 Oct 2024 at 17:52, Jonathan Wakely <jwakely@redhat.com> wrote:
> >
> > On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus <rjj@nvidia.com> wrote:
> > >
> > > This patch modifies the implementation of the vectorized Mersenne
> > > Twister random number generator to use __builtin_shufflevector instead
> > > of __builtin_shuffle. This makes it (almost) compatible with Clang.
> > >
> > > To make the implementation fully compatible with Clang, Clang will need
> > > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> > > currently it does not. This looks like an oversight in Clang and so will
> > > be addressed separately.
> > >
> > > I see no codegen change with this patch.
> >
> > I'm not qualified to review this myself, but I'd at least like to see
> > the CI checks passing:
> > https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7ee0@nvidia.com/
> > Apparently the patch couldn't be applied.
> >
> > Please configure your email client (thunderbird?) to not munge the
> > patch, or attach it rather than sending inline. Or just use
> > git-send-email :-)
> >
> Hi!
>
> The problem is not with how the patch was sent: patchwork managed to
> see it as a patch, and the CI tried to apply it.
> The problem is that for some reason, git was not able to apply the
> patch to our current baseline.
> Unfortunately, we do not go as far as calling 'git am
> --show-current-patch=diff' or something else to provide more info in
> our CI logs, so we can only guess that something went wrong. Maybe
> your patch is based against a too old revision of GCC?

No, that file hasn't changed anywhere near the patch. The problem is
that the patch was munged by thunderbird, adding line breaks where
they corrupt the patch:


Applying: Use shufflevector instead of shuffle in opt_random.h
error: git diff header lacks filename information when removing 1
leading pathname component (line 6)
Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h

I fixed that manually, but it still fails:

Applying: Use shufflevector instead of shuffle in opt_random.h
error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:35
error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
does not apply
Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h

That's because of an incorrect number of space characters on the
unchanged context lines around the +/- diffs. I fixed that manually,
and failed at the next chunk:

Applying: Use shufflevector instead of shuffle in opt_random.h
error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:52
error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
does not apply
Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h

So the problem is how the patch was sent.

>
> Thanks,
>
> Christophe
>
>
> >
> > >
> > > Bootstrapped and tested on aarch64-none-linux-gnu.
> > >
> > > Signed-off-by: Ricardo Jesus <rjj@nvidia.com>
> > >
> > > 2024-09-05  Ricardo Jesus  <rjj@nvidia.com>
> > >
> > >         * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
> > >         of __builtin_shuffle with __builtin_shufflevector.
> > >         (__aarch64_lsl_128): Move shift amount to a template parameter.
> > >         (__aarch64_lsr_128): Move shift amount to a template parameter.
> > >         (__aarch64_recursion): Update call sites of __aarch64_lsl_128
> > >         and __aarch64_lsr_128.
> > > ---
> > >   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++++++++++--------
> > >   1 file changed, 16 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > index 7f756d1572f..7eb816abcd0 100644
> > > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > @@ -35,13 +35,13 @@
> > >   #ifdef __ARM_NEON
> > >
> > >   #ifdef __ARM_BIG_ENDIAN
> > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> > > -    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > > -     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> > > +    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > > +    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
> > >   #else
> > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> > > -    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > > -     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
> > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
> > > +    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > > +    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
> > >   #endif
> > >
> > >   #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> > > @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > >     namespace {
> > >       // Logical Shift right 128-bits by c * 8 bits
> > >
> > > -    __extension__ extern __inline __Uint32x4_t
> > > +    __extension__
> > > +    template<int __c>
> > > +    extern __inline __Uint32x4_t
> > >       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
> > > +    __aarch64_lsr_128 (__Uint8x16_t __a)
> > >       {
> > >         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> > >                                    0, 0, 0, 0, 0, 0, 0, 0};
> > > @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > >
> > >       // Logical Shift left 128-bits by c * 8 bits
> > >
> > > -    __extension__ extern __inline __Uint32x4_t
> > > +    __extension__
> > > +    template<int __c>
> > > +    extern __inline __Uint32x4_t
> > >       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
> > > +    __aarch64_lsl_128 (__Uint8x16_t __a)
> > >       {
> > >         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> > >                                    0, 0, 0, 0, 0, 0, 0, 0};
> > > @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > >                                                __Uint32x4_t __e)
> > >       {
> > >         __Uint32x4_t __y = (__b >> __sr1);
> > > -      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
> > > +      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
> > >
> > >         __Uint32x4_t __v = __d << __sl1;
> > >
> > >         __z = __z ^ __a;
> > >         __z = __z ^ __v;
> > >
> > > -      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
> > > +      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
> > >
> > >         __y = __y & __e;
> > >         __z = __z ^ __x;
> > > --
> > > 2.44.0
> > >
> >
>
Christophe Lyon Oct. 11, 2024, 7:15 p.m. UTC | #4
On Fri, 11 Oct 2024 at 21:00, Jonathan Wakely <jwakely@redhat.com> wrote:
>
> On Fri, 11 Oct 2024 at 19:52, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> >
> > On Fri, 11 Oct 2024 at 17:52, Jonathan Wakely <jwakely@redhat.com> wrote:
> > >
> > > On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus <rjj@nvidia.com> wrote:
> > > >
> > > > This patch modifies the implementation of the vectorized Mersenne
> > > > Twister random number generator to use __builtin_shufflevector instead
> > > > of __builtin_shuffle. This makes it (almost) compatible with Clang.
> > > >
> > > > To make the implementation fully compatible with Clang, Clang will need
> > > > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> > > > currently it does not. This looks like an oversight in Clang and so will
> > > > be addressed separately.
> > > >
> > > > I see no codegen change with this patch.
> > >
> > > I'm not qualified to review this myself, but I'd at least like to see
> > > the CI checks passing:
> > > https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7ee0@nvidia.com/
> > > Apparently the patch couldn't be applied.
> > >
> > > Please configure your email client (thunderbird?) to not munge the
> > > patch, or attach it rather than sending inline. Or just use
> > > git-send-email :-)
> > >
> > Hi!
> >
> > The problem is not with how the patch was sent: patchwork managed to
> > see it as a patch, and the CI tried to apply it.
> > The problem is that for some reason, git was not able to apply the
> > patch to our current baseline.
> > Unfortunately, we do not go as far as calling 'git am
> > --show-current-patch=diff' or something else to provide more info in
> > our CI logs, so we can only guess that something went wrong. Maybe
> > your patch is based against a too old revision of GCC?
>
> No, that file hasn't changed anywhere near the patch. The problem is
> that the patch was munged by thunderbird, adding line breaks where
> they corrupt the patch:
>
>
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: git diff header lacks filename information when removing 1
> leading pathname component (line 6)
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
>
> I fixed that manually, but it still fails:

I see.... Thanks for checking manually!
I didn't go further than looking at the build logs.

Thanks,

Christophe

>
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:35
> error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
> does not apply
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
>
> That's because of an incorrect number of space characters on the
> unchanged context lines around the +/- diffs. I fixed that manually,
> and failed at the next chunk:
>
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:52
> error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
> does not apply
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
>
> So the problem is how the patch was sent.
>
> >
> > Thanks,
> >
> > Christophe
> >
> >
> > >
> > > >
> > > > Bootstrapped and tested on aarch64-none-linux-gnu.
> > > >
> > > > Signed-off-by: Ricardo Jesus <rjj@nvidia.com>
> > > >
> > > > 2024-09-05  Ricardo Jesus  <rjj@nvidia.com>
> > > >
> > > >         * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
> > > >         of __builtin_shuffle with __builtin_shufflevector.
> > > >         (__aarch64_lsl_128): Move shift amount to a template parameter.
> > > >         (__aarch64_lsr_128): Move shift amount to a template parameter.
> > > >         (__aarch64_recursion): Update call sites of __aarch64_lsl_128
> > > >         and __aarch64_lsr_128.
> > > > ---
> > > >   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++++++++++--------
> > > >   1 file changed, 16 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > > b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > > index 7f756d1572f..7eb816abcd0 100644
> > > > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > > @@ -35,13 +35,13 @@
> > > >   #ifdef __ARM_NEON
> > > >
> > > >   #ifdef __ARM_BIG_ENDIAN
> > > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> > > > -    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > > > -     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> > > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> > > > +    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > > > +    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
> > > >   #else
> > > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> > > > -    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > > > -     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
> > > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
> > > > +    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > > > +    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
> > > >   #endif
> > > >
> > > >   #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> > > > @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > > >     namespace {
> > > >       // Logical Shift right 128-bits by c * 8 bits
> > > >
> > > > -    __extension__ extern __inline __Uint32x4_t
> > > > +    __extension__
> > > > +    template<int __c>
> > > > +    extern __inline __Uint32x4_t
> > > >       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > > -    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
> > > > +    __aarch64_lsr_128 (__Uint8x16_t __a)
> > > >       {
> > > >         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> > > >                                    0, 0, 0, 0, 0, 0, 0, 0};
> > > > @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > > >
> > > >       // Logical Shift left 128-bits by c * 8 bits
> > > >
> > > > -    __extension__ extern __inline __Uint32x4_t
> > > > +    __extension__
> > > > +    template<int __c>
> > > > +    extern __inline __Uint32x4_t
> > > >       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > > -    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
> > > > +    __aarch64_lsl_128 (__Uint8x16_t __a)
> > > >       {
> > > >         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> > > >                                    0, 0, 0, 0, 0, 0, 0, 0};
> > > > @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > > >                                                __Uint32x4_t __e)
> > > >       {
> > > >         __Uint32x4_t __y = (__b >> __sr1);
> > > > -      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
> > > > +      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
> > > >
> > > >         __Uint32x4_t __v = __d << __sl1;
> > > >
> > > >         __z = __z ^ __a;
> > > >         __z = __z ^ __v;
> > > >
> > > > -      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
> > > > +      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
> > > >
> > > >         __y = __y & __e;
> > > >         __z = __z ^ __x;
> > > > --
> > > > 2.44.0
> > > >
> > >
> >
>
Ricardo Jesus Oct. 14, 2024, 1:30 p.m. UTC | #5
On 10/11/24 20:00, Jonathan Wakely wrote:
> 
> On Fri, 11 Oct 2024 at 19:52, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
>>
>> On Fri, 11 Oct 2024 at 17:52, Jonathan Wakely <jwakely@redhat.com> wrote:
>>>
>>> On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus <rjj@nvidia.com> wrote:
>>>>
>>>> This patch modifies the implementation of the vectorized Mersenne
>>>> Twister random number generator to use __builtin_shufflevector instead
>>>> of __builtin_shuffle. This makes it (almost) compatible with Clang.
>>>>
>>>> To make the implementation fully compatible with Clang, Clang will need
>>>> to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
>>>> currently it does not. This looks like an oversight in Clang and so will
>>>> be addressed separately.
>>>>
>>>> I see no codegen change with this patch.
>>>
>>> I'm not qualified to review this myself, but I'd at least like to see
>>> the CI checks passing:
>>> https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7ee0@nvidia.com/
>>> Apparently the patch couldn't be applied.
>>>
>>> Please configure your email client (thunderbird?) to not munge the
>>> patch, or attach it rather than sending inline. Or just use
>>> git-send-email :-)
>>>
>> Hi!
>>
>> The problem is not with how the patch was sent: patchwork managed to
>> see it as a patch, and the CI tried to apply it.
>> The problem is that for some reason, git was not able to apply the
>> patch to our current baseline.
>> Unfortunately, we do not go as far as calling 'git am
>> --show-current-patch=diff' or something else to provide more info in
>> our CI logs, so we can only guess that something went wrong. Maybe
>> your patch is based against a too old revision of GCC?
> 
> No, that file hasn't changed anywhere near the patch. The problem is
> that the patch was munged by thunderbird, adding line breaks where
> they corrupt the patch:
> 
> 
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: git diff header lacks filename information when removing 1
> leading pathname component (line 6)
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
> 
> I fixed that manually, but it still fails:
> 
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:35
> error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
> does not apply
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
> 
> That's because of an incorrect number of space characters on the
> unchanged context lines around the +/- diffs. I fixed that manually,
> and failed at the next chunk:
> 
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:52
> error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
> does not apply
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
> 
> So the problem is how the patch was sent.
> 

Hi,

I'm sorry about this. I've resubmitted the patch with git-send-email, please let me know if that still doesn't work.

Thanks,
Ricardo

>>
>> Thanks,
>>
>> Christophe
>>
>>
>>>
>>>>
>>>> Bootstrapped and tested on aarch64-none-linux-gnu.
>>>>
>>>> Signed-off-by: Ricardo Jesus <rjj@nvidia.com>
>>>>
>>>> 2024-09-05  Ricardo Jesus  <rjj@nvidia.com>
>>>>
>>>>         * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
>>>>         of __builtin_shuffle with __builtin_shufflevector.
>>>>         (__aarch64_lsl_128): Move shift amount to a template parameter.
>>>>         (__aarch64_lsr_128): Move shift amount to a template parameter.
>>>>         (__aarch64_recursion): Update call sites of __aarch64_lsl_128
>>>>         and __aarch64_lsr_128.
>>>> ---
>>>>   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++++++++++--------
>>>>   1 file changed, 16 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
>>>> b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
>>>> index 7f756d1572f..7eb816abcd0 100644
>>>> --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
>>>> +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
>>>> @@ -35,13 +35,13 @@
>>>>   #ifdef __ARM_NEON
>>>>
>>>>   #ifdef __ARM_BIG_ENDIAN
>>>> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
>>>> -    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
>>>> -     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
>>>> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
>>>> +    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
>>>> +    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
>>>>   #else
>>>> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
>>>> -    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
>>>> -     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
>>>> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
>>>> +    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
>>>> +    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
>>>>   #endif
>>>>
>>>>   #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
>>>> @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>>>>     namespace {
>>>>       // Logical Shift right 128-bits by c * 8 bits
>>>>
>>>> -    __extension__ extern __inline __Uint32x4_t
>>>> +    __extension__
>>>> +    template<int __c>
>>>> +    extern __inline __Uint32x4_t
>>>>       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>>> -    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
>>>> +    __aarch64_lsr_128 (__Uint8x16_t __a)
>>>>       {
>>>>         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
>>>>                                    0, 0, 0, 0, 0, 0, 0, 0};
>>>> @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>>>>
>>>>       // Logical Shift left 128-bits by c * 8 bits
>>>>
>>>> -    __extension__ extern __inline __Uint32x4_t
>>>> +    __extension__
>>>> +    template<int __c>
>>>> +    extern __inline __Uint32x4_t
>>>>       __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>>> -    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
>>>> +    __aarch64_lsl_128 (__Uint8x16_t __a)
>>>>       {
>>>>         const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
>>>>                                    0, 0, 0, 0, 0, 0, 0, 0};
>>>> @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>>>>                                                __Uint32x4_t __e)
>>>>       {
>>>>         __Uint32x4_t __y = (__b >> __sr1);
>>>> -      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
>>>> +      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
>>>>
>>>>         __Uint32x4_t __v = __d << __sl1;
>>>>
>>>>         __z = __z ^ __a;
>>>>         __z = __z ^ __v;
>>>>
>>>> -      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
>>>> +      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
>>>>
>>>>         __y = __y & __e;
>>>>         __z = __z ^ __x;
>>>> --
>>>> 2.44.0
>>>>
>>>
>>
>
diff mbox series

Patch

diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h 
b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
index 7f756d1572f..7eb816abcd0 100644
--- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
+++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
@@ -35,13 +35,13 @@ 
  #ifdef __ARM_NEON

  #ifdef __ARM_BIG_ENDIAN
-# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
-    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
-     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
+# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
+    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
+    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
  #else
-# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
-    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
-     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
+# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
+    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
+    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
  #endif

  #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -52,9 +52,11 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
    namespace {
      // Logical Shift right 128-bits by c * 8 bits

-    __extension__ extern __inline __Uint32x4_t
+    __extension__
+    template<int __c>
+    extern __inline __Uint32x4_t
      __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
+    __aarch64_lsr_128 (__Uint8x16_t __a)
      {
        const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
  				   0, 0, 0, 0, 0, 0, 0, 0};
@@ -64,9 +66,11 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION

      // Logical Shift left 128-bits by c * 8 bits

-    __extension__ extern __inline __Uint32x4_t
+    __extension__
+    template<int __c>
+    extern __inline __Uint32x4_t
      __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
+    __aarch64_lsl_128 (__Uint8x16_t __a)
      {
        const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
  				   0, 0, 0, 0, 0, 0, 0, 0};
@@ -82,14 +86,14 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
  					       __Uint32x4_t __e)
      {
        __Uint32x4_t __y = (__b >> __sr1);
-      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
+      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);

        __Uint32x4_t __v = __d << __sl1;

        __z = __z ^ __a;
        __z = __z ^ __v;

-      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
+      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);

        __y = __y & __e;
        __z = __z ^ __x;