aarch64: libstdc++: Use shufflevector instead of shuffle in opt_random.h

Message ID 20241014132802.506972-1-rjj@nvidia.com
State New
Series aarch64: libstdc++: Use shufflevector instead of shuffle in opt_random.h

Commit Message

Ricardo Jesus Oct. 14, 2024, 1:28 p.m. UTC
This patch modifies the implementation of the vectorized Mersenne
Twister random number generator to use __builtin_shufflevector instead
of __builtin_shuffle. This makes it (almost) compatible with Clang.
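
For reference, a minimal sketch of the difference between the two
builtins, using a generic 4-lane vector type rather than the Neon types
the header uses (the typedef and function names below are illustrative
only):

typedef unsigned int __u32x4 __attribute__ ((__vector_size__ (16)));

/* GCC-only form: the index mask is itself a vector value.  */
__u32x4
__reverse_shuffle (__u32x4 __a)
{
  return __builtin_shuffle (__a, (__u32x4) {3, 2, 1, 0});
}

/* Form accepted by Clang and by GCC 12 onwards: the indices are
   separate integer constant arguments.  */
__u32x4
__reverse_shufflevector (__u32x4 __a)
{
  return __builtin_shufflevector (__a, __a, 3, 2, 1, 0);
}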

To make the implementation fully compatible with Clang, Clang will need
to support internal Neon types like __Uint8x16_t and __Uint32x4_t,
which it currently does not. This looks like an oversight in Clang and
so will be addressed separately.
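
(The double-underscore names above are GCC-internal typedefs for the
Neon container types. Functionally equivalent types can be spelled
with the generic vector_size attribute; the typedef names below are
placeholders, not the names the header relies on.)

typedef unsigned char __u8x16_alt __attribute__ ((__vector_size__ (16)));
typedef unsigned int  __u32x4_alt __attribute__ ((__vector_size__ (16)));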

I see no codegen change with this patch.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Ricardo Jesus <rjj@nvidia.com>

2024-09-05  Ricardo Jesus  <rjj@nvidia.com>

	* config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
	of __builtin_shuffle with __builtin_shufflevector.
	(__aarch64_lsl_128): Move shift amount to a template parameter.
	(__aarch64_lsr_128): Move shift amount to a template parameter.
	(__aarch64_recursion): Update call sites of __aarch64_lsl_128
	and __aarch64_lsr_128.
---
 .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)
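
A note on the template-parameter change in the ChangeLog above: the
indices given to __builtin_shufflevector must be integer constant
expressions. A plain function parameter, even one declared const, is
not a constant expression inside the callee, whereas a non-type
template parameter is, so the byte count moves into the template
argument list. A minimal sketch of the pattern, simplified from the
patch (the helper name and typedef are placeholders):

typedef unsigned char __u8x16 __attribute__ ((__vector_size__ (16)));

template<int __c>
inline __u8x16
__shift_bytes (__u8x16 __a)
{
  const __u8x16 __zero = {0};
  /* __c is a constant expression at instantiation time, so these
     indices satisfy __builtin_shufflevector's requirements.  */
  return __builtin_shufflevector (__a, __zero,
      __c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
      __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15);
}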

Comments

Kyrylo Tkachov Oct. 14, 2024, 1:36 p.m. UTC | #1
> On 14 Oct 2024, at 15:28, Ricardo Jesus <rjj@nvidia.com> wrote:
> 
> This patch modifies the implementation of the vectorized Mersenne
> Twister random number generator to use __builtin_shufflevector instead
> of __builtin_shuffle. This makes it (almost) compatible with Clang.
> 
> To make the implementation fully compatible with Clang, Clang will need
> to support internal Neon types like __Uint8x16_t and __Uint32x4_t,
> which it currently does not. This looks like an oversight in Clang and
> so will be addressed separately.
> 
> I see no codegen change with this patch.

FWIW the change looks ok to me from an aarch64 perspective.
The only potential problem could have been a compile error, so as long as the CI buildbots are happy I think this would be safe to take.
Thanks,
Kyrill

Jonathan Wakely Oct. 14, 2024, 1:37 p.m. UTC | #2
On Mon, 14 Oct 2024 at 14:36, Kyrylo Tkachov <ktkachov@nvidia.com> wrote:
>
> > On 14 Oct 2024, at 15:28, Ricardo Jesus <rjj@nvidia.com> wrote:
> >
> > [...]
>
> FWIW the change looks ok to me from an aarch64 perspective.
> The only potential problem could have been a compile error so as long as the CI buildbots are happy I think this would be safe to take.

OK thanks for looking. I'll wait to see that CI is green (thanks for
resubmitting it, Ricardo!) and then push it.



Patch

diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
index 7f756d1572f..7eb816abcd0 100644
--- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
+++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
@@ -35,13 +35,13 @@ 
 #ifdef __ARM_NEON
 
 #ifdef __ARM_BIG_ENDIAN
-# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
-    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
-     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
+# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
+    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
+    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
 #else
-# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
-    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
-     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
+# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
+    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
+    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
 #endif
 
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -52,9 +52,11 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
   namespace {
     // Logical Shift right 128-bits by c * 8 bits
 
-    __extension__ extern __inline __Uint32x4_t
+    __extension__
+    template<int __c>
+    extern __inline __Uint32x4_t
     __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
+    __aarch64_lsr_128 (__Uint8x16_t __a)
     {
       const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
 				   0, 0, 0, 0, 0, 0, 0, 0};
@@ -64,9 +66,11 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
     // Logical Shift left 128-bits by c * 8 bits
 
-    __extension__ extern __inline __Uint32x4_t
+    __extension__
+    template<int __c>
+    extern __inline __Uint32x4_t
     __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
+    __aarch64_lsl_128 (__Uint8x16_t __a)
     {
       const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
 				   0, 0, 0, 0, 0, 0, 0, 0};
@@ -82,14 +86,14 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 					       __Uint32x4_t __e)
     {
       __Uint32x4_t __y = (__b >> __sr1);
-      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
+      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
 
       __Uint32x4_t __v = __d << __sl1;
 
       __z = __z ^ __a;
       __z = __z ^ __v;
 
-      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
+      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
 
       __y = __y & __e;
       __z = __z ^ __x;