Message ID | 20241014132802.506972-1-rjj@nvidia.com |
---|---|
State | New |
Headers | show |
Series | aarch64: libstdc++: Use shufflevector instead of shuffle in opt_random.h | expand |
> On 14 Oct 2024, at 15:28, Ricardo Jesus <rjj@nvidia.com> wrote: > > External email: Use caution opening links or attachments > > > This patch modifies the implementation of the vectorized mersenne > twister random number generator to use __builtin_shufflevector instead > of __builtin_shuffle. This makes it (almost) compatible with Clang. > > To make the implementation fully compatible with Clang, Clang will need > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which > currently it does not. This looks like an oversight in Clang and so will > be addressed separately. > > I see no codegen change with this patch. FWIW the change looks ok to me from an aarch64 perspective. The only potential problem could have been a compile error so as long as the CI buildbots are happy I think this would be safe to take. Thanks, Kyrill > > Bootstrapped and tested on aarch64-none-linux-gnu. > > Signed-off-by: Ricardo Jesus <rjj@nvidia.com> > > 2024-09-05 Ricardo Jesus <rjj@nvidia.com> > > * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses > of __builtin_shuffle with __builtin_shufflevector. > (__aarch64_lsl_128): Move shift amount to a template parameter. > (__aarch64_lsr_128): Move shift amount to a template parameter. > (__aarch64_recursion): Update call sites of __aarch64_lsl_128 > and __aarch64_lsr_128. > --- > .../config/cpu/aarch64/opt/ext/opt_random.h | 28 +++++++++++-------- > 1 file changed, 16 insertions(+), 12 deletions(-) > > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > index 7f756d1572f..7eb816abcd0 100644 > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > @@ -35,13 +35,13 @@ > #ifdef __ARM_NEON > > #ifdef __ARM_BIG_ENDIAN > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \ > - {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ > - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C}) > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \ > + 16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ > + 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C) > #else > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \ > - {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ > - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15}) > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \ > + _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ > + _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15) > #endif > > #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > namespace { > // Logical Shift right 128-bits by c * 8 bits > > - __extension__ extern __inline __Uint32x4_t > + __extension__ > + template<int __c> > + extern __inline __Uint32x4_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > - __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c) > + __aarch64_lsr_128 (__Uint8x16_t __a) > { > const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, > 0, 0, 0, 0, 0, 0, 0, 0}; > @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > > // Logical Shift left 128-bits by c * 8 bits > > - __extension__ extern __inline __Uint32x4_t > + __extension__ > + template<int __c> > + extern __inline __Uint32x4_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > - __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c) > + __aarch64_lsl_128 (__Uint8x16_t __a) > { > const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, > 0, 0, 0, 0, 0, 0, 0, 0}; > @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > __Uint32x4_t __e) > { > __Uint32x4_t __y = (__b >> __sr1); > - __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2); > + __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c); > > __Uint32x4_t __v = __d << __sl1; > > __z = __z ^ __a; > __z = __z ^ __v; > > - __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2); > + __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a); > > __y = __y & __e; > __z = __z ^ __x; > -- > 2.44.0 >
On Mon, 14 Oct 2024 at 14:36, Kyrylo Tkachov <ktkachov@nvidia.com> wrote: > > > > > On 14 Oct 2024, at 15:28, Ricardo Jesus <rjj@nvidia.com> wrote: > > > > External email: Use caution opening links or attachments > > > > > > This patch modifies the implementation of the vectorized mersenne > > twister random number generator to use __builtin_shufflevector instead > > of __builtin_shuffle. This makes it (almost) compatible with Clang. > > > > To make the implementation fully compatible with Clang, Clang will need > > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which > > currently it does not. This looks like an oversight in Clang and so will > > be addressed separately. > > > > I see no codegen change with this patch. > > FWIW the change looks ok to me from an aarch64 perspective. > The only potential problem could have been a compile error so as long as the CI buildbots are happy I think this would be safe to take. OK thanks for looking. I'll wait to see that CI is green (thanks for resubmitting it, Ricardo!) and then push it. > Thanks, > Kyrill > > > > > Bootstrapped and tested on aarch64-none-linux-gnu. > > > > Signed-off-by: Ricardo Jesus <rjj@nvidia.com> > > > > 2024-09-05 Ricardo Jesus <rjj@nvidia.com> > > > > * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses > > of __builtin_shuffle with __builtin_shufflevector. > > (__aarch64_lsl_128): Move shift amount to a template parameter. > > (__aarch64_lsr_128): Move shift amount to a template parameter. > > (__aarch64_recursion): Update call sites of __aarch64_lsl_128 > > and __aarch64_lsr_128. > > --- > > .../config/cpu/aarch64/opt/ext/opt_random.h | 28 +++++++++++-------- > > 1 file changed, 16 insertions(+), 12 deletions(-) > > > > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > > index 7f756d1572f..7eb816abcd0 100644 > > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > > @@ -35,13 +35,13 @@ > > #ifdef __ARM_NEON > > > > #ifdef __ARM_BIG_ENDIAN > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \ > > - {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ > > - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C}) > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \ > > + 16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ > > + 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C) > > #else > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \ > > - {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ > > - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15}) > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \ > > + _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ > > + _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15) > > #endif > > > > #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > > @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > > namespace { > > // Logical Shift right 128-bits by c * 8 bits > > > > - __extension__ extern __inline __Uint32x4_t > > + __extension__ > > + template<int __c> > > + extern __inline __Uint32x4_t > > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > > - __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c) > > + __aarch64_lsr_128 (__Uint8x16_t __a) > > { > > const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, > > 0, 0, 0, 0, 0, 0, 0, 0}; > > @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > > > > // Logical Shift left 128-bits by c * 8 bits > > > > - __extension__ extern __inline __Uint32x4_t > > + __extension__ > > + template<int __c> > > + extern __inline __Uint32x4_t > > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > > - __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c) > > + __aarch64_lsl_128 (__Uint8x16_t __a) > > { > > const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, > > 0, 0, 0, 0, 0, 0, 0, 0}; > > @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > > __Uint32x4_t __e) > > { > > __Uint32x4_t __y = (__b >> __sr1); > > - __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2); > > + __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c); > > > > __Uint32x4_t __v = __d << __sl1; > > > > __z = __z ^ __a; > > __z = __z ^ __v; > > > > - __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2); > > + __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a); > > > > __y = __y & __e; > > __z = __z ^ __x; > > -- > > 2.44.0 > > >
diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h index 7f756d1572f..7eb816abcd0 100644 --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h @@ -35,13 +35,13 @@ #ifdef __ARM_NEON #ifdef __ARM_BIG_ENDIAN -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \ - {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C}) +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \ + 16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ + 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C) #else -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \ - {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15}) +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \ + _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ + _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15) #endif #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION namespace { // Logical Shift right 128-bits by c * 8 bits - __extension__ extern __inline __Uint32x4_t + __extension__ + template<int __c> + extern __inline __Uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) - __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c) + __aarch64_lsr_128 (__Uint8x16_t __a) { const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Logical Shift left 128-bits by c * 8 bits - __extension__ extern __inline __Uint32x4_t + __extension__ + template<int __c> + extern __inline __Uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) - __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c) + __aarch64_lsl_128 (__Uint8x16_t __a) { const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __Uint32x4_t __e) { __Uint32x4_t __y = (__b >> __sr1); - __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2); + __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c); __Uint32x4_t __v = __d << __sl1; __z = __z ^ __a; __z = __z ^ __v; - __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2); + __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a); __y = __y & __e; __z = __z ^ __x;
This patch modifies the implementation of the vectorized mersenne twister random number generator to use __builtin_shufflevector instead of __builtin_shuffle. This makes it (almost) compatible with Clang. To make the implementation fully compatible with Clang, Clang will need to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which currently it does not. This looks like an oversight in Clang and so will be addressed separately. I see no codegen change with this patch. Bootstrapped and tested on aarch64-none-linux-gnu. Signed-off-by: Ricardo Jesus <rjj@nvidia.com> 2024-09-05 Ricardo Jesus <rjj@nvidia.com> * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses of __builtin_shuffle with __builtin_shufflevector. (__aarch64_lsl_128): Move shift amount to a template parameter. (__aarch64_lsr_128): Move shift amount to a template parameter. (__aarch64_recursion): Update call sites of __aarch64_lsl_128 and __aarch64_lsr_128. --- .../config/cpu/aarch64/opt/ext/opt_random.h | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-)