Message ID | 20240620152811.2020226-1-jwakely@redhat.com |
---|---|
State | New |
Headers | show |
Series | libstdc++: Fix std::fill and std::fill_n optimizations [PR109150] | expand |
Pushed to trunk now. On Thu, 20 Jun 2024 at 16:28, Jonathan Wakely <jwakely@redhat.com> wrote: > > I think the new conditions are correct. They're certainly an improvment > on just checking __is_scalar without considering what we're assigning it > to. > > Tested x86_64-linux. > > -- >8 -- > > As noted in the PR, the optimization used for scalar types in std::fill > and std::fill_n is non-conforming, because it doesn't consider that > assigning a scalar type might have non-trivial side effects which are > affected by the optimization. > > By changing the condition under which the optimization is done we ensure > it's only performed when safe to do so, and we also enable it for > additional types, which was the original subject of the PR. > > Instead of two overloads using __enable_if<__is_scalar<T>::__value, R> > we can combine them into one and create a local variable which is either > a local copy of __value or another reference to it, depending on whether > the optimization is allowed. > > This removes a use of std::__is_scalar, which is a step towards fixing > PR 115497 by removing std::__is_pointer from <bits/cpp_type_traits.h> > > libstdc++-v3/ChangeLog: > > PR libstdc++/109150 > * include/bits/stl_algobase.h (__fill_a1): Combine the > !__is_scalar and __is_scalar overloads into one and rewrite the > condition used to decide whether to perform the load outside the > loop. > * testsuite/25_algorithms/fill/109150.cc: New test. > * testsuite/25_algorithms/fill_n/109150.cc: New test. > --- > libstdc++-v3/include/bits/stl_algobase.h | 75 ++++++++++++------- > .../testsuite/25_algorithms/fill/109150.cc | 62 +++++++++++++++ > .../testsuite/25_algorithms/fill_n/109150.cc | 62 +++++++++++++++ > 3 files changed, 171 insertions(+), 28 deletions(-) > create mode 100644 libstdc++-v3/testsuite/25_algorithms/fill/109150.cc > create mode 100644 libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc > > diff --git a/libstdc++-v3/include/bits/stl_algobase.h b/libstdc++-v3/include/bits/stl_algobase.h > index d831e0e9883..1a0f8c14073 100644 > --- a/libstdc++-v3/include/bits/stl_algobase.h > +++ b/libstdc++-v3/include/bits/stl_algobase.h > @@ -929,28 +929,39 @@ _GLIBCXX_END_NAMESPACE_CONTAINER > #define _GLIBCXX_MOVE_BACKWARD3(_Tp, _Up, _Vp) std::copy_backward(_Tp, _Up, _Vp) > #endif > > +#pragma GCC diagnostic push > +#pragma GCC diagnostic ignored "-Wc++17-extensions" > template<typename _ForwardIterator, typename _Tp> > _GLIBCXX20_CONSTEXPR > - inline typename > - __gnu_cxx::__enable_if<!__is_scalar<_Tp>::__value, void>::__type > + inline void > __fill_a1(_ForwardIterator __first, _ForwardIterator __last, > const _Tp& __value) > { > - for (; __first != __last; ++__first) > - *__first = __value; > - } > + // We can optimize this loop by moving the load from __value outside > + // the loop, but only if we know that making that copy is trivial, > + // and the assignment in the loop is also trivial (so that the identity > + // of the operand doesn't matter). > + const bool __load_outside_loop = > +#if __has_builtin(__is_trivially_constructible) \ > + && __has_builtin(__is_trivially_assignable) > + __is_trivially_constructible(_Tp, const _Tp&) > + && __is_trivially_assignable(__decltype(*__first), const _Tp&) > +#else > + __is_trivially_copyable(_Tp) > + && __is_same(_Tp, __typeof__(*__first)) > +#endif > + && sizeof(_Tp) <= sizeof(long long); > > - template<typename _ForwardIterator, typename _Tp> > - _GLIBCXX20_CONSTEXPR > - inline typename > - __gnu_cxx::__enable_if<__is_scalar<_Tp>::__value, void>::__type > - __fill_a1(_ForwardIterator __first, _ForwardIterator __last, > - const _Tp& __value) > - { > - const _Tp __tmp = __value; > + // When the condition is true, we use a copy of __value, > + // otherwise we just use another reference. > + typedef typename __gnu_cxx::__conditional_type<__load_outside_loop, > + const _Tp, > + const _Tp&>::__type _Up; > + _Up __val(__value); > for (; __first != __last; ++__first) > - *__first = __tmp; > + *__first = __val; > } > +#pragma GCC diagnostic pop > > // Specialization: for char types we can use memset. > template<typename _Tp> > @@ -1079,28 +1090,36 @@ _GLIBCXX_END_NAMESPACE_CONTAINER > __size_to_integer(__float128 __n) { return (long long)__n; } > #endif > > +#pragma GCC diagnostic push > +#pragma GCC diagnostic ignored "-Wc++17-extensions" > template<typename _OutputIterator, typename _Size, typename _Tp> > _GLIBCXX20_CONSTEXPR > - inline typename > - __gnu_cxx::__enable_if<!__is_scalar<_Tp>::__value, _OutputIterator>::__type > + inline _OutputIterator > __fill_n_a1(_OutputIterator __first, _Size __n, const _Tp& __value) > { > - for (; __n > 0; --__n, (void) ++__first) > - *__first = __value; > - return __first; > - } > + // See std::__fill_a1 for explanation of this condition. > + const bool __load_outside_loop = > +#if __has_builtin(__is_trivially_constructible) \ > + && __has_builtin(__is_trivially_assignable) > + __is_trivially_constructible(_Tp, const _Tp&) > + && __is_trivially_assignable(__decltype(*__first), const _Tp&) > +#else > + __is_trivially_copyable(_Tp) > + && __is_same(_Tp, __typeof__(*__first)) > +#endif > + && sizeof(_Tp) <= sizeof(long long); > > - template<typename _OutputIterator, typename _Size, typename _Tp> > - _GLIBCXX20_CONSTEXPR > - inline typename > - __gnu_cxx::__enable_if<__is_scalar<_Tp>::__value, _OutputIterator>::__type > - __fill_n_a1(_OutputIterator __first, _Size __n, const _Tp& __value) > - { > - const _Tp __tmp = __value; > + // When the condition is true, we use a copy of __value, > + // otherwise we just use another reference. > + typedef typename __gnu_cxx::__conditional_type<__load_outside_loop, > + const _Tp, > + const _Tp&>::__type _Up; > + _Up __val(__value); > for (; __n > 0; --__n, (void) ++__first) > - *__first = __tmp; > + *__first = __val; > return __first; > } > +#pragma GCC diagnostic pop > > template<typename _Ite, typename _Seq, typename _Cat, typename _Size, > typename _Tp> > diff --git a/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc b/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc > new file mode 100644 > index 00000000000..9491a998ff0 > --- /dev/null > +++ b/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc > @@ -0,0 +1,62 @@ > +// { dg-do run } > + > +// Test the problematic cases identified in PR libstdc++/109150 > +// where the previous std::fill was non-conforming. > + > +#include <algorithm> > +#include <testsuite_hooks.h> > + > +const int global = 0; > + > +struct X { > + void operator=(const int& i) { VERIFY(&i == &global); } > +}; > + > +void > +test_identity_matters() > +{ > + X x; > + // Assigning int to X has non-trivial side effects, so we cannot > + // hoist the load outside the loop, we have to do exactly what the > + // standard says to do. > + std::fill(&x, &x+1, global); > +} > + > +struct Y { > + int i; > + void operator=(int ii) { i = ii + 1; } > +}; > + > +void > +test_self_aliasing() > +{ > + Y y[2] = { }; > + // Assigning int to X has non-trivial side effects, altering the value > + // used to fill the later elements. Must not load it outside the loop. > + std::fill(y, y+2, y[0].i); > + VERIFY(y[1].i == 2); > +} > + > +struct Z > +{ > + Z() { } > +#if __cplusplus >= 201103L > + explicit Z(const Z&) = default; > +#endif > +}; > + > +void > +test_explicit_copy_ctor() > +{ > + Z z; > + // The optimization that copies the fill value must use direct-initialization > + // otherwise this case would be ill-formed due to the explicit constructor. > + std::fill(&z, &z, z); > +} > + > +int main() > +{ > + test_identity_matters(); > + test_self_aliasing(); > + test_explicit_copy_ctor(); > +} > diff --git a/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc b/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc > new file mode 100644 > index 00000000000..13bb31c9344 > --- /dev/null > +++ b/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc > @@ -0,0 +1,62 @@ > +// { dg-do run } > + > +// Test the problematic cases identified in PR libstdc++/109150 > +// where the previous std::fill_n was non-conforming. > + > +#include <algorithm> > +#include <testsuite_hooks.h> > + > +const int global = 0; > + > +struct X { > + void operator=(const int& i) { VERIFY(&i == &global); } > +}; > + > +void > +test_identity_matters() > +{ > + X x; > + // Assigning int to X has non-trivial side effects, so we cannot > + // hoist the load outside the loop, we have to do exactly what the > + // standard says to do. > + std::fill_n(&x, 1, global); > +} > + > +struct Y { > + int i; > + void operator=(int ii) { i = ii + 1; } > +}; > + > +void > +test_self_aliasing() > +{ > + Y y[2] = { }; > + // Assigning int to X has non-trivial side effects, altering the value > + // used to fill the later elements. Must not load it outside the loop. > + std::fill_n(y, 2, y[0].i); > + VERIFY(y[1].i == 2); > +} > + > +struct Z > +{ > + Z() { } > +#if __cplusplus >= 201103L > + explicit Z(const Z&) = default; > +#endif > +}; > + > +void > +test_explicit_copy_ctor() > +{ > + Z z; > + // The optimization that copies the fill value must use direct-initialization > + // otherwise this case would be ill-formed due to the explicit constructor. > + std::fill_n(&z, 1, z); > +} > + > +int main() > +{ > + test_identity_matters(); > + test_self_aliasing(); > + test_explicit_copy_ctor(); > +} > -- > 2.45.2 >
diff --git a/libstdc++-v3/include/bits/stl_algobase.h b/libstdc++-v3/include/bits/stl_algobase.h index d831e0e9883..1a0f8c14073 100644 --- a/libstdc++-v3/include/bits/stl_algobase.h +++ b/libstdc++-v3/include/bits/stl_algobase.h @@ -929,28 +929,39 @@ _GLIBCXX_END_NAMESPACE_CONTAINER #define _GLIBCXX_MOVE_BACKWARD3(_Tp, _Up, _Vp) std::copy_backward(_Tp, _Up, _Vp) #endif +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wc++17-extensions" template<typename _ForwardIterator, typename _Tp> _GLIBCXX20_CONSTEXPR - inline typename - __gnu_cxx::__enable_if<!__is_scalar<_Tp>::__value, void>::__type + inline void __fill_a1(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { - for (; __first != __last; ++__first) - *__first = __value; - } + // We can optimize this loop by moving the load from __value outside + // the loop, but only if we know that making that copy is trivial, + // and the assignment in the loop is also trivial (so that the identity + // of the operand doesn't matter). + const bool __load_outside_loop = +#if __has_builtin(__is_trivially_constructible) \ + && __has_builtin(__is_trivially_assignable) + __is_trivially_constructible(_Tp, const _Tp&) + && __is_trivially_assignable(__decltype(*__first), const _Tp&) +#else + __is_trivially_copyable(_Tp) + && __is_same(_Tp, __typeof__(*__first)) +#endif + && sizeof(_Tp) <= sizeof(long long); - template<typename _ForwardIterator, typename _Tp> - _GLIBCXX20_CONSTEXPR - inline typename - __gnu_cxx::__enable_if<__is_scalar<_Tp>::__value, void>::__type - __fill_a1(_ForwardIterator __first, _ForwardIterator __last, - const _Tp& __value) - { - const _Tp __tmp = __value; + // When the condition is true, we use a copy of __value, + // otherwise we just use another reference. + typedef typename __gnu_cxx::__conditional_type<__load_outside_loop, + const _Tp, + const _Tp&>::__type _Up; + _Up __val(__value); for (; __first != __last; ++__first) - *__first = __tmp; + *__first = __val; } +#pragma GCC diagnostic pop // Specialization: for char types we can use memset. template<typename _Tp> @@ -1079,28 +1090,36 @@ _GLIBCXX_END_NAMESPACE_CONTAINER __size_to_integer(__float128 __n) { return (long long)__n; } #endif +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wc++17-extensions" template<typename _OutputIterator, typename _Size, typename _Tp> _GLIBCXX20_CONSTEXPR - inline typename - __gnu_cxx::__enable_if<!__is_scalar<_Tp>::__value, _OutputIterator>::__type + inline _OutputIterator __fill_n_a1(_OutputIterator __first, _Size __n, const _Tp& __value) { - for (; __n > 0; --__n, (void) ++__first) - *__first = __value; - return __first; - } + // See std::__fill_a1 for explanation of this condition. + const bool __load_outside_loop = +#if __has_builtin(__is_trivially_constructible) \ + && __has_builtin(__is_trivially_assignable) + __is_trivially_constructible(_Tp, const _Tp&) + && __is_trivially_assignable(__decltype(*__first), const _Tp&) +#else + __is_trivially_copyable(_Tp) + && __is_same(_Tp, __typeof__(*__first)) +#endif + && sizeof(_Tp) <= sizeof(long long); - template<typename _OutputIterator, typename _Size, typename _Tp> - _GLIBCXX20_CONSTEXPR - inline typename - __gnu_cxx::__enable_if<__is_scalar<_Tp>::__value, _OutputIterator>::__type - __fill_n_a1(_OutputIterator __first, _Size __n, const _Tp& __value) - { - const _Tp __tmp = __value; + // When the condition is true, we use a copy of __value, + // otherwise we just use another reference. + typedef typename __gnu_cxx::__conditional_type<__load_outside_loop, + const _Tp, + const _Tp&>::__type _Up; + _Up __val(__value); for (; __n > 0; --__n, (void) ++__first) - *__first = __tmp; + *__first = __val; return __first; } +#pragma GCC diagnostic pop template<typename _Ite, typename _Seq, typename _Cat, typename _Size, typename _Tp> diff --git a/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc b/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc new file mode 100644 index 00000000000..9491a998ff0 --- /dev/null +++ b/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc @@ -0,0 +1,62 @@ +// { dg-do run } + +// Test the problematic cases identified in PR libstdc++/109150 +// where the previous std::fill was non-conforming. + +#include <algorithm> +#include <testsuite_hooks.h> + +const int global = 0; + +struct X { + void operator=(const int& i) { VERIFY(&i == &global); } +}; + +void +test_identity_matters() +{ + X x; + // Assigning int to X has non-trivial side effects, so we cannot + // hoist the load outside the loop, we have to do exactly what the + // standard says to do. + std::fill(&x, &x+1, global); +} + +struct Y { + int i; + void operator=(int ii) { i = ii + 1; } +}; + +void +test_self_aliasing() +{ + Y y[2] = { }; + // Assigning int to X has non-trivial side effects, altering the value + // used to fill the later elements. Must not load it outside the loop. + std::fill(y, y+2, y[0].i); + VERIFY(y[1].i == 2); +} + +struct Z +{ + Z() { } +#if __cplusplus >= 201103L + explicit Z(const Z&) = default; +#endif +}; + +void +test_explicit_copy_ctor() +{ + Z z; + // The optimization that copies the fill value must use direct-initialization + // otherwise this case would be ill-formed due to the explicit constructor. + std::fill(&z, &z, z); +} + +int main() +{ + test_identity_matters(); + test_self_aliasing(); + test_explicit_copy_ctor(); +} diff --git a/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc b/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc new file mode 100644 index 00000000000..13bb31c9344 --- /dev/null +++ b/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc @@ -0,0 +1,62 @@ +// { dg-do run } + +// Test the problematic cases identified in PR libstdc++/109150 +// where the previous std::fill_n was non-conforming. + +#include <algorithm> +#include <testsuite_hooks.h> + +const int global = 0; + +struct X { + void operator=(const int& i) { VERIFY(&i == &global); } +}; + +void +test_identity_matters() +{ + X x; + // Assigning int to X has non-trivial side effects, so we cannot + // hoist the load outside the loop, we have to do exactly what the + // standard says to do. + std::fill_n(&x, 1, global); +} + +struct Y { + int i; + void operator=(int ii) { i = ii + 1; } +}; + +void +test_self_aliasing() +{ + Y y[2] = { }; + // Assigning int to X has non-trivial side effects, altering the value + // used to fill the later elements. Must not load it outside the loop. + std::fill_n(y, 2, y[0].i); + VERIFY(y[1].i == 2); +} + +struct Z +{ + Z() { } +#if __cplusplus >= 201103L + explicit Z(const Z&) = default; +#endif +}; + +void +test_explicit_copy_ctor() +{ + Z z; + // The optimization that copies the fill value must use direct-initialization + // otherwise this case would be ill-formed due to the explicit constructor. + std::fill_n(&z, 1, z); +} + +int main() +{ + test_identity_matters(); + test_self_aliasing(); + test_explicit_copy_ctor(); +}