Message ID | 20231214075402.464671-1-hongyu.wang@intel.com |
---|---|
State | New |
Headers | show |
Series | i386: Sync move_max/store_max with prefer-vector-width [PR112824] | expand |
On Thu, Dec 14, 2023 at 3:54 PM Hongyu Wang <hongyu.wang@intel.com> wrote: > > Hi, > > Currently move_max follows the tuning feature first, but ideally it > should sync with prefer-vector-width when it is explicitly set to keep > vector move and operation with same vector size. > > Bootstrapped/regtested on x86-64-pc-linux-gnu{-m32,} > > OK for trunk? > > gcc/ChangeLog: > > PR target/112824 > * config/i386/i386-options.cc (ix86_option_override_internal): > Sync ix86_move_max/ix86_store_max with prefer_vector_width when > it is explicitly set. > > gcc/testsuite/ChangeLog: > > PR target/112824 > * gcc.target/i386/pieces-memset-45.c: Remove > -mprefer-vector-width=256. > * g++.target/i386/pr112824-1.C: New test. > --- > gcc/config/i386/i386-options.cc | 8 +- > gcc/testsuite/g++.target/i386/pr112824-1.C | 113 ++++++++++++++++++ > .../gcc.target/i386/pieces-memset-45.c | 2 +- > 3 files changed, 120 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/g++.target/i386/pr112824-1.C > > diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc > index 588a0878c0d..440ef59ffff 100644 > --- a/gcc/config/i386/i386-options.cc > +++ b/gcc/config/i386/i386-options.cc > @@ -3012,7 +3012,9 @@ ix86_option_override_internal (bool main_args_p, > { > /* Set the maximum number of bits can be moved from memory to > memory efficiently. */ > - if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES]) > + if (opts_set->x_prefer_vector_width_type != PVW_NONE) > + opts->x_ix86_move_max = opts->x_prefer_vector_width_type; > + else if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES]) > opts->x_ix86_move_max = PVW_AVX512; > else if (ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES]) > opts->x_ix86_move_max = PVW_AVX256; > @@ -3034,7 +3036,9 @@ ix86_option_override_internal (bool main_args_p, > { > /* Set the maximum number of bits can be stored to memory > efficiently. */ > - if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES]) > + if (opts_set->x_prefer_vector_width_type != PVW_NONE) > + opts->x_ix86_store_max = opts->x_prefer_vector_width_type; > + else if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES]) > opts->x_ix86_store_max = PVW_AVX512; > else if (ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES]) > opts->x_ix86_store_max = PVW_AVX256; > diff --git a/gcc/testsuite/g++.target/i386/pr112824-1.C b/gcc/testsuite/g++.target/i386/pr112824-1.C > new file mode 100644 > index 00000000000..fccaf23c530 > --- /dev/null > +++ b/gcc/testsuite/g++.target/i386/pr112824-1.C > @@ -0,0 +1,113 @@ > +/* PR target/112824 */ > +/* { dg-do compile } */ > +/* { dg-options "-std=c++23 -O3 -march=skylake-avx512 -mprefer-vector-width=512" } */ > +/* { dg-final { scan-assembler-not "vmov(?:dqu|apd)\[ \\t\]+\[^\n\]*%ymm" } } */ > + > + remove empty line. > +#include <bit> > +#include <concepts> > +#include <cstddef> > +#include <cstdint> > + > +template <ptrdiff_t W, typename T> > +using Vec [[gnu::vector_size(W * sizeof(T))]] = T; > + > +// Omitted: 16 without AVX, 32 without AVX512F, > +// or for forward compatibility some AVX10 may also mean 32-only > +static constexpr ptrdiff_t VectorBytes = 64; > +template<typename T> > +static constexpr ptrdiff_t VecWidth = 64 <= sizeof(T) ? 1 : 64/sizeof(T); > + > +template <typename T, ptrdiff_t N> struct Vector{ > + static constexpr ptrdiff_t L = N; > + T data[L]; > + static constexpr auto size()->ptrdiff_t{return N;} > +}; > +template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{ > + static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> : ptrdiff_t(std::bit_ceil(size_t(N))); > + static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0); > + using V = Vec<W,T>; > + V data[L]; > + static constexpr auto size()->ptrdiff_t{return N;} > +}; > +/// should be trivially copyable > +/// codegen is worse when passing by value, even though it seems like it should make > +/// aliasing simpler to analyze? > +template<typename T, ptrdiff_t N> > +[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> { > + Vector<T,N> z; > + for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] + y.data[n]; > + return z; > +} > +template<typename T, ptrdiff_t N> > +[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> { > + Vector<T,N> z; > + for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] * y.data[n]; > + return z; > +} > +template<typename T, ptrdiff_t N> > +[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) -> Vector<T,N> { > + Vector<T,N> z; > + for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n]; > + return z; > +} > +template<typename T, ptrdiff_t N> > +[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) -> Vector<T,N> { > + Vector<T,N> z; > + for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n]; > + return z; > +} > + > + > + Ditto. > +template <typename T, ptrdiff_t N> struct Dual { > + T value; > + Vector<T, N> partials; > +}; > +// Here we have a specialization for non-power-of-2 `N` > +template <typename T, ptrdiff_t N> > +requires(std::floating_point<T> && (std::popcount(size_t(N))>1)) > +struct Dual<T,N> { > + Vector<T, N+1> data; > +}; > + > +template<ptrdiff_t W, typename T> > +consteval auto firstoff(){ > + static_assert(std::same_as<T,double>, "type not implemented"); > + if constexpr (W==2) return Vec<2,int64_t>{0,1} != 0; > + else if constexpr (W == 4) return Vec<4,int64_t>{0,1,2,3} != 0; > + else if constexpr (W == 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != 0; > + else static_assert(false, "vector width not implemented"); > +} > + > +template <typename T, ptrdiff_t N> > +[[gnu::always_inline]] constexpr auto operator+(Dual<T, N> a, > + Dual<T, N> b) > + -> Dual<T, N> { > + if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){ > + Dual<T,N> c; > + for (ptrdiff_t l = 0; l < Vector<T,N>::L; ++l) > + c.data.data[l] = a.data.data[l] + b.data.data[l]; > + return c; > + } else return {a.value + b.value, a.partials + b.partials}; > +} > + > +template <typename T, ptrdiff_t N> > +[[gnu::always_inline]] constexpr auto operator*(Dual<T, N> a, > + Dual<T, N> b) > + -> Dual<T, N> { > + if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){ > + using V = typename Vector<T,N>::V; > + V va = V{}+a.data.data[0][0], vb = V{}+b.data.data[0][0]; > + V x = va * b.data.data[0]; > + Dual<T,N> c; > + c.data.data[0] = firstoff<Vector<T,N>::W,T>() ? x + vb*a.data.data[0] : x; > + for (ptrdiff_t l = 1; l < Vector<T,N>::L; ++l) > + c.data.data[l] = va*b.data.data[l] + vb*a.data.data[l]; > + return c; > + } else return {a.value * b.value, a.value * b.partials + b.value * a.partials}; > +} > + > +void prod(Dual<Dual<double,7>,2> &c, const Dual<Dual<double,7>,2> &a, const Dual<Dual<double,7>,2>&b){ > + c = a*b; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-45.c b/gcc/testsuite/gcc.target/i386/pieces-memset-45.c > index 70c80e5064b..e8ce7c23256 100644 > --- a/gcc/testsuite/gcc.target/i386/pieces-memset-45.c > +++ b/gcc/testsuite/gcc.target/i386/pieces-memset-45.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -march=x86-64 -mprefer-vector-width=256 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */ > +/* { dg-options "-O2 -march=x86-64 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */ > > extern char *dst; > > -- > 2.31.1 > Others LGTM.
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 588a0878c0d..440ef59ffff 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -3012,7 +3012,9 @@ ix86_option_override_internal (bool main_args_p, { /* Set the maximum number of bits can be moved from memory to memory efficiently. */ - if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES]) + if (opts_set->x_prefer_vector_width_type != PVW_NONE) + opts->x_ix86_move_max = opts->x_prefer_vector_width_type; + else if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES]) opts->x_ix86_move_max = PVW_AVX512; else if (ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES]) opts->x_ix86_move_max = PVW_AVX256; @@ -3034,7 +3036,9 @@ ix86_option_override_internal (bool main_args_p, { /* Set the maximum number of bits can be stored to memory efficiently. */ - if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES]) + if (opts_set->x_prefer_vector_width_type != PVW_NONE) + opts->x_ix86_store_max = opts->x_prefer_vector_width_type; + else if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES]) opts->x_ix86_store_max = PVW_AVX512; else if (ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES]) opts->x_ix86_store_max = PVW_AVX256; diff --git a/gcc/testsuite/g++.target/i386/pr112824-1.C b/gcc/testsuite/g++.target/i386/pr112824-1.C new file mode 100644 index 00000000000..fccaf23c530 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr112824-1.C @@ -0,0 +1,113 @@ +/* PR target/112824 */ +/* { dg-do compile } */ +/* { dg-options "-std=c++23 -O3 -march=skylake-avx512 -mprefer-vector-width=512" } */ +/* { dg-final { scan-assembler-not "vmov(?:dqu|apd)\[ \\t\]+\[^\n\]*%ymm" } } */ + + +#include <bit> +#include <concepts> +#include <cstddef> +#include <cstdint> + +template <ptrdiff_t W, typename T> +using Vec [[gnu::vector_size(W * sizeof(T))]] = T; + +// Omitted: 16 without AVX, 32 without AVX512F, +// or for forward compatibility some AVX10 may also mean 32-only +static constexpr ptrdiff_t VectorBytes = 64; +template<typename T> +static constexpr ptrdiff_t VecWidth = 64 <= sizeof(T) ? 1 : 64/sizeof(T); + +template <typename T, ptrdiff_t N> struct Vector{ + static constexpr ptrdiff_t L = N; + T data[L]; + static constexpr auto size()->ptrdiff_t{return N;} +}; +template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{ + static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> : ptrdiff_t(std::bit_ceil(size_t(N))); + static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0); + using V = Vec<W,T>; + V data[L]; + static constexpr auto size()->ptrdiff_t{return N;} +}; +/// should be trivially copyable +/// codegen is worse when passing by value, even though it seems like it should make +/// aliasing simpler to analyze? +template<typename T, ptrdiff_t N> +[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> { + Vector<T,N> z; + for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] + y.data[n]; + return z; +} +template<typename T, ptrdiff_t N> +[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> { + Vector<T,N> z; + for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] * y.data[n]; + return z; +} +template<typename T, ptrdiff_t N> +[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) -> Vector<T,N> { + Vector<T,N> z; + for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n]; + return z; +} +template<typename T, ptrdiff_t N> +[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) -> Vector<T,N> { + Vector<T,N> z; + for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n]; + return z; +} + + + +template <typename T, ptrdiff_t N> struct Dual { + T value; + Vector<T, N> partials; +}; +// Here we have a specialization for non-power-of-2 `N` +template <typename T, ptrdiff_t N> +requires(std::floating_point<T> && (std::popcount(size_t(N))>1)) +struct Dual<T,N> { + Vector<T, N+1> data; +}; + +template<ptrdiff_t W, typename T> +consteval auto firstoff(){ + static_assert(std::same_as<T,double>, "type not implemented"); + if constexpr (W==2) return Vec<2,int64_t>{0,1} != 0; + else if constexpr (W == 4) return Vec<4,int64_t>{0,1,2,3} != 0; + else if constexpr (W == 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != 0; + else static_assert(false, "vector width not implemented"); +} + +template <typename T, ptrdiff_t N> +[[gnu::always_inline]] constexpr auto operator+(Dual<T, N> a, + Dual<T, N> b) + -> Dual<T, N> { + if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){ + Dual<T,N> c; + for (ptrdiff_t l = 0; l < Vector<T,N>::L; ++l) + c.data.data[l] = a.data.data[l] + b.data.data[l]; + return c; + } else return {a.value + b.value, a.partials + b.partials}; +} + +template <typename T, ptrdiff_t N> +[[gnu::always_inline]] constexpr auto operator*(Dual<T, N> a, + Dual<T, N> b) + -> Dual<T, N> { + if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){ + using V = typename Vector<T,N>::V; + V va = V{}+a.data.data[0][0], vb = V{}+b.data.data[0][0]; + V x = va * b.data.data[0]; + Dual<T,N> c; + c.data.data[0] = firstoff<Vector<T,N>::W,T>() ? x + vb*a.data.data[0] : x; + for (ptrdiff_t l = 1; l < Vector<T,N>::L; ++l) + c.data.data[l] = va*b.data.data[l] + vb*a.data.data[l]; + return c; + } else return {a.value * b.value, a.value * b.partials + b.value * a.partials}; +} + +void prod(Dual<Dual<double,7>,2> &c, const Dual<Dual<double,7>,2> &a, const Dual<Dual<double,7>,2>&b){ + c = a*b; +} diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-45.c b/gcc/testsuite/gcc.target/i386/pieces-memset-45.c index 70c80e5064b..e8ce7c23256 100644 --- a/gcc/testsuite/gcc.target/i386/pieces-memset-45.c +++ b/gcc/testsuite/gcc.target/i386/pieces-memset-45.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -march=x86-64 -mprefer-vector-width=256 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */ +/* { dg-options "-O2 -march=x86-64 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */ extern char *dst;