Message ID | 20240523063742.2333446-2-lin1.hu@intel.com
State      | New
Series     | Optimize __builtin_convertvector for x86-64-v4 and
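(Background for readers new to the builtin under discussion: __builtin_convertvector performs an element-wise conversion between two vector types with the same number of elements. A minimal standalone example; the typedef names are local to the snippet:)

/* Element-wise vector conversion with __builtin_convertvector.
   Both vector types must have the same number of elements.  */
typedef double v2df __attribute__ ((__vector_size__ (16)));
typedef int    v2si __attribute__ ((__vector_size__ (8)));

v2si
to_int (v2df a)
{
  /* Converts each double lane to int, truncating toward zero
     (a FIX_TRUNC_EXPR at the GIMPLE level).  */
  return __builtin_convertvector (a, v2si);
}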
On Thu, 23 May 2024, Hu, Lin1 wrote:

> gcc/ChangeLog:
>
> 	PR target/107432
> 	* tree-vect-generic.cc
> 	(supportable_indirect_narrowing_operation): New function to
> 	support indirect narrowing conversions.
> 	(supportable_indirect_widening_operation): New function to
> 	support indirect widening conversions.
> 	(expand_vector_conversion): Support conversions for int -> int,
> 	float -> float and int <-> float.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/107432
> 	* gcc.target/i386/pr107432-1.c: New test.
> 	* gcc.target/i386/pr107432-2.c: Ditto.
> 	* gcc.target/i386/pr107432-3.c: Ditto.
> 	* gcc.target/i386/pr107432-4.c: Ditto.
> 	* gcc.target/i386/pr107432-5.c: Ditto.
> 	* gcc.target/i386/pr107432-6.c: Ditto.
> 	* gcc.target/i386/pr107432-7.c: Ditto.
> ---
>  gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
>  gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
>  gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++
>  gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
>  gcc/tree-vect-generic.cc                   | 157 +++++++++++++-
>  8 files changed, 968 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> new file mode 100644
> index 00000000000..a4f37447eb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> @@ -0,0 +1,234 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { !
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); > +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); > +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); > +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); > +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); > +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2si); > +} > + > +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); > +} > + > +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); > +} > + > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2hi); > +} > + > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4hi); > +} > + > +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); > +} > + > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2qi); > +} > + > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4qi); > +} > + > +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8di)a, __v8qi); > +} > + > +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2hi); > +} > + > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4hi); > +} > + > +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); > +} > + > +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); > +} > + > +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2qi); > +} > + > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4qi); > +} > + > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8si)a, __v8qi); > +} > + > +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); > +} > + > +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) > +{ > + return __builtin_convertvector((__v2hi)a, __v2qi); > +} > + > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hi)a, __v8qi); > +} > + > +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); > +} > + > +__m256i 
mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); > +} > + > +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2su); > +} > + > +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); > +} > + > +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); > +} > + > +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2hu); > +} > + > +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4hu); > +} > + > +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); > +} > + > +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2qu); > +} > + > +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4qu); > +} > + > +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8du)a, __v8qu); > +} > + > +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2hu); > +} > + > +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4hu); > +} > + > +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); > +} > + > +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); > +} > + > +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2qu); > +} > + > +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4qu); > +} > + > +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8su)a, __v8qu); > +} > + > +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); > +} > + > +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) > +{ > + return __builtin_convertvector((__v2hu)a, __v2qu); > +} > + > +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hu)a, __v8qu); > +} > + > +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); > +} > + > +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c > new file mode 100644 > index 00000000000..02ffd811cb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c > @@ -0,0 +1,105 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } 
*/ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v4si); > +} > + > +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); > +} > + > +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); > +} > + > +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v4si); > +} > + > +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); > +} > + > +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); > +} > + > +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v8hi); > +} > + > +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v16hi); > +} > + > +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector(a, __v32hi); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c > new file mode 100644 > index 00000000000..30dc947b6dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c > @@ -0,0 +1,55 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, 
__v2sf); > +} > + > +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4sf); > +} > + > +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8sf); > +} > + > +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2hf); > +} > + > +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4hf); > +} > + > +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8hf); > +} > + > +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4hf); > +} > + > +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8hf); > +} > + > +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector(a, __v16hf); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c > new file mode 100644 > index 00000000000..e537e7349e4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c > @@ -0,0 +1,56 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector(a, __v2df); > +} > + > +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4df); > +} > + > +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8df); > +} > + > +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector(a, __v2df); > +} > + > +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4df); > +} > + > +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8df); > +} > + > +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4sf); > +} > + > +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8sf); > +} > + > +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16sf); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c > new file mode 100644 > index 00000000000..5a44ef9f3b9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c > @@ -0,0 +1,72 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2si); > +} > + > +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4si); > +} > + > +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8si); > +} > + > +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4di); > +} > + > +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8di); > +} > + > +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4si); > +} > + > +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8si); > +} > + > +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16si); > +} > + > +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4di); > +} > + > +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8di); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c > new file mode 100644 > index 00000000000..4a68a10b089 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c > @@ -0,0 +1,139 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > +typedef char __v16qi __attribute__ ((__vector_size__ (16))); > +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); > +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); > +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); > +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); > + > +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector((__v2df)a, __v2qi); > +} > + > +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector((__v4df)a, __v4qi); > +} > + > +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector((__v8df)a, __v8qi); > +} > + > +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector((__v2df)a, __v2qu); > +} > + > +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector((__v4df)a, __v4qu); > +} > + > +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector((__v8df)a, __v8qu); > +} > + > +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector((__v2sf)a, __v2qi); > +} > + > +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector((__v4sf)a, __v4qi); > +} > + > +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector((__v8sf)a, __v8qi); > +} > + > +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector((__v16sf)a, __v16qi); > +} > + > +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector((__v2sf)a, __v2qu); > +} > + > +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector((__v4sf)a, __v4qu); > +} > + > +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector((__v8sf)a, __v8qu); > +} > + > +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector((__v16sf)a, __v16qu); > +} > + > +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector((__v2hf)a, __v2qi); > +} > + > +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector((__v8hf)a, __v8qi); > +} > + > +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector((__v16hf)a, __v16qi); > +} > + > +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) > +{ > + return __builtin_convertvector((__v32hf)a, __v32qi); > +} > + > +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector((__v2hf)a, __v2qu); > +} > + > +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector((__v8hf)a, __v8qu); > +} > + > +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector((__v16hf)a, __v16qu); > +} > + > +__v32qu 
mm512_cvtph_epu8_builtin_convertvector(__v32hf a) > +{ > + return __builtin_convertvector((__v32hf)a, __v32qu); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c > new file mode 100644 > index 00000000000..0ff5a97ed1a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c > @@ -0,0 +1,156 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > +typedef char __v16qi __attribute__ ((__vector_size__ (16))); > +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); > +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); > +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); > +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); > + > +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2df); > +} > + > +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4df); > +} > + > +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8df); > +} > + > +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2df); > +} > + > +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4df); > +} > + > +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8df); > +} > + > +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2sf); > +} > + > +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4sf); > +} > + > +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8sf); > +} > + > +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) > +{ > + return __builtin_convertvector((__v16qi)a, __v16sf); > +} > + > +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2sf); > +} > + > +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4sf); > +} > + > +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8sf); > +} > + > +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16sf); > +} > + > +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2hf); > +} > + > +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4hf); > +} > + > +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8hf); > +} > + > +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) > +{ > + return __builtin_convertvector((__v16qi)a, __v16hf); > +} > + > +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector((__v32qi)a, __v32hf); > +} > + > +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2hf); > +} > + > +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4hf); > +} > + > +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) > +{ > + return 
__builtin_convertvector((__v8qu)a, __v8hf);
> +}
> +
> +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
> +{
> +  return __builtin_convertvector((__v16qu)a, __v16hf);
> +}
> +
> +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
> +{
> +  return __builtin_convertvector((__v32qu)a, __v32hf);
> +}
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index ab640096ca2..0bedb53d9f9 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "gimple-match.h"
>  #include "recog.h"		/* FIXME: for insn_data */
>  #include "optabs-libfuncs.h"
> +#include "cfgloop.h"
> +#include "tree-vectorizer.h"
>
>
>  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
>    return gimplify_build2 (gsi, code, outer_type, b, c);
>  }
>
> +/* A subroutine of expand_vector_conversion, support indirect conversion for
> +   float <-> int, like double -> char.  */
> +bool
> +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> +					  enum tree_code code,
> +					  tree lhs,
> +					  tree arg)
> +{
> +  gimple *g;
> +  tree ret_type = TREE_TYPE (lhs);
> +  tree arg_type = TREE_TYPE (arg);
> +  tree new_rhs;
> +
> +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> +  unsigned int arg_elt_bits = vector_element_bits (arg_type);
> +  if (code != FIX_TRUNC_EXPR || flag_trapping_math
> +      || ret_elt_bits >= arg_elt_bits)
> +    return false;
> +
> +  unsigned short target_size;
> +  scalar_mode tmp_cvt_mode;
> +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> +  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> +  tree cvt_type = NULL_TREE;
> +  tmp_cvt_mode = lhs_mode;
> +  target_size = GET_MODE_SIZE (rhs_mode);
> +
> +  opt_scalar_mode mode_iter;
> +  enum tree_code tc1, tc2;
> +  unsigned HOST_WIDE_INT nelts
> +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> +
> +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> +    {
> +      tmp_cvt_mode = mode_iter.require ();
> +
> +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> +	break;
> +
> +      scalar_mode cvt_mode;
> +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> +	break;
> +
> +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> +      cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> +
> +      cvt_type = build_vector_type (cvt_type, nelts);
> +      if (cvt_type == NULL_TREE
> +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> +					     ret_type,
> +					     cvt_type, &tc1)
> +	  || !supportable_convert_operation ((tree_code) code,
> +					     cvt_type,
> +					     arg_type, &tc2))
> +	continue;
> +
> +      new_rhs = make_ssa_name (cvt_type);
> +      g = vect_gimple_build (new_rhs, tc2, arg);
> +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +      g = gimple_build_assign (lhs, tc1, new_rhs);
> +      gsi_replace (gsi, g, false);
> +      return true;
> +    }
> +  return false;
> +}
> +
> +/* A subroutine of expand_vector_conversion, support indirect conversion for
> +   float <-> int, like char -> double.  */
> +bool
> +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> +					 enum tree_code code,
> +					 tree lhs,
> +					 tree arg)
> +{
> +  gimple *g;
> +  tree ret_type = TREE_TYPE (lhs);
> +  tree arg_type = TREE_TYPE (arg);
> +  tree new_rhs;
> +
> +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> +  unsigned int arg_elt_bits = vector_element_bits (arg_type);
> +  if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> +    return false;
> +
> +  unsigned short target_size;
> +  scalar_mode tmp_cvt_mode;
> +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> +  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> +  tree cvt_type = NULL_TREE;
> +  target_size = GET_MODE_SIZE (lhs_mode);
> +  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
> +  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> +    return false;
> +
> +  opt_scalar_mode mode_iter;
> +  enum tree_code tc1, tc2;
> +  unsigned HOST_WIDE_INT nelts
> +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> +
> +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> +    {
> +      tmp_cvt_mode = mode_iter.require ();
> +
> +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> +	break;
> +
> +      scalar_mode cvt_mode;
> +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> +	break;
> +
> +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> +      cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> +
> +      cvt_type = build_vector_type (cvt_type, nelts);
> +      if (cvt_type == NULL_TREE
> +	  || !supportable_convert_operation ((tree_code) code,
> +					     ret_type,
> +					     cvt_type, &tc1)
> +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> +					     cvt_type,
> +					     arg_type, &tc2))
> +	continue;
> +
> +      new_rhs = make_ssa_name (cvt_type);
> +      g = vect_gimple_build (new_rhs, tc2, arg);
> +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +      g = gimple_build_assign (lhs, tc1, new_rhs);
> +      gsi_replace (gsi, g, false);
> +      return true;
> +    }
> +  return false;
> +}
> +

So the above improves the situation where the target can handle the two-step
conversion.  It doesn't really allow this to work for too large vectors AFAICS
(nor does it try pack/unpack for any of the conversions).  It also still
duplicates code that's in the vectorizer.  I think you should be able to use
supportable_narrowing_operation and possibly even
supportable_widening_operation (though that needs refactoring to avoid the
vectorizer-internal stmt_vec_info type - possibly simply by gating the
respective code on a non-NULL vinfo).  Both support multi-step conversions.

>  /* Expand VEC_CONVERT ifn call.  */
>
>  static void
> @@ -1871,14 +2009,21 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
>    else if (ret_elt_bits > arg_elt_bits)
>      modifier = WIDEN;
>
> +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> +    {
> +      g = gimple_build_assign (lhs, code1, arg);
> +      gsi_replace (gsi, g, false);
> +      return;
> +    }
> +
> +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> +    return;
> +
> +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> +    return;
> +
>    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
>      {
> -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> -	{
> -	  g = gimple_build_assign (lhs, code1, arg);
> -	  gsi_replace (gsi, g, false);
> -	  return;
> -	}
>        /* Can't use get_compute_type here, as supportable_convert_operation
> 	 doesn't necessarily use an optab and needs two arguments.  */
>        tree vec_compute_type
>
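(At the source level, the two-step conversions these helpers synthesize amount to inserting one intermediate integer conversion. A standalone sketch of both directions follows; the typedef names are local to the sketch, and the instruction comments reflect the sequences the pr107432-6.c/-7.c scans expect on AVX512VL, not a guarantee of codegen:)

/* Illustrative sketch, not part of the patch.  Compile with e.g.
   gcc -O2 -mavx512vl -mavx512bw -fno-trapping-math -S.  */
typedef double v2df __attribute__ ((__vector_size__ (16)));
typedef int    v2si __attribute__ ((__vector_size__ (8)));
typedef char   v2qi __attribute__ ((__vector_size__ (2)));

/* double -> char as supportable_indirect_narrowing_operation lowers it:
   a FIX_TRUNC_EXPR to an intermediate integer vector, then a narrowing
   NOP_EXPR (a vcvttpd2dq + vpmovdb style sequence).  */
v2qi
narrow_two_step (v2df a)
{
  v2si t = __builtin_convertvector (a, v2si);
  return __builtin_convertvector (t, v2qi);
}

/* char -> double as supportable_indirect_widening_operation lowers it:
   a widening NOP_EXPR first, then a FLOAT_EXPR (vpmovsxbd + vcvtdq2pd
   style sequence).  */
v2df
widen_two_step (v2qi a)
{
  v2si t = __builtin_convertvector (a, v2si);
  return __builtin_convertvector (t, v2df);
}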
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Wednesday, May 29, 2024 5:41 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> ubizjak@gmail.com
> Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for int -> int,
> float -> float and int <-> float.
>
> So the above improves the situation where the target can handle the
> two-step conversion.  It doesn't really allow this to work for too large
> vectors AFAICS (nor does it try pack/unpack for any of the conversions).
> It also still duplicates code that's in the vectorizer.  I think you
> should be able to use supportable_narrowing_operation and possibly even
> supportable_widening_operation (though that needs refactoring to avoid
> the vectorizer-internal stmt_vec_info type - possibly simply by gating
> the respective code on a non-NULL vinfo).  Both support multi-step
> conversions.

I tried to use supportable_narrowing_operation and ran into two questions:

1) supportable_narrowing_operation supports v2df -> v16qi, but I don't know
which optab can help me convert v16qi to v2qi.

2) I tried a testcase (https://godbolt.org/z/z88xYW85e) and the result is
not what I expected: it only uses vec_pack_trunc.  I expect it to use
vcvttpd2dq + vpmovdw.

If I can solve the first question and the function can be improved (maybe
to support trunc<vectype_in><vectype_out>), I'd be happy to use it
directly.  For now I prefer my scheme; my functions are more like
supportable_convert_operation.  Perhaps we can modify
supportable_narrowing_operation, but I think that should be another patch,
since it will influence the vectorizer.

BRs,
Lin
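(The godbolt contents are not quoted in the thread, so the following is a purely hypothetical reconstruction of the kind of testcase question 2 concerns; the function name and loop shape are assumptions for illustration:)

/* Hypothetical testcase shape for question 2.  With
   -O3 -mavx512vl -mavx512bw -fno-trapping-math the question is whether
   the loop is vectorized with vec_pack_trunc steps or with
   vcvttpd2dq + vpmovdw.  */
void
cvt_loop (short *__restrict dst, const double *__restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = (short) src[i];   /* double -> short, element by element */
}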
On Fri, 31 May 2024, Hu, Lin1 wrote:

> I tried to use supportable_narrowing_operation and ran into two questions:
>
> 1) supportable_narrowing_operation supports v2df -> v16qi, but I don't
> know which optab can help me convert v16qi to v2qi.

Its API is a bit tricky, but for v2df -> v2qi (I expect you'll have an equal
number of lanes in/out for .CONVERT_VECTOR) it likely outputs a multi-step
conversion where you have to look into *INTERM_TYPES and second-guess the
operation code to use for the intermediate steps (IIRC the intermediate
steps all use either PACK/UNPACK or CONVERT, never FLOAT/FIX).
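(In source terms, the multi-step narrowing described above keeps the lane count fixed and only makes the first step a float-to-int conversion; a sketch with illustrative typedefs, not GCC internals:)

/* Lane-count-preserving multi-step narrowing: only the first step is a
   FIX_TRUNC conversion; the remaining steps are the integer CONVERT
   steps that *INTERM_TYPES would describe.  */
typedef double v2df __attribute__ ((__vector_size__ (16)));
typedef int    v2si __attribute__ ((__vector_size__ (8)));
typedef short  v2hi __attribute__ ((__vector_size__ (4)));
typedef char   v2qi __attribute__ ((__vector_size__ (2)));

v2qi
multi_step_narrow (v2df a)
{
  v2si t1 = __builtin_convertvector (a, v2si);  /* FIX_TRUNC step */
  v2hi t2 = __builtin_convertvector (t1, v2hi); /* integer CONVERT */
  return __builtin_convertvector (t2, v2qi);    /* integer CONVERT */
}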
> 2) I tried a testcase (https://godbolt.org/z/z88xYW85e) and the result is
> not what I expected: it only uses vec_pack_trunc.  I expect it to use
> vcvttpd2dq + vpmovdw.

With -O3 -fno-tree-loop-vectorize that's what you get.  What you see is
because of the restriction of the loop vectorizer to work on a single
vector size only.

> If I can solve the first question and the function can be improved (maybe
> to support trunc<vectype_in><vectype_out>), I'd be happy to use it
> directly.  For now I prefer my scheme; my functions are more like
> supportable_convert_operation.  Perhaps we can modify
> supportable_narrowing_operation, but I think that should be another patch,
> since it will influence the vectorizer.

But since you are doing a multi-step conversion this is really what
supportable_narrowing_operation is about.  I don't think we want to
re-invent the wheel here.  Likewise your approach won't get you to use
VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current single-step
.CONVERT_VECTOR lowering).  supportable_narrowing_operation also checks
for this.

Richard.
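(A sketch of the pack/unpack point above: when no single-instruction conversion exists and the source vector is wider than any result-side conversion can consume, the existing single-step lowering can still fall back to VEC_PACK_TRUNC; the exact codegen depends on the ISA flags, so treat this as illustrative:)

/* With e.g. plain -msse2 there is no single v8si -> v8hi conversion
   instruction (the 32-byte source exceeds the 16-byte vector size), so
   the generic lowering packs the two halves VEC_PACK_TRUNC-style.  */
typedef int   v8si __attribute__ ((__vector_size__ (32)));
typedef short v8hi __attribute__ ((__vector_size__ (16)));

v8hi
pack_narrow (v8si a)
{
  return __builtin_convertvector (a, v8hi);
}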
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, May 31, 2024 8:41 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> ubizjak@gmail.com
> Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int,
> float -> float and int <-> float.
>
> > I tried to use supportable_narrowing_operation and ran into two questions:
> >
> > 1) supportable_narrowing_operation supports v2df -> v16qi, but I don't
> > know which optab can help me convert v16qi to v2qi.
> Its API is a bit tricky, but for v2df -> v2qi (I expect you'll have an
> equal number of lanes in/out for .CONVERT_VECTOR) it likely outputs a
> multi-step conversion where you have to look into *INTERM_TYPES and
> second-guess the operation code to use for the intermediate steps
> (IIRC the intermediate steps all use either PACK/UNPACK or CONVERT,
> never FLOAT/FIX).

I made a mistake in what I said before. I think
supportable_narrowing_operation doesn't support v2df->v2qi; it only uses
VEC_PACK_TRUNC_EXPR in its intermediate steps, which makes it require
that vectype_in and vectype_out have the same size in order to return
true. I want to make sure I'm doing the right thing: I could build a
tmp_type via build_nonstandard_integer_type and get_same_sized_vectype,
and then use tree_vec_extract to extract the v2qi from the v16qi after
supportable_narrowing_operation.

> > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e); this result
> > is not what I expected, because it only uses vec_pack_trunc. I expect
> > it can use vcvttpd2dq + vpmovdw.
>
> With -O3 -fno-tree-loop-vectorize that's what you get. What you see is
> because of the restriction of the loop vectorizer to work on a single
> vector size only.

Yes, it works, but the program runs the NONE part
(tree-vect-stmts.cc:5357) instead of the NARROW_DST part
(tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the
code from line 5373 to line 5455 as a function. This avoids re-inventing
the wheel, and I get the results I'm looking for.

In addition to wrapping that code into a function: if your concern is
that our modifications are not general enough, I think we can add
supportable_narrowing/widening_operation after the current single-step
VEC_CONVERT handling (line 1972 and line 2078): it should try a single
step first and then fall back to multiple steps. If you agree, I'd like
to remove my changes about indirect conversions for now and keep only
the direct conversions, so that I can merge the three current patches
into the trunk first and then add the change about indirect conversions
later.

BRs,
Lin

> > If I can solve the first question and the function gets better (maybe
> > supporting trunc<vectype_in><vectype_out>), I'd be happy to use it
> > directly. I prefer my scheme for now. My function is more like
> > supportable_convert_operation. Perhaps we can modify
> > supportable_narrowing_operation, but I think that should be another
> > patch; it will influence the vectorizer.
>
> But since you are doing a multi-step conversion this is really what
> supportable_narrowing_operation is about. I don't think we want to
> re-invent the wheel here. Likewise your approach won't get you to use
> VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current
> single-step .CONVERT_VECTOR lowering).
> supportable_narrowing_operation also checks for this.
>
> Richard.

> > BRs,
> > Lin
> >
> > > > > /* Expand VEC_CONVERT ifn call.
*/ > > > > > > > > static void > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion > > > (gimple_stmt_iterator *gsi) > > > > else if (ret_elt_bits > arg_elt_bits) > > > > modifier = WIDEN; > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > + { > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > + gsi_replace (gsi, g, false); > > > > + return; > > > > + } > > > > + > > > > + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) > > > > + return; > > > > + > > > > + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) > > > > + return; > > > > + > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > FLOAT_EXPR)) > > > > { > > > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > - { > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > - gsi_replace (gsi, g, false); > > > > - return; > > > > - } > > > > /* Can't use get_compute_type here, as > supportable_convert_operation > > > > doesn't necessarily use an optab and needs two arguments. */ > > > > tree vec_compute_type > > > > > > > > > > -- > > > Richard Biener <rguenther@suse.de> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 > > > Nuernberg, Germany; > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > Nuernberg) > > > > -- > Richard Biener <rguenther@suse.de> > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
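The v2df -> v2qi shape mentioned above, written out as a standalone sketch (names invented, not from the patch): the source vector is 16 bytes while the destination is only 2, so the input and output vector types never have equal size, which is why a lowering built purely on VEC_PACK_TRUNC, which wants equal-sized vector types, cannot express it directly:

  typedef double v2df __attribute__ ((__vector_size__ (16)));
  typedef char   v2qi __attribute__ ((__vector_size__ (2)));

  /* 2 doubles (16 bytes) -> 2 chars (2 bytes): the same lane count,
     but very different vector sizes.  */
  v2qi
  cvt_pd_epi8 (v2df a)
  {
    return __builtin_convertvector (a, v2qi);
  }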
On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > -----Original Message----- > > From: Richard Biener <rguenther@suse.de> > > Sent: Friday, May 31, 2024 8:41 PM > > To: Hu, Lin1 <lin1.hu@intel.com> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > ubizjak@gmail.com > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float > > -> float and int <-> float. > > > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > > > -----Original Message----- > > > > From: Richard Biener <rguenther@suse.de> > > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > > > ubizjak@gmail.com > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for > > > > int -> int, float > > > > -> float and int <-> float. > > > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > PR target/107432 > > > > > * tree-vect-generic.cc > > > > > (supportable_indirect_narrowing_operation): New function for > > > > > support indirect narrowing convert. > > > > > (supportable_indirect_widening_operation): New function for > > > > > support indirect widening convert. > > > > > (expand_vector_conversion): Support convert for int -> int, > > > > > float -> float and int <-> float. > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > PR target/107432 > > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > > * gcc.target/i386/pr107432-6.c: Ditto. > > > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > > --- > > > > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc > > > > > index > > > > > ab640096ca2..0bedb53d9f9 100644 > > > > > --- a/gcc/tree-vect-generic.cc > > > > > +++ b/gcc/tree-vect-generic.cc > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not > > > > > see #include "gimple-match.h" > > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > > #include "optabs-libfuncs.h" > > > > > +#include "cfgloop.h" > > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > > > return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > +conversion > > > > for > > > > > + float <-> int, like double -> char. 
*/ bool > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > > > + enum tree_code code, > > > > > + tree lhs, > > > > > + tree arg) > > > > > +{ > > > > > + gimple *g; > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > + tree new_rhs; > > > > > + > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= > > > > arg_elt_bits) > > > > > + return false; > > > > > + > > > > > + unsigned short target_size; > > > > > + scalar_mode tmp_cvt_mode; > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); > > > > > + tree cvt_type = NULL_TREE; tmp_cvt_mode = lhs_mode; > > > > > + target_size = GET_MODE_SIZE (rhs_mode); > > > > > + > > > > > + opt_scalar_mode mode_iter; > > > > > + enum tree_code tc1, tc2; > > > > > + unsigned HOST_WIDE_INT nelts > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > + > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > + { > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > + > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > + break; > > > > > + > > > > > + scalar_mode cvt_mode; > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > + break; > > > > > + > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > > > > (arg_type); > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > + isUnsigned); > > > > > + > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > + if (cvt_type == NULL_TREE > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > + ret_type, > > > > > + cvt_type, &tc1) > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > + cvt_type, > > > > > + arg_type, &tc2)) > > > > > + continue; > > > > > + > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > + gsi_replace (gsi, g, false); > > > > > + return true; > > > > > + } > > > > > + return false; > > > > > +} > > > > > + > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > +conversion > > > > for > > > > > + float <-> int, like char -> double. 
*/ bool > > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, > > > > > + enum tree_code code, > > > > > + tree lhs, > > > > > + tree arg) > > > > > +{ > > > > > + gimple *g; > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > + tree new_rhs; > > > > > + > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > > > + (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) > > > > > + return false; > > > > > + > > > > > + unsigned short target_size; > > > > > + scalar_mode tmp_cvt_mode; > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); > > > > > + tree cvt_type = NULL_TREE; target_size = GET_MODE_SIZE > > > > > + (lhs_mode); int rhs_size = GET_MODE_BITSIZE (rhs_mode); if > > > > > + (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode)) > > > > > + return false; > > > > > + > > > > > + opt_scalar_mode mode_iter; > > > > > + enum tree_code tc1, tc2; > > > > > + unsigned HOST_WIDE_INT nelts > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > + > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > + { > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > + > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > + break; > > > > > + > > > > > + scalar_mode cvt_mode; > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > + break; > > > > > + > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > > > > (arg_type); > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > + isUnsigned); > > > > > + > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > + if (cvt_type == NULL_TREE > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > + ret_type, > > > > > + cvt_type, &tc1) > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > + cvt_type, > > > > > + arg_type, &tc2)) > > > > > + continue; > > > > > + > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > + gsi_replace (gsi, g, false); > > > > > + return true; > > > > > + } > > > > > + return false; > > > > > +} > > > > > + > > > > > > > > So the above improve the situation where the target can handle the > > > > two-step conversion. It doesn't really allow this to work for too > > > > large vectors AFAICS (nor does it try pack/unpack for any of the > > > > conversions). It also still duplicates code that's in the > > > > vectorizer. I think you should be able to use > > > > supportable_narrowing_operation and possibly even > > > > supportable_widening_operation (though that needs refatoring to > > > > avoid the vectorizer internal stmt_vec_info type - possibly simply by gating > > the respective code on a non-NULL vinfo). Both support multi-step conversions. > > > > > > > > > > I tried to use supportable_narrowing_operation and I met two questions: > > > > > > 1) supportable_narrowing_operation support v2df->v16qi, but I don't know > > > which optab can help me convert v16qi to v2qi. 
> > Its API is a bit tricky, but for v2df -> v2qi (I expect you'll have an
> > equal number of lanes in/out for .CONVERT_VECTOR) it likely outputs a
> > multi-step conversion where you have to look into *INTERM_TYPES and
> > second-guess the operation code to use for the intermediate steps
> > (IIRC the intermediate steps all use either PACK/UNPACK or CONVERT,
> > never FLOAT/FIX).
>
> I made a mistake in what I said before. I think
> supportable_narrowing_operation doesn't support v2df->v2qi; it only uses
> VEC_PACK_TRUNC_EXPR in its intermediate steps, which makes it require
> that vectype_in and vectype_out have the same size in order to return
> true. I want to make sure I'm doing the right thing: I could build a
> tmp_type via build_nonstandard_integer_type and get_same_sized_vectype,
> and then use tree_vec_extract to extract the v2qi from the v16qi after
> supportable_narrowing_operation.

Yes. It looks like the vectorizer, when the number of lanes of the
vector types agrees, goes down the 'NONE' conversion path, checks
supportable_convert_operation, and then has open-coded handling for

  /* For conversions between float and integer types try whether
     we can use intermediate signed integer types to support the
     conversion. */

That means I was wrong in indicating that
supportable_narrowing_operation was for element narrowing; it is for
number-of-lanes "narrowing".

That said, vectorizable_conversion, in the NONE case, has handling that
should be split out into a function that is usable also from vector
lowering, so that both vectorization and lowering handle the same
cases. The interface would be similar to
supportable_narrowing_operation.

> > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e); this result
> > > is not what I expected, because it only uses vec_pack_trunc. I
> > > expect it can use vcvttpd2dq + vpmovdw.
> >
> > With -O3 -fno-tree-loop-vectorize that's what you get. What you see is
> > because of the restriction of the loop vectorizer to work on a single
> > vector size only.
>
> Yes, it works, but the program runs the NONE part
> (tree-vect-stmts.cc:5357) instead of the NARROW_DST part
> (tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the
> code from line 5373 to line 5455 as a function. This avoids
> re-inventing the wheel, and I get the results I'm looking for.

Yeah.

> In addition to wrapping that code into a function: if your concern is
> that our modifications are not general enough, I think we can add
> supportable_narrowing/widening_operation after the current single-step
> VEC_CONVERT handling (line 1972 and line 2078): it should try a single
> step first and then fall back to multiple steps. If you agree, I'd like
> to remove my changes about indirect conversions for now and keep only
> the direct conversions, so that I can merge the three current patches
> into the trunk first and then add the change about indirect conversions
> later.

I think it should go like finding the largest compute_vectype pair
(source/destination) that we can handle either directly or indirectly
via the new function.

Richard.

> BRs,
> Lin
>
> > > If I can solve the first question and the function gets better
> > > (maybe supporting trunc<vectype_in><vectype_out>), I'd be happy to
> > > use it directly. I prefer my scheme for now. My function is more
> > > like supportable_convert_operation. Perhaps we can modify
> > > supportable_narrowing_operation, but I think that should be another
> > > patch; it will influence the vectorizer.
> > > > But since you are doing a multi-step conversion this is really what > > supportable_narrowing_operation is about. I don't think we want to re-invent > > the wheel here. Likewise your approach won't get you to use > > VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current single- > > step .CONVERT_VECTOR lowering). > > supportable_narrowing_operation also checks for this. > > > > Richard. > > > > > > > BRs, > > > Lin > > > > > > > > > > > > /* Expand VEC_CONVERT ifn call. */ > > > > > > > > > > static void > > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion > > > > (gimple_stmt_iterator *gsi) > > > > > else if (ret_elt_bits > arg_elt_bits) > > > > > modifier = WIDEN; > > > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > > + { > > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > > + gsi_replace (gsi, g, false); > > > > > + return; > > > > > + } > > > > > + > > > > > + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) > > > > > + return; > > > > > + > > > > > + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) > > > > > + return; > > > > > + > > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > > FLOAT_EXPR)) > > > > > { > > > > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > > - { > > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > > - gsi_replace (gsi, g, false); > > > > > - return; > > > > > - } > > > > > /* Can't use get_compute_type here, as > > supportable_convert_operation > > > > > doesn't necessarily use an optab and needs two arguments. */ > > > > > tree vec_compute_type > > > > > > > > > > > > > -- > > > > Richard Biener <rguenther@suse.de> > > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 > > > > Nuernberg, Germany; > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > > Nuernberg) > > > > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE Software Solutions Germany GmbH, > > Frankenstrasse 146, 90461 Nuernberg, Germany; > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) >
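Written out by hand at the source level, the intermediate-signed-integer-type idea amounts to the following (a sketch with invented names; the instruction names are expectations, not guarantees):

  typedef double v8df __attribute__ ((__vector_size__ (64)));
  typedef int    v8si __attribute__ ((__vector_size__ (32)));
  typedef char   v8qi __attribute__ ((__vector_size__ (8)));

  /* double -> char in two individually supported steps: FIX_TRUNC
     double -> int (e.g. vcvttpd2dq), then a narrowing integer
     conversion int -> char (e.g. vpmovdb).  */
  v8qi
  cvt_pd_epi8_two_step (v8df a)
  {
    v8si t = __builtin_convertvector (a, v8si);
    return __builtin_convertvector (t, v8qi);
  }

Splitting the NONE-path handling out of vectorizable_conversion would let vector lowering synthesize exactly this pair of statements instead of falling back to piecewise code.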
> -----Original Message----- > From: Richard Biener <rguenther@suse.de> > Sent: Monday, June 3, 2024 5:03 PM > To: Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > ubizjak@gmail.com > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float > -> float and int <-> float. > > On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > > > -----Original Message----- > > > From: Richard Biener <rguenther@suse.de> > > > Sent: Friday, May 31, 2024 8:41 PM > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > > ubizjak@gmail.com > > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for > > > int -> int, float > > > -> float and int <-> float. > > > > > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > > > > > -----Original Message----- > > > > > From: Richard Biener <rguenther@suse.de> > > > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao > > > > > <hongtao.liu@intel.com>; ubizjak@gmail.com > > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn > > > > > for int -> int, float > > > > > -> float and int <-> float. > > > > > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > PR target/107432 > > > > > > * tree-vect-generic.cc > > > > > > (supportable_indirect_narrowing_operation): New function for > > > > > > support indirect narrowing convert. > > > > > > (supportable_indirect_widening_operation): New function for > > > > > > support indirect widening convert. > > > > > > (expand_vector_conversion): Support convert for int -> int, > > > > > > float -> float and int <-> float. > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > PR target/107432 > > > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > > > * gcc.target/i386/pr107432-6.c: Ditto. > > > > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > > > --- > > > > > > diff --git a/gcc/tree-vect-generic.cc > > > > > > b/gcc/tree-vect-generic.cc index > > > > > > ab640096ca2..0bedb53d9f9 100644 > > > > > > --- a/gcc/tree-vect-generic.cc > > > > > > +++ b/gcc/tree-vect-generic.cc > > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If > > > > > > not see #include "gimple-match.h" > > > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > > > #include "optabs-libfuncs.h" > > > > > > +#include "cfgloop.h" > > > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > > > > return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > +conversion > > > > > for > > > > > > + float <-> int, like double -> char. 
*/ bool > > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > > > > + enum tree_code code, > > > > > > + tree lhs, > > > > > > + tree arg) > > > > > > +{ > > > > > > + gimple *g; > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > > + tree new_rhs; > > > > > > + > > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); > > > > > > + if (code != FIX_TRUNC_EXPR || flag_trapping_math || > > > > > > + ret_elt_bits >= > > > > > arg_elt_bits) > > > > > > + return false; > > > > > > + > > > > > > + unsigned short target_size; scalar_mode tmp_cvt_mode; > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE; > > > > > > + tmp_cvt_mode = lhs_mode; target_size = GET_MODE_SIZE > > > > > > + (rhs_mode); > > > > > > + > > > > > > + opt_scalar_mode mode_iter; > > > > > > + enum tree_code tc1, tc2; > > > > > > + unsigned HOST_WIDE_INT nelts > > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > > + > > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > > + { > > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > > + > > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > > + break; > > > > > > + > > > > > > + scalar_mode cvt_mode; > > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > > + break; > > > > > > + > > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || > > > > > > + TYPE_UNSIGNED > > > > > (arg_type); > > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > > + isUnsigned); > > > > > > + > > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > > + if (cvt_type == NULL_TREE > > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > > + ret_type, > > > > > > + cvt_type, &tc1) > > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > > + cvt_type, > > > > > > + arg_type, &tc2)) > > > > > > + continue; > > > > > > + > > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > > + gsi_replace (gsi, g, false); > > > > > > + return true; > > > > > > + } > > > > > > + return false; > > > > > > +} > > > > > > + > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > +conversion > > > > > for > > > > > > + float <-> int, like char -> double. 
*/ bool > > > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, > > > > > > + enum tree_code code, > > > > > > + tree lhs, > > > > > > + tree arg) > > > > > > +{ > > > > > > + gimple *g; > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > > + tree new_rhs; > > > > > > + > > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); > > > > > > + if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) > > > > > > + return false; > > > > > > + > > > > > > + unsigned short target_size; scalar_mode tmp_cvt_mode; > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE; > > > > > > + target_size = GET_MODE_SIZE (lhs_mode); int rhs_size = > > > > > > + GET_MODE_BITSIZE (rhs_mode); if (!int_mode_for_size (rhs_size, > 0).exists (&tmp_cvt_mode)) > > > > > > + return false; > > > > > > + > > > > > > + opt_scalar_mode mode_iter; > > > > > > + enum tree_code tc1, tc2; > > > > > > + unsigned HOST_WIDE_INT nelts > > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > > + > > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > > + { > > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > > + > > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > > + break; > > > > > > + > > > > > > + scalar_mode cvt_mode; > > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > > + break; > > > > > > + > > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || > > > > > > + TYPE_UNSIGNED > > > > > (arg_type); > > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > > + isUnsigned); > > > > > > + > > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > > + if (cvt_type == NULL_TREE > > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > > + ret_type, > > > > > > + cvt_type, &tc1) > > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > > + cvt_type, > > > > > > + arg_type, &tc2)) > > > > > > + continue; > > > > > > + > > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > > + gsi_replace (gsi, g, false); > > > > > > + return true; > > > > > > + } > > > > > > + return false; > > > > > > +} > > > > > > + > > > > > > > > > > So the above improve the situation where the target can handle > > > > > the two-step conversion. It doesn't really allow this to work > > > > > for too large vectors AFAICS (nor does it try pack/unpack for > > > > > any of the conversions). It also still duplicates code that's > > > > > in the vectorizer. I think you should be able to use > > > > > supportable_narrowing_operation and possibly even > > > > > supportable_widening_operation (though that needs refatoring to > > > > > avoid the vectorizer internal stmt_vec_info type - possibly > > > > > simply by gating > > > the respective code on a non-NULL vinfo). Both support multi-step > conversions. 
> > > > > > > > > > > > > I tried to use supportable_narrowing_operation and I met two questions: > > > > > > > > 1) supportable_narrowing_operation support v2df->v16qi, but I don't know > > > > which optab can help me convert v16qi to v2qi. > > > > > > It's API is a bit tricky but for v2df -> v2qi (I expect you'll have > > > an equal number of lanes in/out for .CONVERT_VECTOR) it likely > > > outputs a multi-step conversion where you have to look into > > > *INTERM_TYPES and second-guess the operation code to use for the > > > intermediate steps (IIRC the intermediate steps all use either PACK/UNPACK > or CONVERT, never FLOAT/FIX). > > > > > > > I made a mistake in what I said before. I think > > supportable_narrowing_operation doesn't support v2df->v2qi, it only > > use VEC_PACK_TRUNC_EXPRT in its intermediate steps. This makes it > > require that vectype_in and vectype_out have the same size to return > > true. I want to make sure I'm doing the right thing, I can build a > > tmp_type by build_nonstandard_integer_type and get_same_sized_vectype. > > And use tree_vec_extract to extract v2qi from v16qi after > > supportable_narrowing_operation. > > Yes. It looks like the vectorizer, when the vector types number of lanes agree > goes the 'NONE' conversion path, checks supportable_convert_operation and > then has open-coded handling for > > /* For conversions between float and integer types try whether > we can use intermediate signed integer types to support the > conversion. */ > > that means I was wrong in indicating supportable_narrowing_operation was for > element narrowing, it is for number-of-lane "narrowing". > > That said, vectorizable_conversion, in the NONE case has handling that should > be split out into a function that's usable also from vector lowering then so that > both vectorization and lowering handle the same cases. The interface would be > similar to supportable_narrowing_operation. > > > > > > > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is > > > > not what I expected, because it only use vec_pack_trunc. I expect it > > > > can use vcvttpd2dq + vpmovdw. > > > > > > With -O3 -fno-tree-loop-vectorize that's what you get. What you see > > > is because of the restriction of the loop vectorizer to work on a single vector > size only. > > > > > > > Yes, it works, but the program runs the NONE part > > (tree-vect-stmts.cc:5357) instead of the NARROW_DST part > > (tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the > > code from line:5373 to line:5455 as a function. This avoids > > duplicating the wheel, and I get the results I'm looking for. > > Yeah. > > > In addition to wrapping the function. If you are motivated by the fact > > that our modifications are not generalized enough, I think we can add > > supportable_narrowing/widening_operation after the current single step > > VEC_CONVERT (line 1972 and line 2078). It should try to use a single > > step and then use multiple steps. If you agree, I'd like to remove my > > changes about indirect conversions for now, and keep only the direct > > conversions, so that I can merge the three current patches into the > > trunk first, and then add the change about indirect conversions later. > > I think it should go like finding the largest compute_vectype pair > (source/destination) that we can handle either directly or indirectly via the new > function. > > Richard. > Thanks, I will wrap the code in the new function and put out a new version of this patch. 
I have a small question, what does "finding the largest compute_vectype pair" mean? Some piece of code from gcc? BRs, Lin > > > > > > > > > > If I can solve the first question and the function be better > > > > (maybe support trunc<vectype_in><vectype_out>), I'd be happy to > > > > use it directly. I prefer my scheme for now. My functions is more > > > > like supportable_convert_operation. Perhaps, we can modify > > > > supportable_narrowing_operation, but I think it should be another > > > > patch, it will influence vectorizer. > > > > > > But since you are doing a multi-step conversion this is really what > > > supportable_narrowing_operation is about. I don't think we want to > > > re-invent the wheel here. Likewise your approach won't get you to > > > use VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current > > > single- step .CONVERT_VECTOR lowering). > > > supportable_narrowing_operation also checks for this. > > > > > > Richard. > > > > > > > > > > BRs, > > > > Lin > > > > > > > > > > > > > > > /* Expand VEC_CONVERT ifn call. */ > > > > > > > > > > > > static void > > > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion > > > > > (gimple_stmt_iterator *gsi) > > > > > > else if (ret_elt_bits > arg_elt_bits) > > > > > > modifier = WIDEN; > > > > > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, > &code1)) > > > > > > + { > > > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > > > + gsi_replace (gsi, g, false); > > > > > > + return; > > > > > > + } > > > > > > + > > > > > > + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) > > > > > > + return; > > > > > > + > > > > > > + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) > > > > > > + return; > > > > > > + > > > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > > > FLOAT_EXPR)) > > > > > > { > > > > > > - if (supportable_convert_operation (code, ret_type, arg_type, > &code1)) > > > > > > - { > > > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > > > - gsi_replace (gsi, g, false); > > > > > > - return; > > > > > > - } > > > > > > /* Can't use get_compute_type here, as > > > supportable_convert_operation > > > > > > doesn't necessarily use an optab and needs two arguments. */ > > > > > > tree vec_compute_type > > > > > > > > > > > > > > > > -- > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > > > Nuernberg) > > > > > > > > > > -- > > > Richard Biener <rguenther@suse.de> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 > > > Nuernberg, Germany; > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > Nuernberg) > > > > -- > Richard Biener <rguenther@suse.de> > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
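For the widening direction that supportable_indirect_widening_operation targets, the mirror-image sketch (again with invented names; the instruction pairing is an expectation):

  typedef char   v4qi __attribute__ ((__vector_size__ (4)));
  typedef double v4df __attribute__ ((__vector_size__ (32)));

  /* char -> double in two steps: widen char -> int (e.g. vpmovsxbd),
     then FLOAT int -> double (e.g. vcvtdq2pd).  */
  v4df
  cvt_epi8_pd (v4qi a)
  {
    return __builtin_convertvector (a, v4df);
  }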
On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > -----Original Message----- > > From: Richard Biener <rguenther@suse.de> > > Sent: Monday, June 3, 2024 5:03 PM > > To: Hu, Lin1 <lin1.hu@intel.com> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > ubizjak@gmail.com > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float > > -> float and int <-> float. > > > > On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > > > > > -----Original Message----- > > > > From: Richard Biener <rguenther@suse.de> > > > > Sent: Friday, May 31, 2024 8:41 PM > > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > > > ubizjak@gmail.com > > > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for > > > > int -> int, float > > > > -> float and int <-> float. > > > > > > > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > > > > > > > -----Original Message----- > > > > > > From: Richard Biener <rguenther@suse.de> > > > > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao > > > > > > <hongtao.liu@intel.com>; ubizjak@gmail.com > > > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn > > > > > > for int -> int, float > > > > > > -> float and int <-> float. > > > > > > > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > > > PR target/107432 > > > > > > > * tree-vect-generic.cc > > > > > > > (supportable_indirect_narrowing_operation): New function for > > > > > > > support indirect narrowing convert. > > > > > > > (supportable_indirect_widening_operation): New function for > > > > > > > support indirect widening convert. > > > > > > > (expand_vector_conversion): Support convert for int -> int, > > > > > > > float -> float and int <-> float. > > > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > > > PR target/107432 > > > > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-6.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > > > > --- > > > > > > > diff --git a/gcc/tree-vect-generic.cc > > > > > > > b/gcc/tree-vect-generic.cc index > > > > > > > ab640096ca2..0bedb53d9f9 100644 > > > > > > > --- a/gcc/tree-vect-generic.cc > > > > > > > +++ b/gcc/tree-vect-generic.cc > > > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If > > > > > > > not see #include "gimple-match.h" > > > > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > > > > #include "optabs-libfuncs.h" > > > > > > > +#include "cfgloop.h" > > > > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > > > > > return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > > +conversion > > > > > > for > > > > > > > + float <-> int, like double -> char. 
*/ bool > > > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > > > > > + enum tree_code code, > > > > > > > + tree lhs, > > > > > > > + tree arg) > > > > > > > +{ > > > > > > > + gimple *g; > > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > > > + tree new_rhs; > > > > > > > + > > > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); > > > > > > > + if (code != FIX_TRUNC_EXPR || flag_trapping_math || > > > > > > > + ret_elt_bits >= > > > > > > arg_elt_bits) > > > > > > > + return false; > > > > > > > + > > > > > > > + unsigned short target_size; scalar_mode tmp_cvt_mode; > > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER > > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE; > > > > > > > + tmp_cvt_mode = lhs_mode; target_size = GET_MODE_SIZE > > > > > > > + (rhs_mode); > > > > > > > + > > > > > > > + opt_scalar_mode mode_iter; > > > > > > > + enum tree_code tc1, tc2; > > > > > > > + unsigned HOST_WIDE_INT nelts > > > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > > > + > > > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > > > + { > > > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > > > + > > > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > > > + break; > > > > > > > + > > > > > > > + scalar_mode cvt_mode; > > > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > > > + break; > > > > > > > + > > > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || > > > > > > > + TYPE_UNSIGNED > > > > > > (arg_type); > > > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > > > + isUnsigned); > > > > > > > + > > > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > > > + if (cvt_type == NULL_TREE > > > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > > > + ret_type, > > > > > > > + cvt_type, &tc1) > > > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > > > + cvt_type, > > > > > > > + arg_type, &tc2)) > > > > > > > + continue; > > > > > > > + > > > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > > > + gsi_replace (gsi, g, false); > > > > > > > + return true; > > > > > > > + } > > > > > > > + return false; > > > > > > > +} > > > > > > > + > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > > +conversion > > > > > > for > > > > > > > + float <-> int, like char -> double. 
*/ bool > > > > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, > > > > > > > + enum tree_code code, > > > > > > > + tree lhs, > > > > > > > + tree arg) > > > > > > > +{ > > > > > > > + gimple *g; > > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > > > + tree new_rhs; > > > > > > > + > > > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); > > > > > > > + if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) > > > > > > > + return false; > > > > > > > + > > > > > > > + unsigned short target_size; scalar_mode tmp_cvt_mode; > > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER > > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE; > > > > > > > + target_size = GET_MODE_SIZE (lhs_mode); int rhs_size = > > > > > > > + GET_MODE_BITSIZE (rhs_mode); if (!int_mode_for_size (rhs_size, > > 0).exists (&tmp_cvt_mode)) > > > > > > > + return false; > > > > > > > + > > > > > > > + opt_scalar_mode mode_iter; > > > > > > > + enum tree_code tc1, tc2; > > > > > > > + unsigned HOST_WIDE_INT nelts > > > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > > > + > > > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > > > + { > > > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > > > + > > > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > > > + break; > > > > > > > + > > > > > > > + scalar_mode cvt_mode; > > > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > > > + break; > > > > > > > + > > > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || > > > > > > > + TYPE_UNSIGNED > > > > > > (arg_type); > > > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > > > + isUnsigned); > > > > > > > + > > > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > > > + if (cvt_type == NULL_TREE > > > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > > > + ret_type, > > > > > > > + cvt_type, &tc1) > > > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > > > + cvt_type, > > > > > > > + arg_type, &tc2)) > > > > > > > + continue; > > > > > > > + > > > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > > > + gsi_replace (gsi, g, false); > > > > > > > + return true; > > > > > > > + } > > > > > > > + return false; > > > > > > > +} > > > > > > > + > > > > > > > > > > > > So the above improve the situation where the target can handle > > > > > > the two-step conversion. It doesn't really allow this to work > > > > > > for too large vectors AFAICS (nor does it try pack/unpack for > > > > > > any of the conversions). It also still duplicates code that's > > > > > > in the vectorizer. 
I think you should be able to use > > > > > > supportable_narrowing_operation and possibly even > > > > > > supportable_widening_operation (though that needs refatoring to > > > > > > avoid the vectorizer internal stmt_vec_info type - possibly > > > > > > simply by gating > > > > the respective code on a non-NULL vinfo). Both support multi-step > > conversions. > > > > > > > > > > > > > > > > I tried to use supportable_narrowing_operation and I met two questions: > > > > > > > > > > 1) supportable_narrowing_operation support v2df->v16qi, but I don't know > > > > > which optab can help me convert v16qi to v2qi. > > > > > > > > It's API is a bit tricky but for v2df -> v2qi (I expect you'll have > > > > an equal number of lanes in/out for .CONVERT_VECTOR) it likely > > > > outputs a multi-step conversion where you have to look into > > > > *INTERM_TYPES and second-guess the operation code to use for the > > > > intermediate steps (IIRC the intermediate steps all use either PACK/UNPACK > > or CONVERT, never FLOAT/FIX). > > > > > > > > > > I made a mistake in what I said before. I think > > > supportable_narrowing_operation doesn't support v2df->v2qi, it only > > > use VEC_PACK_TRUNC_EXPRT in its intermediate steps. This makes it > > > require that vectype_in and vectype_out have the same size to return > > > true. I want to make sure I'm doing the right thing, I can build a > > > tmp_type by build_nonstandard_integer_type and get_same_sized_vectype. > > > And use tree_vec_extract to extract v2qi from v16qi after > > > supportable_narrowing_operation. > > > > Yes. It looks like the vectorizer, when the vector types number of lanes agree > > goes the 'NONE' conversion path, checks supportable_convert_operation and > > then has open-coded handling for > > > > /* For conversions between float and integer types try whether > > we can use intermediate signed integer types to support the > > conversion. */ > > > > that means I was wrong in indicating supportable_narrowing_operation was for > > element narrowing, it is for number-of-lane "narrowing". > > > > That said, vectorizable_conversion, in the NONE case has handling that should > > be split out into a function that's usable also from vector lowering then so that > > both vectorization and lowering handle the same cases. The interface would be > > similar to supportable_narrowing_operation. > > > > > > > > > > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is > > > > > not what I expected, because it only use vec_pack_trunc. I expect it > > > > > can use vcvttpd2dq + vpmovdw. > > > > > > > > With -O3 -fno-tree-loop-vectorize that's what you get. What you see > > > > is because of the restriction of the loop vectorizer to work on a single vector > > size only. > > > > > > > > > > Yes, it works, but the program runs the NONE part > > > (tree-vect-stmts.cc:5357) instead of the NARROW_DST part > > > (tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the > > > code from line:5373 to line:5455 as a function. This avoids > > > duplicating the wheel, and I get the results I'm looking for. > > > > Yeah. > > > > > In addition to wrapping the function. If you are motivated by the fact > > > that our modifications are not generalized enough, I think we can add > > > supportable_narrowing/widening_operation after the current single step > > > VEC_CONVERT (line 1972 and line 2078). It should try to use a single > > > step and then use multiple steps. 
> > > If you agree, I'd like to remove my changes about indirect
> > > conversions for now and keep only the direct conversions, so that I
> > > can merge the three current patches into the trunk first and then
> > > add the change about indirect conversions later.
> >
> > I think it should go like finding the largest compute_vectype pair
> > (source/destination) that we can handle either directly or indirectly
> > via the new function.
> >
> > Richard.
>
> Thanks, I will wrap the code in the new function and put out a new
> version of this patch. I have a small question: what does "finding the
> largest compute_vectype pair" mean? Is it some piece of code from GCC?

No, I mean what vector lowering does for .VEC_CONVERT right now; it uses

  /* Can't use get_compute_type here, as supportable_convert_operation
     doesn't necessarily use an optab and needs two arguments.  */
  tree vec_compute_type
    = type_for_widest_vector_mode (arg_type, mov_optab);

or

  if (optab1)
    compute_type = get_compute_type (code1, optab1, arg_type);

and then expand_vector_piecewise to emit code for, say, a V4SF -> V4QI
conversion from a V16SF -> V16QI .VEC_CONVERT.

Richard.

> BRs,
> Lin
>
> > > > > If I can solve the first question and the function gets better
> > > > > (maybe supporting trunc<vectype_in><vectype_out>), I'd be happy
> > > > > to use it directly. I prefer my scheme for now. My function is
> > > > > more like supportable_convert_operation. Perhaps we can modify
> > > > > supportable_narrowing_operation, but I think that should be
> > > > > another patch; it will influence the vectorizer.
> > > >
> > > > But since you are doing a multi-step conversion this is really
> > > > what supportable_narrowing_operation is about. I don't think we
> > > > want to re-invent the wheel here. Likewise your approach won't get
> > > > you to use VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the
> > > > current single-step .CONVERT_VECTOR lowering).
> > > > supportable_narrowing_operation also checks for this.
> > > >
> > > > Richard.
> > >
> > > BRs,
> > > Lin
> > >
> > > > > > /* Expand VEC_CONVERT ifn call. */
> > > > > >
> > > > > > static void
> > > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
> > > > > >    else if (ret_elt_bits > arg_elt_bits)
> > > > > >      modifier = WIDEN;
> > > > > >
> > > > > > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > > > > +    {
> > > > > > +      g = gimple_build_assign (lhs, code1, arg);
> > > > > > +      gsi_replace (gsi, g, false);
> > > > > > +      return;
> > > > > > +    }
> > > > > > +
> > > > > > +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> > > > > > +    return;
> > > > > > +
> > > > > > +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> > > > > > +    return;
> > > > > > +
> > > > > >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
> > > > > >      {
> > > > > > -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > > > > -        {
> > > > > > -          g = gimple_build_assign (lhs, code1, arg);
> > > > > > -          gsi_replace (gsi, g, false);
> > > > > > -          return;
> > > > > > -        }
> > > > > >        /* Can't use get_compute_type here, as supportable_convert_operation
> > > > > >           doesn't necessarily use an optab and needs two arguments.
*/ > > > > > > > tree vec_compute_type > > > > > > > > > > > > > > > > > > > -- > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > > > > Nuernberg) > > > > > > > > > > > > > -- > > > > Richard Biener <rguenther@suse.de> > > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 > > > > Nuernberg, Germany; > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > > Nuernberg) > > > > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE Software Solutions Germany GmbH, > > Frankenstrasse 146, 90461 Nuernberg, Germany; > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) >
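As a concrete illustration of the double -> short case discussed above, here is a hand-written sketch (ours, not part of the submitted patch; the function name and option set are illustrative): with the indirect narrowing in place, the conversion below is expected to lower to a FIX_TRUNC into an intermediate v2si followed by an integer narrowing, i.e. vcvttpd2dq + vpmovdw, rather than piecewise scalar code. Note that -fno-trapping-math is needed, since the new helper bails out when flag_trapping_math is set.

/* Illustrative sketch, not from the patch; compile with
   -O2 -mavx512vl -fno-trapping-math.  The expected sequence is
   vcvttpd2dq (double -> int) followed by vpmovdw (int -> short).  */
typedef short __v2hi __attribute__ ((__vector_size__ (4)));
typedef double __v2df __attribute__ ((__vector_size__ (16)));

__v2hi
cvt_v2df_v2hi (__v2df a)
{
  return __builtin_convertvector (a, __v2hi);
}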
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); +} + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); +} + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__m128i 
mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); +} + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); +} + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); +} + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2su); +} + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); +} + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); +} + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); +} + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4hu); +} + +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); +} + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); +} + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); +} + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); +} + +__v2hu mm64_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); +} + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); +} + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); +} + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); +} + +__v2qu mm64_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); +} + +__v4qu mm_cvtepu32_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); +} + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); +} + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); +} + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); +} + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); +} + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); +} + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options 
"-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) +{ + return (__m128i)__builtin_convertvector(a, __v8hi); +} + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); +} + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df 
a) +{ + return __builtin_convertvector(a, __v2sf); +} + +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2hf); +} + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2si); +} + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); +} + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); +} + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); +} + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); +} + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qi); +} + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); +} + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); +} + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); +} + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); +} + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); +} + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); +} + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); +} + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); +} + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); +} + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); +} + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); +} + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); +} + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); +} + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); +} + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); +} + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); +} + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qu); +} + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); +} + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 
00000000000..0ff5a97ed1a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,156 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2df); +} + +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4df); +} + +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8df); +} + +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2df); +} + +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4df); +} + +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8df); +} + +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2sf); +} + +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4sf); +} + +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); +} + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); +} + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2sf); +} + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); +} + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); +} + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); +} + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); +} + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); +} + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); +} + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); +} + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); +} + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); +} + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); +} + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); +} + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); +} + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return 
__builtin_convertvector((__v32qu)a, __v32hf); +} diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ab640096ca2..0bedb53d9f9 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see #include "gimple-match.h" #include "recog.h" /* FIXME: for insn_data */ #include "optabs-libfuncs.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" /* Build a ternary operation and gimplify it. Emit code before GSI. @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a, return gimplify_build2 (gsi, code, outer_type, b, c); } +/* A subroutine of expand_vector_conversion, to support indirect narrowing + conversions from float to int, like double -> char. */ +bool +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, + enum tree_code code, + tree lhs, + tree arg) +{ + gimple *g; + tree ret_type = TREE_TYPE (lhs); + tree arg_type = TREE_TYPE (arg); + tree new_rhs; + + unsigned int ret_elt_bits = vector_element_bits (ret_type); + unsigned int arg_elt_bits = vector_element_bits (arg_type); + if (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= arg_elt_bits) + return false; + + unsigned short target_size; + scalar_mode tmp_cvt_mode; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); + tree cvt_type = NULL_TREE; + tmp_cvt_mode = lhs_mode; + target_size = GET_MODE_SIZE (rhs_mode); + + opt_scalar_mode mode_iter; + enum tree_code tc1, tc2; + unsigned HOST_WIDE_INT nelts + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); + + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) + { + tmp_cvt_mode = mode_iter.require (); + + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) + break; + + scalar_mode cvt_mode; + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) + break; + + int cvt_size = GET_MODE_BITSIZE (cvt_mode); + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type); + cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned); + + cvt_type = build_vector_type (cvt_type, nelts); + if (cvt_type == NULL_TREE + || !supportable_convert_operation ((tree_code) NOP_EXPR, + ret_type, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) code, + cvt_type, + arg_type, &tc2)) + continue; + + new_rhs = make_ssa_name (cvt_type); + g = vect_gimple_build (new_rhs, tc2, arg); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, tc1, new_rhs); + gsi_replace (gsi, g, false); + return true; + } + return false; +} + +/* A subroutine of expand_vector_conversion, to support indirect widening + conversions from int to float, like char -> double. 
*/ +bool +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, + enum tree_code code, + tree lhs, + tree arg) +{ + gimple *g; + tree ret_type = TREE_TYPE (lhs); + tree arg_type = TREE_TYPE (arg); + tree new_rhs; + + unsigned int ret_elt_bits = vector_element_bits (ret_type); + unsigned int arg_elt_bits = vector_element_bits (arg_type); + if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) + return false; + + unsigned short target_size; + scalar_mode tmp_cvt_mode; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); + tree cvt_type = NULL_TREE; + target_size = GET_MODE_SIZE (lhs_mode); + int rhs_size = GET_MODE_BITSIZE (rhs_mode); + if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode)) + return false; + + opt_scalar_mode mode_iter; + enum tree_code tc1, tc2; + unsigned HOST_WIDE_INT nelts + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); + + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) + { + tmp_cvt_mode = mode_iter.require (); + + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) + break; + + scalar_mode cvt_mode; + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) + break; + + int cvt_size = GET_MODE_BITSIZE (cvt_mode); + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type); + cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned); + + cvt_type = build_vector_type (cvt_type, nelts); + if (cvt_type == NULL_TREE + || !supportable_convert_operation ((tree_code) code, + ret_type, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) NOP_EXPR, + cvt_type, + arg_type, &tc2)) + continue; + + new_rhs = make_ssa_name (cvt_type); + g = vect_gimple_build (new_rhs, tc2, arg); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, tc1, new_rhs); + gsi_replace (gsi, g, false); + return true; + } + return false; +} + /* Expand VEC_CONVERT ifn call. */ static void @@ -1871,14 +2009,21 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) else if (ret_elt_bits > arg_elt_bits) modifier = WIDEN; + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) + { + g = gimple_build_assign (lhs, code1, arg); + gsi_replace (gsi, g, false); + return; + } + + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) + return; + + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) + return; + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) { - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) - { - g = gimple_build_assign (lhs, code1, arg); - gsi_replace (gsi, g, false); - return; - } /* Can't use get_compute_type here, as supportable_convert_operation doesn't necessarily use an optab and needs two arguments. */ tree vec_compute_type
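To make the effect of the two new helpers concrete, here is a source-level sketch (ours; the function name and the assumed v2si intermediate are illustrative, and depend on which 2x-wider mode the FOR_EACH_2XWIDER_MODE scan accepts first) of the decomposition supportable_indirect_narrowing_operation emits for a v2df -> v2qi .VEC_CONVERT: first the tc2 statement (FIX_TRUNC_EXPR into the intermediate integer vector), then the tc1 statement (NOP_EXPR narrowing into the final type). supportable_indirect_widening_operation mirrors this with the NOP_EXPR extension first and the FLOAT_EXPR second.

/* Hand-written sketch of the emitted decomposition, expressed as C.  */
typedef char __v2qi __attribute__ ((__vector_size__ (2)));
typedef int __v2si __attribute__ ((__vector_size__ (8)));
typedef double __v2df __attribute__ ((__vector_size__ (16)));

__v2qi
lowered_cvt_v2df_v2qi (__v2df a)
{
  /* tc2, FIX_TRUNC_EXPR: float -> intermediate int, e.g. vcvttpd2dq.  */
  __v2si tmp = __builtin_convertvector (a, __v2si);
  /* tc1, NOP_EXPR: integer narrowing, e.g. vpmovdb.  */
  return __builtin_convertvector (tmp, __v2qi);
}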