
[1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

Message ID 20240523063742.2333446-2-lin1.hu@intel.com
State New
Series Optimize __builtin_convertvector for x86-64-v4 and

Commit Message

Hu, Lin1 May 23, 2024, 6:37 a.m. UTC
gcc/ChangeLog:

	PR target/107432
	* tree-vect-generic.cc
	(supportable_indirect_narrowing_operation): New function to
	support indirect narrowing conversions.
	(supportable_indirect_widening_operation): New function to
	support indirect widening conversions.
	(expand_vector_conversion): Support conversions for int -> int,
	float -> float and int <-> float.

gcc/testsuite/ChangeLog:

	PR target/107432
	* gcc.target/i386/pr107432-1.c: New test.
	* gcc.target/i386/pr107432-2.c: Ditto.
	* gcc.target/i386/pr107432-3.c: Ditto.
	* gcc.target/i386/pr107432-4.c: Ditto.
	* gcc.target/i386/pr107432-5.c: Ditto.
	* gcc.target/i386/pr107432-6.c: Ditto.
	* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
 gcc/tree-vect-generic.cc                   | 157 +++++++++++++-
 8 files changed, 968 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
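
For context, a minimal example of the kind of conversion this series targets,
in the same style as the new pr107432-1.c test (the function name below is
illustrative, not part of the patch): a 64-bit -> 8-bit integer element
narrowing via __builtin_convertvector that, with
-march=x86-64 -mavx512vl -mavx512bw -O3, is expected to become a single
vpmovqb instead of per-element conversions.

/* Illustrative sketch only; mirrors gcc.target/i386/pr107432-1.c.  */
#include <x86intrin.h>

typedef char __v8qi __attribute__ ((__vector_size__ (8)));

__v8qi
cvt_v8di_to_v8qi (__m512i a)
{
  /* Narrow eight 64-bit elements to eight 8-bit elements; with this
     series the expansion uses a single vector conversion (vpmovqb).  */
  return __builtin_convertvector ((__v8di) a, __v8qi);
}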

Comments

Richard Biener May 29, 2024, 9:40 a.m. UTC | #1
On Thu, 23 May 2024, Hu, Lin1 wrote:

> gcc/ChangeLog:
> 
> 	PR target/107432
> 	* tree-vect-generic.cc
> 	(supportable_indirect_narrowing_operation): New function for
> 	support indirect narrowing convert.
> 	(supportable_indirect_widening_operation): New function for
> 	support indirect widening convert.
> 	(expand_vector_conversion): Support convert for int -> int,
> 	float -> float and int <-> float.
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR target/107432
> 	* gcc.target/i386/pr107432-1.c: New test.
> 	* gcc.target/i386/pr107432-2.c: Ditto.
> 	* gcc.target/i386/pr107432-3.c: Ditto.
> 	* gcc.target/i386/pr107432-4.c: Ditto.
> 	* gcc.target/i386/pr107432-5.c: Ditto.
> 	* gcc.target/i386/pr107432-6.c: Ditto.
> 	* gcc.target/i386/pr107432-7.c: Ditto.
> ---
>  gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
>  gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
>  gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++
>  gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
>  gcc/tree-vect-generic.cc                   | 157 +++++++++++++-
>  8 files changed, 968 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
> 
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> new file mode 100644
> index 00000000000..a4f37447eb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> @@ -0,0 +1,234 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef short __v2hi __attribute__ ((__vector_size__ (4)));
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +
> +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
> +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
> +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
> +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
> +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
> +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
> +
> +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2si);
> +}
> +
> +__m128i	mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
> +}
> +
> +__m256i	mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
> +}
> +
> +__v2hi	mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2hi);
> +}
> +
> +__v4hi	mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4hi);
> +}
> +
> +__m128i	mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
> +}
> +
> +__v2qi	mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2qi);
> +}
> +
> +__v4qi	mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4qi);
> +}
> +
> +__v8qi	mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
> +{
> +  return __builtin_convertvector((__v8di)a, __v8qi);
> +}
> +
> +__v2hi	mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
> +{
> +  return __builtin_convertvector((__v2si)a, __v2hi);
> +}
> +
> +__v4hi	mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4si)a, __v4hi);
> +}
> +
> +__m128i	mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8si)a, __v8hi);
> +}
> +
> +__m256i	mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v16si)a, __v16hi);
> +}
> +
> +__v2qi	mm64_cvtepi32_epi8_builtin_convertvector(__v2si a)
> +{
> +  return __builtin_convertvector((__v2si)a, __v2qi);
> +}
> +
> +__v4qi	mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4si)a, __v4qi);
> +}
> +
> +__v8qi	mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v8si)a, __v8qi);
> +}
> +
> +__m128i	mm512_cvtepi32_epi8_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16si)a, __v16qi);
> +}
> +
> +__v2qi	mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a)
> +{
> +  return __builtin_convertvector((__v2hi)a, __v2qi);
> +}
> +
> +__v8qi	mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v8hi)a, __v8qi);
> +}
> +
> +__m128i	mm256_cvtepi16_epi8_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi);
> +}
> +
> +__m256i	mm512_cvtepi16_epi8_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi);
> +}
> +
> +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2du)a, __v2su);
> +}
> +
> +__m128i	mm256_cvtepu64_epu32_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v4du)a, __v4su);
> +}
> +
> +__m256i	mm512_cvtepu64_epu32_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v8du)a, __v8su);
> +}
> +
> +__v2hu	mm_cvtepu64_epu16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2du)a, __v2hu);
> +}
> +
> +__v4hu	mm256_cvtepu64_epu16_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4du)a, __v4hu);
> +}
> +
> +__m128i	mm512_cvtepu64_epu16_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8du)a, __v8hu);
> +}
> +
> +__v2qu	mm_cvtepu64_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2du)a, __v2qu);
> +}
> +
> +__v4qu	mm256_cvtepu64_epu8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4du)a, __v4qu);
> +}
> +
> +__v8qu	mm512_cvtepu64_epu8_builtin_convertvector(__m512i a)
> +{
> +  return __builtin_convertvector((__v8du)a, __v8qu);
> +}
> +
> +__v2hu	mm32_cvtepu32_epu16_builtin_convertvector(__v2su a)
> +{
> +  return __builtin_convertvector((__v2su)a, __v2hu);
> +}
> +
> +__v4hu	mm_cvtepu32_epu16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4su)a, __v4hu);
> +}
> +
> +__m128i	mm256_cvtepu32_epu16_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8su)a, __v8hu);
> +}
> +
> +__m256i	mm512_cvtepu32_epu16_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v16su)a, __v16hu);
> +}
> +
> +__v2qu	mm32_cvtepu32_epu8_builtin_convertvector(__v2su a)
> +{
> +  return __builtin_convertvector((__v2su)a, __v2qu);
> +}
> +
> +__v4qu	mm_cvtepu2_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4su)a, __v4qu);
> +}
> +
> +__v8qu	mm256_cvtepu32_epu8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v8su)a, __v8qu);
> +}
> +
> +__m128i	mm512_cvtepu32_epu8_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16su)a, __v16qu);
> +}
> +
> +__v2qu	mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a)
> +{
> +  return __builtin_convertvector((__v2hu)a, __v2qu);
> +}
> +
> +__v8qu	mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v8hu)a, __v8qu);
> +}
> +
> +__m128i	mm256_cvtepu16_epu8_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu);
> +}
> +
> +__m256i	mm512_cvtepu16_epu8_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c
> new file mode 100644
> index 00000000000..02ffd811cb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c
> @@ -0,0 +1,105 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef short __v2hi __attribute__ ((__vector_size__ (4)));
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +
> +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__m256i	mm256_cvtepi32_epi64_builtin_convertvector(__v4si a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di);
> +}
> +
> +__m512i	mm512_cvtepi32_epi64_builtin_convertvector(__v8si a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di);
> +}
> +
> +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__m256i	mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di);
> +}
> +
> +__m512i	mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di);
> +}
> +
> +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__m256i	mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di);
> +}
> +
> +__m512i	mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di);
> +}
> +
> +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a)
> +{
> +  return (__m128i)__builtin_convertvector(a, __v4si);
> +}
> +
> +__m256i	mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v8si);
> +}
> +
> +__m512i	mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v16si);
> +}
> +
> +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a)
> +{
> +  return (__m128i)__builtin_convertvector(a, __v4si);
> +}
> +
> +__m256i	mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v8si);
> +}
> +
> +__m512i	mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v16si);
> +}
> +
> +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a)
> +{
> +  return (__m128i)__builtin_convertvector(a, __v8hi);
> +}
> +
> +__m256i	mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v16hi);
> +}
> +
> +__v32hi	mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a)
> +{
> +  return __builtin_convertvector(a, __v32hi);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c
> new file mode 100644
> index 00000000000..30dc947b6dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c
> @@ -0,0 +1,55 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector(a, __v2sf);
> +}
> +
> +__v4sf	mm256_cvtpd_ps_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4sf);
> +}
> +
> +__v8sf	mm512_cvtpd_ps_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8sf);
> +}
> +
> +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector(a, __v2hf);
> +}
> +
> +__v4hf	mm256_cvtpd_ph_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4hf);
> +}
> +
> +__v8hf	mm512_cvtpd_ph_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8hf);
> +}
> +
> +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector(a, __v4hf);
> +}
> +
> +__v8hf	mm256_cvtps_ph_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8hf);
> +}
> +
> +__v16hf	mm512_cvtps_ph_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector(a, __v16hf);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c
> new file mode 100644
> index 00000000000..e537e7349e4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector(a, __v2df);
> +}
> +
> +__v4df	mm256_cvtps_pd_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector(a, __v4df);
> +}
> +
> +__v8df	mm512_cvtps_pd_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8df);
> +}
> +
> +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector(a, __v2df);
> +}
> +
> +__v4df	mm256_cvtph_pd_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4df);
> +}
> +
> +__v8df	mm512_cvtph_pd_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8df);
> +}
> +
> +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4sf);
> +}
> +
> +__v8sf	mm256_cvtph_ps_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8sf);
> +}
> +
> +__v16sf	mm512_cvtph_ps_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector(a, __v16sf);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c
> new file mode 100644
> index 00000000000..5a44ef9f3b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c
> @@ -0,0 +1,72 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector(a, __v2si);
> +}
> +
> +__v4si	mm256_cvtpd_epi32_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4si);
> +}
> +
> +__v8si	mm512_cvtpd_epi32_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8si);
> +}
> +
> +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__v4di	mm256_cvtps_epi64_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector(a, __v4di);
> +}
> +
> +__v8di	mm512_cvtps_epi64_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8di);
> +}
> +
> +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4si);
> +}
> +
> +__v8si	mm256_cvtph_epi32_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8si);
> +}
> +
> +__v16si	mm512_cvtph_epi32_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector(a, __v16si);
> +}
> +
> +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__v4di	mm256_cvtph_epi64_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4di);
> +}
> +
> +__v8di	mm512_cvtph_epi64_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8di);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c
> new file mode 100644
> index 00000000000..4a68a10b089
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
> @@ -0,0 +1,139 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +typedef char __v16qi __attribute__ ((__vector_size__ (16)));
> +typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
> +typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
> +typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
> +typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
> +
> +__v2qi	mm_cvtpd_epi8_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector((__v2df)a, __v2qi);
> +}
> +
> +__v4qi	mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector((__v4df)a, __v4qi);
> +}
> +
> +__v8qi	mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector((__v8df)a, __v8qi);
> +}
> +
> +__v2qu	mm_cvtpd_epu8_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector((__v2df)a, __v2qu);
> +}
> +
> +__v4qu	mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector((__v4df)a, __v4qu);
> +}
> +
> +__v8qu	mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector((__v8df)a, __v8qu);
> +}
> +
> +__v2qi	mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector((__v2sf)a, __v2qi);
> +}
> +
> +__v4qi	mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector((__v4sf)a, __v4qi);
> +}
> +
> +__v8qi	mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector((__v8sf)a, __v8qi);
> +}
> +
> +__v16qi	mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector((__v16sf)a, __v16qi);
> +}
> +
> +__v2qu	mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector((__v2sf)a, __v2qu);
> +}
> +
> +__v4qu	mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector((__v4sf)a, __v4qu);
> +}
> +
> +__v8qu	mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector((__v8sf)a, __v8qu);
> +}
> +
> +__v16qu	mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector((__v16sf)a, __v16qu);
> +}
> +
> +__v2qi	mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector((__v2hf)a, __v2qi);
> +}
> +
> +__v8qi	mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector((__v8hf)a, __v8qi);
> +}
> +
> +__v16qi	mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector((__v16hf)a, __v16qi);
> +}
> +
> +__v32qi	mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
> +{
> +  return __builtin_convertvector((__v32hf)a, __v32qi);
> +}
> +
> +__v2qu	mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector((__v2hf)a, __v2qu);
> +}
> +
> +__v8qu	mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector((__v8hf)a, __v8qu);
> +}
> +
> +__v16qu	mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector((__v16hf)a, __v16qu);
> +}
> +
> +__v32qu	mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
> +{
> +  return __builtin_convertvector((__v32hf)a, __v32qu);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> new file mode 100644
> index 00000000000..0ff5a97ed1a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> @@ -0,0 +1,156 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
> +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
> +
> +#include <x86intrin.h>
> +
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +typedef char __v16qi __attribute__ ((__vector_size__ (16)));
> +typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
> +typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
> +typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
> +typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
> +
> +__v2df	mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2df);
> +}
> +
> +__v4df	mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4df);
> +}
> +
> +__v8df	mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8df);
> +}
> +
> +__v2df	mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2df);
> +}
> +
> +__v4df	mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4df);
> +}
> +
> +__v8df	mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8df);
> +}
> +
> +__v2sf	mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2sf);
> +}
> +
> +__v4sf	mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4sf);
> +}
> +
> +__v8sf	mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8sf);
> +}
> +
> +__v16sf	mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
> +{
> +  return __builtin_convertvector((__v16qi)a, __v16sf);
> +}
> +
> +__v2sf	mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2sf);
> +}
> +
> +__v4sf	mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4sf);
> +}
> +
> +__v8sf	mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8sf);
> +}
> +
> +__v16sf	mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
> +{
> +  return __builtin_convertvector((__v16qu)a, __v16sf);
> +}
> +
> +__v2hf	mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2hf);
> +}
> +
> +__v4hf	mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4hf);
> +}
> +
> +__v8hf	mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8hf);
> +}
> +
> +__v16hf	mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
> +{
> +  return __builtin_convertvector((__v16qi)a, __v16hf);
> +}
> +
> +__v32hf	mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
> +{
> +  return __builtin_convertvector((__v32qi)a, __v32hf);
> +}
> +
> +__v2hf	mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2hf);
> +}
> +
> +__v4hf	mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4hf);
> +}
> +
> +__v8hf	mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8hf);
> +}
> +
> +__v16hf	mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
> +{
> +  return __builtin_convertvector((__v16qu)a, __v16hf);
> +}
> +
> +__v32hf	mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
> +{
> +  return __builtin_convertvector((__v32qu)a, __v32hf);
> +}
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index ab640096ca2..0bedb53d9f9 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "gimple-match.h"
>  #include "recog.h"		/* FIXME: for insn_data */
>  #include "optabs-libfuncs.h"
> +#include "cfgloop.h"
> +#include "tree-vectorizer.h"
>  
>  
>  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
>    return gimplify_build2 (gsi, code, outer_type, b, c);
>  }
>  
> +/* A subroutine of expand_vector_conversion, support indirect conversion for
> +   float <-> int, like double -> char.  */
> +bool
> +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> +					 enum tree_code code,
> +					 tree lhs,
> +					 tree arg)
> +{
> +  gimple *g;
> +  tree ret_type = TREE_TYPE (lhs);
> +  tree arg_type = TREE_TYPE (arg);
> +  tree new_rhs;
> +
> +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> +  unsigned int arg_elt_bits = vector_element_bits (arg_type);
> +  if (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= arg_elt_bits)
> +    return false;
> +
> +  unsigned short target_size;
> +  scalar_mode tmp_cvt_mode;
> +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> +  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> +  tree cvt_type = NULL_TREE;
> +  tmp_cvt_mode = lhs_mode;
> +  target_size = GET_MODE_SIZE (rhs_mode);
> +
> +  opt_scalar_mode mode_iter;
> +  enum tree_code tc1, tc2;
> +  unsigned HOST_WIDE_INT nelts
> +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> +
> +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> +    {
> +      tmp_cvt_mode = mode_iter.require ();
> +
> +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> +	break;
> +
> +      scalar_mode cvt_mode;
> +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> +	break;
> +
> +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> +      cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> +
> +      cvt_type = build_vector_type (cvt_type, nelts);
> +      if (cvt_type == NULL_TREE
> +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> +					     ret_type,
> +					     cvt_type, &tc1)
> +	  || !supportable_convert_operation ((tree_code) code,
> +					     cvt_type,
> +					     arg_type, &tc2))
> +	continue;
> +
> +      new_rhs = make_ssa_name (cvt_type);
> +      g = vect_gimple_build (new_rhs, tc2, arg);
> +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +      g = gimple_build_assign (lhs, tc1, new_rhs);
> +      gsi_replace (gsi, g, false);
> +      return true;
> +    }
> +  return false;
> +}
> +
> +/* A subroutine of expand_vector_conversion, support indirect conversion for
> +   float <-> int, like char -> double.  */
> +bool
> +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> +					 enum tree_code code,
> +					 tree lhs,
> +					 tree arg)
> +{
> +  gimple *g;
> +  tree ret_type = TREE_TYPE (lhs);
> +  tree arg_type = TREE_TYPE (arg);
> +  tree new_rhs;
> +
> +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> +  unsigned int arg_elt_bits = vector_element_bits (arg_type);
> +  if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> +    return false;
> +
> +  unsigned short target_size;
> +  scalar_mode tmp_cvt_mode;
> +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> +  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> +  tree cvt_type = NULL_TREE;
> +  target_size = GET_MODE_SIZE (lhs_mode);
> +  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
> +  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> +    return false;
> +
> +  opt_scalar_mode mode_iter;
> +  enum tree_code tc1, tc2;
> +  unsigned HOST_WIDE_INT nelts
> +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> +
> +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> +    {
> +      tmp_cvt_mode = mode_iter.require ();
> +
> +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> +	break;
> +
> +      scalar_mode cvt_mode;
> +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> +	break;
> +
> +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> +      cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> +
> +      cvt_type = build_vector_type (cvt_type, nelts);
> +      if (cvt_type == NULL_TREE
> +	  || !supportable_convert_operation ((tree_code) code,
> +					     ret_type,
> +					     cvt_type, &tc1)
> +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> +					     cvt_type,
> +					     arg_type, &tc2))
> +	continue;
> +
> +      new_rhs = make_ssa_name (cvt_type);
> +      g = vect_gimple_build (new_rhs, tc2, arg);
> +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +      g = gimple_build_assign (lhs, tc1, new_rhs);
> +      gsi_replace (gsi, g, false);
> +      return true;
> +    }
> +  return false;
> +}
> +

So the above improves the situation where the target can handle
the two-step conversion.  It doesn't really allow this to work
for too large vectors AFAICS (nor does it try pack/unpack for
any of the conversions).  It also still duplicates code
that's in the vectorizer.  I think you should be able to use
supportable_narrowing_operation and possibly even
supportable_widening_operation (though that needs refactoring to
avoid the vectorizer internal stmt_vec_info type - possibly
simply by gating the respective code on a non-NULL vinfo).  Both 
support multi-step conversions.
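
To make the two-step case concrete, here is a rough sketch of my own (not
taken from the patch or this thread) of the lowering
supportable_indirect_narrowing_operation aims for on a v8df -> v8qi
conversion, written as two explicit __builtin_convertvector steps; the type
and function names are made up, and the expected instructions assume
AVX512F/AVX512VL with -fno-trapping-math:

/* Sketch only: two-step narrowing through a wider integer vector.  */
typedef double      v8df_t __attribute__ ((__vector_size__ (64)));
typedef int         v8si_t __attribute__ ((__vector_size__ (32)));
typedef signed char v8qi_t __attribute__ ((__vector_size__ (8)));

v8qi_t
narrow_in_two_steps (v8df_t src)
{
  /* Step 1: float -> int truncation at the intermediate width
     (expected to map to vcvttpd2dq).  */
  v8si_t tmp = __builtin_convertvector (src, v8si_t);
  /* Step 2: integer narrowing to the final element width
     (expected to map to vpmovdb).  */
  return __builtin_convertvector (tmp, v8qi_t);
}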

>  /* Expand VEC_CONVERT ifn call.  */
>  
>  static void
> @@ -1871,14 +2009,21 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
>    else if (ret_elt_bits > arg_elt_bits)
>      modifier = WIDEN;
>  
> +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> +    {
> +      g = gimple_build_assign (lhs, code1, arg);
> +      gsi_replace (gsi, g, false);
> +      return;
> +    }
> +
> +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> +    return;
> +
> +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> +    return;
> +
>    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
>      {
> -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> -	{
> -	  g = gimple_build_assign (lhs, code1, arg);
> -	  gsi_replace (gsi, g, false);
> -	  return;
> -	}
>        /* Can't use get_compute_type here, as supportable_convert_operation
>  	 doesn't necessarily use an optab and needs two arguments.  */
>        tree vec_compute_type
>
Hu, Lin1 May 31, 2024, 8:54 a.m. UTC | #2
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Wednesday, May 29, 2024 5:41 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> ubizjak@gmail.com
> Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float
> -> float and int <-> float.
> 
> On Thu, 23 May 2024, Hu, Lin1 wrote:
> 
> > gcc/ChangeLog:
> >
> > 	PR target/107432
> > 	* tree-vect-generic.cc
> > 	(supportable_indirect_narrowing_operation): New function for
> > 	support indirect narrowing convert.
> > 	(supportable_indirect_widening_operation): New function for
> > 	support indirect widening convert.
> > 	(expand_vector_conversion): Support convert for int -> int,
> > 	float -> float and int <-> float.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR target/107432
> > 	* gcc.target/i386/pr107432-1.c: New test.
> > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > ---
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> > ab640096ca2..0bedb53d9f9 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
> > #include "gimple-match.h"
> >  #include "recog.h"		/* FIXME: for insn_data */
> >  #include "optabs-libfuncs.h"
> > +#include "cfgloop.h"
> > +#include "tree-vectorizer.h"
> >
> >
> >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> >    return gimplify_build2 (gsi, code, outer_type, b, c);  }
> >
> > +/* A subroutine of expand_vector_conversion, support indirect conversion
> for
> > +   float <-> int, like double -> char.  */ bool
> > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> > +					 enum tree_code code,
> > +					 tree lhs,
> > +					 tree arg)
> > +{
> > +  gimple *g;
> > +  tree ret_type = TREE_TYPE (lhs);
> > +  tree arg_type = TREE_TYPE (arg);
> > +  tree new_rhs;
> > +
> > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >=
> arg_elt_bits)
> > +    return false;
> > +
> > +  unsigned short target_size;
> > +  scalar_mode tmp_cvt_mode;
> > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));  tree
> > + cvt_type = NULL_TREE;  tmp_cvt_mode = lhs_mode;  target_size =
> > + GET_MODE_SIZE (rhs_mode);
> > +
> > +  opt_scalar_mode mode_iter;
> > +  enum tree_code tc1, tc2;
> > +  unsigned HOST_WIDE_INT nelts
> > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > +
> > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > +    {
> > +      tmp_cvt_mode = mode_iter.require ();
> > +
> > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > +	break;
> > +
> > +      scalar_mode cvt_mode;
> > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > +	break;
> > +
> > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> (arg_type);
> > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > + isUnsigned);
> > +
> > +      cvt_type = build_vector_type (cvt_type, nelts);
> > +      if (cvt_type == NULL_TREE
> > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > +					     ret_type,
> > +					     cvt_type, &tc1)
> > +	  || !supportable_convert_operation ((tree_code) code,
> > +					     cvt_type,
> > +					     arg_type, &tc2))
> > +	continue;
> > +
> > +      new_rhs = make_ssa_name (cvt_type);
> > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > +      gsi_replace (gsi, g, false);
> > +      return true;
> > +    }
> > +  return false;
> > +}
> > +
> > +/* A subroutine of expand_vector_conversion, support indirect conversion
> for
> > +   float <-> int, like char -> double.  */ bool
> > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> > +					 enum tree_code code,
> > +					 tree lhs,
> > +					 tree arg)
> > +{
> > +  gimple *g;
> > +  tree ret_type = TREE_TYPE (lhs);
> > +  tree arg_type = TREE_TYPE (arg);
> > +  tree new_rhs;
> > +
> > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > + (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> > +    return false;
> > +
> > +  unsigned short target_size;
> > +  scalar_mode tmp_cvt_mode;
> > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));  tree
> > + cvt_type = NULL_TREE;  target_size = GET_MODE_SIZE (lhs_mode);  int
> > + rhs_size = GET_MODE_BITSIZE (rhs_mode);  if (!int_mode_for_size
> > + (rhs_size, 0).exists (&tmp_cvt_mode))
> > +    return false;
> > +
> > +  opt_scalar_mode mode_iter;
> > +  enum tree_code tc1, tc2;
> > +  unsigned HOST_WIDE_INT nelts
> > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > +
> > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > +    {
> > +      tmp_cvt_mode = mode_iter.require ();
> > +
> > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > +	break;
> > +
> > +      scalar_mode cvt_mode;
> > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > +	break;
> > +
> > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> (arg_type);
> > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > + isUnsigned);
> > +
> > +      cvt_type = build_vector_type (cvt_type, nelts);
> > +      if (cvt_type == NULL_TREE
> > +	  || !supportable_convert_operation ((tree_code) code,
> > +					     ret_type,
> > +					     cvt_type, &tc1)
> > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > +					     cvt_type,
> > +					     arg_type, &tc2))
> > +	continue;
> > +
> > +      new_rhs = make_ssa_name (cvt_type);
> > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > +      gsi_replace (gsi, g, false);
> > +      return true;
> > +    }
> > +  return false;
> > +}
> > +
> 
> So the above improve the situation where the target can handle the two-step
> conversion.  It doesn't really allow this to work for too large vectors AFAICS (nor
> does it try pack/unpack for any of the conversions).  It also still duplicates code
> that's in the vectorizer.  I think you should be able to use
> supportable_narrowing_operation and possibly even
> supportable_widening_operation (though that needs refatoring to avoid the
> vectorizer internal stmt_vec_info type - possibly simply by gating the respective
> code on a non-NULL vinfo).  Both support multi-step conversions.
>

I tried to use supportable_narrowing_operation and ran into two questions:

1) supportable_narrowing_operation supports v2df->v16qi, but I don't know which optab can help me convert v16qi to v2qi.
2) I tried a testcase (https://godbolt.org/z/z88xYW85e), and the result is not what I expected, because it only uses vec_pack_trunc. I expect it could use vcvttpd2dq + vpmovdw.

If I can solve the first question and the function can be improved (maybe to support trunc<vectype_in><vectype_out>), I'd be happy to use it directly. For now I prefer my scheme. My functions are more like supportable_convert_operation. Perhaps we can modify supportable_narrowing_operation, but I think that should be a separate patch, since it will affect the vectorizer.
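
For illustration (the godbolt source linked above is not reproduced here, so
this is only a reconstruction of the scenario), a loop doing a double ->
short conversion that one would like to see vectorized as vcvttpd2dq
followed by vpmovdw rather than a vec_pack_trunc sequence:

/* Hypothetical reconstruction, not the actual godbolt testcase.
   Compile e.g. with -O3 -mavx512vl -fno-trapping-math.  */
void
cvt_double_to_short (short *restrict dst, const double *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    /* Per-element double -> short truncation; the hoped-for codegen is
       vcvttpd2dq (double -> int32) followed by vpmovdw (int32 -> int16).  */
    dst[i] = src[i];
}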

BRs,
Lin

>
> >  /* Expand VEC_CONVERT ifn call.  */
> >
> >  static void
> > @@ -1871,14 +2009,21 @@ expand_vector_conversion
> (gimple_stmt_iterator *gsi)
> >    else if (ret_elt_bits > arg_elt_bits)
> >      modifier = WIDEN;
> >
> > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > +    {
> > +      g = gimple_build_assign (lhs, code1, arg);
> > +      gsi_replace (gsi, g, false);
> > +      return;
> > +    }
> > +
> > +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> > +    return;
> > +
> > +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> > +    return;
> > +
> >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> FLOAT_EXPR))
> >      {
> > -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > -	{
> > -	  g = gimple_build_assign (lhs, code1, arg);
> > -	  gsi_replace (gsi, g, false);
> > -	  return;
> > -	}
> >        /* Can't use get_compute_type here, as supportable_convert_operation
> >  	 doesn't necessarily use an optab and needs two arguments.  */
> >        tree vec_compute_type
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
Richard Biener May 31, 2024, 12:41 p.m. UTC | #3
On Fri, 31 May 2024, Hu, Lin1 wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Wednesday, May 29, 2024 5:41 PM
> > To: Hu, Lin1 <lin1.hu@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> > ubizjak@gmail.com
> > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float
> > -> float and int <-> float.
> > 
> > On Thu, 23 May 2024, Hu, Lin1 wrote:
> > 
> > > gcc/ChangeLog:
> > >
> > > 	PR target/107432
> > > 	* tree-vect-generic.cc
> > > 	(supportable_indirect_narrowing_operation): New function for
> > > 	support indirect narrowing convert.
> > > 	(supportable_indirect_widening_operation): New function for
> > > 	support indirect widening convert.
> > > 	(expand_vector_conversion): Support convert for int -> int,
> > > 	float -> float and int <-> float.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > 	PR target/107432
> > > 	* gcc.target/i386/pr107432-1.c: New test.
> > > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > > ---
> > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> > > ab640096ca2..0bedb53d9f9 100644
> > > --- a/gcc/tree-vect-generic.cc
> > > +++ b/gcc/tree-vect-generic.cc
> > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
> > > #include "gimple-match.h"
> > >  #include "recog.h"		/* FIXME: for insn_data */
> > >  #include "optabs-libfuncs.h"
> > > +#include "cfgloop.h"
> > > +#include "tree-vectorizer.h"
> > >
> > >
> > >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> > (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> > >    return gimplify_build2 (gsi, code, outer_type, b, c);  }
> > >
> > > +/* A subroutine of expand_vector_conversion, support indirect conversion
> > for
> > > +   float <-> int, like double -> char.  */ bool
> > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> > > +					 enum tree_code code,
> > > +					 tree lhs,
> > > +					 tree arg)
> > > +{
> > > +  gimple *g;
> > > +  tree ret_type = TREE_TYPE (lhs);
> > > +  tree arg_type = TREE_TYPE (arg);
> > > +  tree new_rhs;
> > > +
> > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >=
> > arg_elt_bits)
> > > +    return false;
> > > +
> > > +  unsigned short target_size;
> > > +  scalar_mode tmp_cvt_mode;
> > > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));  tree
> > > + cvt_type = NULL_TREE;  tmp_cvt_mode = lhs_mode;  target_size =
> > > + GET_MODE_SIZE (rhs_mode);
> > > +
> > > +  opt_scalar_mode mode_iter;
> > > +  enum tree_code tc1, tc2;
> > > +  unsigned HOST_WIDE_INT nelts
> > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > +
> > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > +    {
> > > +      tmp_cvt_mode = mode_iter.require ();
> > > +
> > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > +	break;
> > > +
> > > +      scalar_mode cvt_mode;
> > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > +	break;
> > > +
> > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> > (arg_type);
> > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > + isUnsigned);
> > > +
> > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > +      if (cvt_type == NULL_TREE
> > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > +					     ret_type,
> > > +					     cvt_type, &tc1)
> > > +	  || !supportable_convert_operation ((tree_code) code,
> > > +					     cvt_type,
> > > +					     arg_type, &tc2))
> > > +	continue;
> > > +
> > > +      new_rhs = make_ssa_name (cvt_type);
> > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > +      gsi_replace (gsi, g, false);
> > > +      return true;
> > > +    }
> > > +  return false;
> > > +}
> > > +
> > > +/* A subroutine of expand_vector_conversion, support indirect conversion
> > for
> > > +   float <-> int, like char -> double.  */ bool
> > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> > > +					 enum tree_code code,
> > > +					 tree lhs,
> > > +					 tree arg)
> > > +{
> > > +  gimple *g;
> > > +  tree ret_type = TREE_TYPE (lhs);
> > > +  tree arg_type = TREE_TYPE (arg);
> > > +  tree new_rhs;
> > > +
> > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > > + (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> > > +    return false;
> > > +
> > > +  unsigned short target_size;
> > > +  scalar_mode tmp_cvt_mode;
> > > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));  tree
> > > + cvt_type = NULL_TREE;  target_size = GET_MODE_SIZE (lhs_mode);  int
> > > + rhs_size = GET_MODE_BITSIZE (rhs_mode);  if (!int_mode_for_size
> > > + (rhs_size, 0).exists (&tmp_cvt_mode))
> > > +    return false;
> > > +
> > > +  opt_scalar_mode mode_iter;
> > > +  enum tree_code tc1, tc2;
> > > +  unsigned HOST_WIDE_INT nelts
> > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > +
> > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > +    {
> > > +      tmp_cvt_mode = mode_iter.require ();
> > > +
> > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > +	break;
> > > +
> > > +      scalar_mode cvt_mode;
> > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > +	break;
> > > +
> > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> > (arg_type);
> > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > + isUnsigned);
> > > +
> > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > +      if (cvt_type == NULL_TREE
> > > +	  || !supportable_convert_operation ((tree_code) code,
> > > +					     ret_type,
> > > +					     cvt_type, &tc1)
> > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > +					     cvt_type,
> > > +					     arg_type, &tc2))
> > > +	continue;
> > > +
> > > +      new_rhs = make_ssa_name (cvt_type);
> > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > +      gsi_replace (gsi, g, false);
> > > +      return true;
> > > +    }
> > > +  return false;
> > > +}
> > > +
> > 
> > So the above improve the situation where the target can handle the two-step
> > conversion.  It doesn't really allow this to work for too large vectors AFAICS (nor
> > does it try pack/unpack for any of the conversions).  It also still duplicates code
> > that's in the vectorizer.  I think you should be able to use
> > supportable_narrowing_operation and possibly even
> > supportable_widening_operation (though that needs refatoring to avoid the
> > vectorizer internal stmt_vec_info type - possibly simply by gating the respective
> > code on a non-NULL vinfo).  Both support multi-step conversions.
> >
> 
> I tried to use supportable_narrowing_operation and I met two questions:
> 
> 1) supportable_narrowing_operation support v2df->v16qi, but I don't know 
>    which optab can help me convert v16qi to v2qi.

Its API is a bit tricky but for v2df -> v2qi (I expect you'll have
an equal number of lanes in/out for .CONVERT_VECTOR) it likely outputs
a multi-step conversion where you have to look into *INTERM_TYPES
and second-guess the operation code to use for the intermediate steps
(IIRC the intermediate steps all use either PACK/UNPACK or CONVERT,
never FLOAT/FIX).

> 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is 
>    not what I expected, because it only use vec_pack_trunc. I expect it 
>    can use vcvttpd2dq + vpmovdw.

With -O3 -fno-tree-loop-vectorize that's what you get.  What you see
is because of the restriction of the loop vectorizer to work on a
single vector size only.

> If I can solve the first question and the function can be improved (maybe
> to support trunc<vectype_in><vectype_out>), I'd be happy to use it
> directly. For now I prefer my scheme. My functions are more like
> supportable_convert_operation. Perhaps we can modify
> supportable_narrowing_operation, but I think that should be a separate
> patch, since it would affect the vectorizer.

But since you are doing a multi-step conversion this is really what
supportable_narrowing_operation is about.  I don't think we want to
re-invent the wheel here.  Likewise your approach won't get you
to use VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the
current single-step .CONVERT_VECTOR lowering).  
supportable_narrowing_operation also checks for this.

Richard.

> BRs,
> Lin
> 
> >
> > >  /* Expand VEC_CONVERT ifn call.  */
> > >
> > >  static void
> > > @@ -1871,14 +2009,21 @@ expand_vector_conversion
> > (gimple_stmt_iterator *gsi)
> > >    else if (ret_elt_bits > arg_elt_bits)
> > >      modifier = WIDEN;
> > >
> > > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > +    {
> > > +      g = gimple_build_assign (lhs, code1, arg);
> > > +      gsi_replace (gsi, g, false);
> > > +      return;
> > > +    }
> > > +
> > > +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> > > +    return;
> > > +
> > > +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> > > +    return;
> > > +
> > >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> > FLOAT_EXPR))
> > >      {
> > > -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > -	{
> > > -	  g = gimple_build_assign (lhs, code1, arg);
> > > -	  gsi_replace (gsi, g, false);
> > > -	  return;
> > > -	}
> > >        /* Can't use get_compute_type here, as supportable_convert_operation
> > >  	 doesn't necessarily use an optab and needs two arguments.  */
> > >        tree vec_compute_type
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>
Hu, Lin1 June 3, 2024, 8:23 a.m. UTC | #4
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, May 31, 2024 8:41 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> ubizjak@gmail.com
> Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float
> -> float and int <-> float.
> 
> On Fri, 31 May 2024, Hu, Lin1 wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Wednesday, May 29, 2024 5:41 PM
> > > To: Hu, Lin1 <lin1.hu@intel.com>
> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> > > ubizjak@gmail.com
> > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for
> > > int -> int, float
> > > -> float and int <-> float.
> > >
> > > On Thu, 23 May 2024, Hu, Lin1 wrote:
> > >
> > > > gcc/ChangeLog:
> > > >
> > > > 	PR target/107432
> > > > 	* tree-vect-generic.cc
> > > > 	(supportable_indirect_narrowing_operation): New function for
> > > > 	support indirect narrowing convert.
> > > > 	(supportable_indirect_widening_operation): New function for
> > > > 	support indirect widening convert.
> > > > 	(expand_vector_conversion): Support convert for int -> int,
> > > > 	float -> float and int <-> float.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > 	PR target/107432
> > > > 	* gcc.target/i386/pr107432-1.c: New test.
> > > > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > > > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > > > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > > > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > > > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > > > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > > > ---
> > > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> > > > index
> > > > ab640096ca2..0bedb53d9f9 100644
> > > > --- a/gcc/tree-vect-generic.cc
> > > > +++ b/gcc/tree-vect-generic.cc
> > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not
> > > > see #include "gimple-match.h"
> > > >  #include "recog.h"		/* FIXME: for insn_data */
> > > >  #include "optabs-libfuncs.h"
> > > > +#include "cfgloop.h"
> > > > +#include "tree-vectorizer.h"
> > > >
> > > >
> > > >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> > > (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> > > >    return gimplify_build2 (gsi, code, outer_type, b, c);  }
> > > >
> > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > +conversion
> > > for
> > > > +   float <-> int, like double -> char.  */ bool
> > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> > > > +					 enum tree_code code,
> > > > +					 tree lhs,
> > > > +					 tree arg)
> > > > +{
> > > > +  gimple *g;
> > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > +  tree arg_type = TREE_TYPE (arg);
> > > > +  tree new_rhs;
> > > > +
> > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >=
> > > arg_elt_bits)
> > > > +    return false;
> > > > +
> > > > +  unsigned short target_size;
> > > > +  scalar_mode tmp_cvt_mode;
> > > > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> > > > + tree cvt_type = NULL_TREE;  tmp_cvt_mode = lhs_mode;
> > > > + target_size = GET_MODE_SIZE (rhs_mode);
> > > > +
> > > > +  opt_scalar_mode mode_iter;
> > > > +  enum tree_code tc1, tc2;
> > > > +  unsigned HOST_WIDE_INT nelts
> > > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > +
> > > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > > +    {
> > > > +      tmp_cvt_mode = mode_iter.require ();
> > > > +
> > > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > > +	break;
> > > > +
> > > > +      scalar_mode cvt_mode;
> > > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > > +	break;
> > > > +
> > > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> > > (arg_type);
> > > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > > + isUnsigned);
> > > > +
> > > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > > +      if (cvt_type == NULL_TREE
> > > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > > +					     ret_type,
> > > > +					     cvt_type, &tc1)
> > > > +	  || !supportable_convert_operation ((tree_code) code,
> > > > +					     cvt_type,
> > > > +					     arg_type, &tc2))
> > > > +	continue;
> > > > +
> > > > +      new_rhs = make_ssa_name (cvt_type);
> > > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > > +      gsi_replace (gsi, g, false);
> > > > +      return true;
> > > > +    }
> > > > +  return false;
> > > > +}
> > > > +
> > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > +conversion
> > > for
> > > > +   float <-> int, like char -> double.  */ bool
> > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> > > > +					 enum tree_code code,
> > > > +					 tree lhs,
> > > > +					 tree arg)
> > > > +{
> > > > +  gimple *g;
> > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > +  tree arg_type = TREE_TYPE (arg);
> > > > +  tree new_rhs;
> > > > +
> > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > > > + (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> > > > +    return false;
> > > > +
> > > > +  unsigned short target_size;
> > > > +  scalar_mode tmp_cvt_mode;
> > > > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> > > > + tree cvt_type = NULL_TREE;  target_size = GET_MODE_SIZE
> > > > + (lhs_mode);  int rhs_size = GET_MODE_BITSIZE (rhs_mode);  if
> > > > + (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> > > > +    return false;
> > > > +
> > > > +  opt_scalar_mode mode_iter;
> > > > +  enum tree_code tc1, tc2;
> > > > +  unsigned HOST_WIDE_INT nelts
> > > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > +
> > > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > > +    {
> > > > +      tmp_cvt_mode = mode_iter.require ();
> > > > +
> > > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > > +	break;
> > > > +
> > > > +      scalar_mode cvt_mode;
> > > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > > +	break;
> > > > +
> > > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> > > (arg_type);
> > > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > > + isUnsigned);
> > > > +
> > > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > > +      if (cvt_type == NULL_TREE
> > > > +	  || !supportable_convert_operation ((tree_code) code,
> > > > +					     ret_type,
> > > > +					     cvt_type, &tc1)
> > > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > > +					     cvt_type,
> > > > +					     arg_type, &tc2))
> > > > +	continue;
> > > > +
> > > > +      new_rhs = make_ssa_name (cvt_type);
> > > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > > +      gsi_replace (gsi, g, false);
> > > > +      return true;
> > > > +    }
> > > > +  return false;
> > > > +}
> > > > +
> > >
> > > So the above improve the situation where the target can handle the
> > > two-step conversion.  It doesn't really allow this to work for too
> > > large vectors AFAICS (nor does it try pack/unpack for any of the
> > > conversions).  It also still duplicates code that's in the
> > > vectorizer.  I think you should be able to use
> > > supportable_narrowing_operation and possibly even
> > > supportable_widening_operation (though that needs refatoring to
> > > avoid the vectorizer internal stmt_vec_info type - possibly simply by gating
> the respective code on a non-NULL vinfo).  Both support multi-step conversions.
> > >
> >
> > I tried to use supportable_narrowing_operation and I met two questions:
> >
> > 1) supportable_narrowing_operation support v2df->v16qi, but I don't know
> >    which optab can help me convert v16qi to v2qi.
> 
> It's API is a bit tricky but for v2df -> v2qi (I expect you'll have an equal number of
> lanes in/out for .CONVERT_VECTOR) it likely outputs a multi-step conversion
> where you have to look into *INTERM_TYPES and second-guess the operation
> code to use for the intermediate steps (IIRC the intermediate steps all use either
> PACK/UNPACK or CONVERT, never FLOAT/FIX).
>

I made a mistake in what I said before. I think
supportable_narrowing_operation doesn't support v2df->v2qi; it only uses
VEC_PACK_TRUNC_EXPR in its intermediate steps, which requires vectype_in
and vectype_out to have the same size for it to return true. To make sure
I'm doing the right thing: I would build a tmp_type via
build_nonstandard_integer_type and get_same_sized_vectype, and then use
tree_vec_extract to extract the v2qi from the v16qi after
supportable_narrowing_operation.
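
Something like this rough, untested sketch (the extraction step in
particular is my assumption, not tested code):

  /* For v2df -> v2qi: build an integer vector type with the same total
     size as arg_type to narrow into, then extract the low lanes.  */
  tree int_elt = build_nonstandard_integer_type (BITS_PER_UNIT,
						 TYPE_UNSIGNED (ret_type));
  tree tmp_vectype = get_same_sized_vectype (int_elt, arg_type); /* v16qi */
  /* ... emit the steps supportable_narrowing_operation reports for
     arg_type -> tmp_vectype, giving a value 'narrowed' ...  */
  new_rhs = tree_vec_extract (gsi, ret_type, narrowed,
			      TYPE_SIZE (ret_type), bitsize_zero_node);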

> 
> > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is
> >    not what I expected, because it only use vec_pack_trunc. I expect it
> >    can use vcvttpd2dq + vpmovdw.
> 
> With -O3 -fno-tree-loop-vectorize that's what you get.  What you see is because
> of the restriction of the loop vectorizer to work on a single vector size only.
>

Yes, it works, but the program takes the NONE path (tree-vect-stmts.cc:5357)
instead of the NARROW_DST path (tree-vect-stmts.cc:5545). I think we could
wrap the code from line 5373 to line 5455 into a function. That avoids
reinventing the wheel, and I get the results I'm looking for.

Beyond wrapping that code into a function: if the concern is that our
modifications are not general enough, I think we can call
supportable_narrowing/widening_operation after the current single-step
VEC_CONVERT handling (line 1972 and line 2078), so it tries a single step
first and then falls back to multiple steps. If you agree, I'd like to drop
my changes for indirect conversions for now and keep only the direct
conversions, so that the three current patches can go into trunk first, and
then add the indirect-conversion change later.

BRs,
Lin
 
>
> > If I can solve the first question and the function be better (maybe
> > support trunc<vectype_in><vectype_out>), I'd be happy to use it
> > directly. I prefer my scheme for now. My functions is more like
> > supportable_convert_operation. Perhaps, we can modify
> > supportable_narrowing_operation, but I think it should be another
> > patch, it will influence vectorizer.
> 
> But since you are doing a multi-step conversion this is really what
> supportable_narrowing_operation is about.  I don't think we want to re-invent
> the wheel here.  Likewise your approach won't get you to use
> VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current single-
> step .CONVERT_VECTOR lowering).
> supportable_narrowing_operation also checks for this.
> 
> Richard.
> 
>
> > BRs,
> > Lin
> >
> > >
> > > >  /* Expand VEC_CONVERT ifn call.  */
> > > >
> > > >  static void
> > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion
> > > (gimple_stmt_iterator *gsi)
> > > >    else if (ret_elt_bits > arg_elt_bits)
> > > >      modifier = WIDEN;
> > > >
> > > > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > > +    {
> > > > +      g = gimple_build_assign (lhs, code1, arg);
> > > > +      gsi_replace (gsi, g, false);
> > > > +      return;
> > > > +    }
> > > > +
> > > > +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> > > > +    return;
> > > > +
> > > > +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> > > > +    return;
> > > > +
> > > >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> > > FLOAT_EXPR))
> > > >      {
> > > > -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > > -	{
> > > > -	  g = gimple_build_assign (lhs, code1, arg);
> > > > -	  gsi_replace (gsi, g, false);
> > > > -	  return;
> > > > -	}
> > > >        /* Can't use get_compute_type here, as
> supportable_convert_operation
> > > >  	 doesn't necessarily use an optab and needs two arguments.  */
> > > >        tree vec_compute_type
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
Richard Biener June 3, 2024, 9:02 a.m. UTC | #5
On Mon, 3 Jun 2024, Hu, Lin1 wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Friday, May 31, 2024 8:41 PM
> > To: Hu, Lin1 <lin1.hu@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> > ubizjak@gmail.com
> > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float
> > -> float and int <-> float.
> > 
> > On Fri, 31 May 2024, Hu, Lin1 wrote:
> > 
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Wednesday, May 29, 2024 5:41 PM
> > > > To: Hu, Lin1 <lin1.hu@intel.com>
> > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> > > > ubizjak@gmail.com
> > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for
> > > > int -> int, float
> > > > -> float and int <-> float.
> > > >
> > > > On Thu, 23 May 2024, Hu, Lin1 wrote:
> > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > 	PR target/107432
> > > > > 	* tree-vect-generic.cc
> > > > > 	(supportable_indirect_narrowing_operation): New function for
> > > > > 	support indirect narrowing convert.
> > > > > 	(supportable_indirect_widening_operation): New function for
> > > > > 	support indirect widening convert.
> > > > > 	(expand_vector_conversion): Support convert for int -> int,
> > > > > 	float -> float and int <-> float.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > 	PR target/107432
> > > > > 	* gcc.target/i386/pr107432-1.c: New test.
> > > > > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > > > > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > > > > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > > > > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > > > > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > > > > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > > > > ---
> > > > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> > > > > index
> > > > > ab640096ca2..0bedb53d9f9 100644
> > > > > --- a/gcc/tree-vect-generic.cc
> > > > > +++ b/gcc/tree-vect-generic.cc
> > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not
> > > > > see #include "gimple-match.h"
> > > > >  #include "recog.h"		/* FIXME: for insn_data */
> > > > >  #include "optabs-libfuncs.h"
> > > > > +#include "cfgloop.h"
> > > > > +#include "tree-vectorizer.h"
> > > > >
> > > > >
> > > > >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> > > > >    return gimplify_build2 (gsi, code, outer_type, b, c);  }
> > > > >
> > > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > > +conversion
> > > > for
> > > > > +   float <-> int, like double -> char.  */ bool
> > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> > > > > +					 enum tree_code code,
> > > > > +					 tree lhs,
> > > > > +					 tree arg)
> > > > > +{
> > > > > +  gimple *g;
> > > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > > +  tree arg_type = TREE_TYPE (arg);
> > > > > +  tree new_rhs;
> > > > > +
> > > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > > > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >=
> > > > arg_elt_bits)
> > > > > +    return false;
> > > > > +
> > > > > +  unsigned short target_size;
> > > > > +  scalar_mode tmp_cvt_mode;
> > > > > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> > > > > + tree cvt_type = NULL_TREE;  tmp_cvt_mode = lhs_mode;
> > > > > + target_size = GET_MODE_SIZE (rhs_mode);
> > > > > +
> > > > > +  opt_scalar_mode mode_iter;
> > > > > +  enum tree_code tc1, tc2;
> > > > > +  unsigned HOST_WIDE_INT nelts
> > > > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > > +
> > > > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > > > +    {
> > > > > +      tmp_cvt_mode = mode_iter.require ();
> > > > > +
> > > > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > > > +	break;
> > > > > +
> > > > > +      scalar_mode cvt_mode;
> > > > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > > > +	break;
> > > > > +
> > > > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> > > > (arg_type);
> > > > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > > > + isUnsigned);
> > > > > +
> > > > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > > > +      if (cvt_type == NULL_TREE
> > > > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > > > +					     ret_type,
> > > > > +					     cvt_type, &tc1)
> > > > > +	  || !supportable_convert_operation ((tree_code) code,
> > > > > +					     cvt_type,
> > > > > +					     arg_type, &tc2))
> > > > > +	continue;
> > > > > +
> > > > > +      new_rhs = make_ssa_name (cvt_type);
> > > > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > > > +      gsi_replace (gsi, g, false);
> > > > > +      return true;
> > > > > +    }
> > > > > +  return false;
> > > > > +}
> > > > > +
> > > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > > +conversion
> > > > for
> > > > > +   float <-> int, like char -> double.  */ bool
> > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> > > > > +					 enum tree_code code,
> > > > > +					 tree lhs,
> > > > > +					 tree arg)
> > > > > +{
> > > > > +  gimple *g;
> > > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > > +  tree arg_type = TREE_TYPE (arg);
> > > > > +  tree new_rhs;
> > > > > +
> > > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > > > > + (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> > > > > +    return false;
> > > > > +
> > > > > +  unsigned short target_size;
> > > > > +  scalar_mode tmp_cvt_mode;
> > > > > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> > > > > + tree cvt_type = NULL_TREE;  target_size = GET_MODE_SIZE
> > > > > + (lhs_mode);  int rhs_size = GET_MODE_BITSIZE (rhs_mode);  if
> > > > > + (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> > > > > +    return false;
> > > > > +
> > > > > +  opt_scalar_mode mode_iter;
> > > > > +  enum tree_code tc1, tc2;
> > > > > +  unsigned HOST_WIDE_INT nelts
> > > > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > > +
> > > > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > > > +    {
> > > > > +      tmp_cvt_mode = mode_iter.require ();
> > > > > +
> > > > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > > > +	break;
> > > > > +
> > > > > +      scalar_mode cvt_mode;
> > > > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > > > +	break;
> > > > > +
> > > > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> > > > (arg_type);
> > > > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > > > + isUnsigned);
> > > > > +
> > > > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > > > +      if (cvt_type == NULL_TREE
> > > > > +	  || !supportable_convert_operation ((tree_code) code,
> > > > > +					     ret_type,
> > > > > +					     cvt_type, &tc1)
> > > > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > > > +					     cvt_type,
> > > > > +					     arg_type, &tc2))
> > > > > +	continue;
> > > > > +
> > > > > +      new_rhs = make_ssa_name (cvt_type);
> > > > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > > > +      gsi_replace (gsi, g, false);
> > > > > +      return true;
> > > > > +    }
> > > > > +  return false;
> > > > > +}
> > > > > +
> > > >
> > > > So the above improve the situation where the target can handle the
> > > > two-step conversion.  It doesn't really allow this to work for too
> > > > large vectors AFAICS (nor does it try pack/unpack for any of the
> > > > conversions).  It also still duplicates code that's in the
> > > > vectorizer.  I think you should be able to use
> > > > supportable_narrowing_operation and possibly even
> > > > supportable_widening_operation (though that needs refatoring to
> > > > avoid the vectorizer internal stmt_vec_info type - possibly simply by gating
> > the respective code on a non-NULL vinfo).  Both support multi-step conversions.
> > > >
> > >
> > > I tried to use supportable_narrowing_operation and I met two questions:
> > >
> > > 1) supportable_narrowing_operation support v2df->v16qi, but I don't know
> > >    which optab can help me convert v16qi to v2qi.
> > 
> > It's API is a bit tricky but for v2df -> v2qi (I expect you'll have an equal number of
> > lanes in/out for .CONVERT_VECTOR) it likely outputs a multi-step conversion
> > where you have to look into *INTERM_TYPES and second-guess the operation
> > code to use for the intermediate steps (IIRC the intermediate steps all use either
> > PACK/UNPACK or CONVERT, never FLOAT/FIX).
> >
> 
> I made a mistake in what I said before. I think 
> supportable_narrowing_operation doesn't support v2df->v2qi, it only use 
> VEC_PACK_TRUNC_EXPRT in its intermediate steps. This makes it require 
> that vectype_in and vectype_out have the same size to return true. I 
> want to make sure I'm doing the right thing, I can build a tmp_type by 
> build_nonstandard_integer_type and get_same_sized_vectype. And use 
> tree_vec_extract to extract v2qi from v16qi after 
> supportable_narrowing_operation.

Yes.  It looks like the vectorizer, when the vector types' number of
lanes agree, goes down the 'NONE' conversion path, checks
supportable_convert_operation and then has open-coded handling for

      /* For conversions between float and integer types try whether
         we can use intermediate signed integer types to support the
         conversion.  */

that means I was wrong in indicating supportable_narrowing_operation
was for element narrowing; it is for number-of-lanes "narrowing".

That said, vectorizable_conversion, in the NONE case, has handling
that should be split out into a function that is also usable from
vector lowering, so that both vectorization and lowering
handle the same cases.  The interface would be similar to
supportable_narrowing_operation.
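
Just to sketch the shape (the name and exact parameters are of course
hypothetical and up for discussion):

  /* Like supportable_narrowing_operation, but for the NONE-case
     float<->int handling: return true if the conversion from
     VECTYPE_IN to VECTYPE_OUT can be done via an intermediate integer
     vector type, and tell the caller which codes and which
     intermediate type to use.  */
  bool
  supportable_indirect_convert_operation (enum tree_code code,
					  tree vectype_out, tree vectype_in,
					  enum tree_code *code1,
					  enum tree_code *code2,
					  tree *intermediate_type);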

> > 
> > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is
> > >    not what I expected, because it only use vec_pack_trunc. I expect it
> > >    can use vcvttpd2dq + vpmovdw.
> > 
> > With -O3 -fno-tree-loop-vectorize that's what you get.  What you see is because
> > of the restriction of the loop vectorizer to work on a single vector size only.
> >
> 
> Yes, it works, but the program runs the NONE part 
> (tree-vect-stmts.cc:5357) instead of the NARROW_DST part 
> (tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the 
> code from line:5373 to line:5455 as a function. This avoids duplicating 
> the wheel, and I get the results I'm looking for.

Yeah.

> In addition to wrapping the function. If you are motivated by the fact 
> that our modifications are not generalized enough, I think we can add 
> supportable_narrowing/widening_operation after the current single step 
> VEC_CONVERT (line 1972 and line 2078). It should try to use a single 
> step and then use multiple steps. If you agree, I'd like to remove my 
> changes about indirect conversions for now, and keep only the direct 
> conversions, so that I can merge the three current patches into the 
> trunk first, and then add the change about indirect conversions later.

I think it should work by finding the largest compute_vectype pair
(source/destination) that we can handle either directly or indirectly
via the new function.

Richard.

> BRs,
> Lin
>  
> >
> > > If I can solve the first question and the function be better (maybe
> > > support trunc<vectype_in><vectype_out>), I'd be happy to use it
> > > directly. I prefer my scheme for now. My functions is more like
> > > supportable_convert_operation. Perhaps, we can modify
> > > supportable_narrowing_operation, but I think it should be another
> > > patch, it will influence vectorizer.
> > 
> > But since you are doing a multi-step conversion this is really what
> > supportable_narrowing_operation is about.  I don't think we want to re-invent
> > the wheel here.  Likewise your approach won't get you to use
> > VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current single-
> > step .CONVERT_VECTOR lowering).
> > supportable_narrowing_operation also checks for this.
> > 
> > Richard.
> > 
> >
> > > BRs,
> > > Lin
> > >
> > > >
> > > > >  /* Expand VEC_CONVERT ifn call.  */
> > > > >
> > > > >  static void
> > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion
> > > > (gimple_stmt_iterator *gsi)
> > > > >    else if (ret_elt_bits > arg_elt_bits)
> > > > >      modifier = WIDEN;
> > > > >
> > > > > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > > > +    {
> > > > > +      g = gimple_build_assign (lhs, code1, arg);
> > > > > +      gsi_replace (gsi, g, false);
> > > > > +      return;
> > > > > +    }
> > > > > +
> > > > > +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> > > > > +    return;
> > > > > +
> > > > > +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> > > > > +    return;
> > > > > +
> > > > >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> > > > FLOAT_EXPR))
> > > > >      {
> > > > > -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > > > -	{
> > > > > -	  g = gimple_build_assign (lhs, code1, arg);
> > > > > -	  gsi_replace (gsi, g, false);
> > > > > -	  return;
> > > > > -	}
> > > > >        /* Can't use get_compute_type here, as
> > supportable_convert_operation
> > > > >  	 doesn't necessarily use an optab and needs two arguments.  */
> > > > >        tree vec_compute_type
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de>
> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>
Hu, Lin1 June 3, 2024, 9:26 a.m. UTC | #6
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Monday, June 3, 2024 5:03 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> ubizjak@gmail.com
> Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float
> -> float and int <-> float.
> 
> On Mon, 3 Jun 2024, Hu, Lin1 wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Friday, May 31, 2024 8:41 PM
> > > To: Hu, Lin1 <lin1.hu@intel.com>
> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> > > ubizjak@gmail.com
> > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for
> > > int -> int, float
> > > -> float and int <-> float.
> > >
> > > On Fri, 31 May 2024, Hu, Lin1 wrote:
> > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <rguenther@suse.de>
> > > > > Sent: Wednesday, May 29, 2024 5:41 PM
> > > > > To: Hu, Lin1 <lin1.hu@intel.com>
> > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao
> > > > > <hongtao.liu@intel.com>; ubizjak@gmail.com
> > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn
> > > > > for int -> int, float
> > > > > -> float and int <-> float.
> > > > >
> > > > > On Thu, 23 May 2024, Hu, Lin1 wrote:
> > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > 	PR target/107432
> > > > > > 	* tree-vect-generic.cc
> > > > > > 	(supportable_indirect_narrowing_operation): New function for
> > > > > > 	support indirect narrowing convert.
> > > > > > 	(supportable_indirect_widening_operation): New function for
> > > > > > 	support indirect widening convert.
> > > > > > 	(expand_vector_conversion): Support convert for int -> int,
> > > > > > 	float -> float and int <-> float.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > 	PR target/107432
> > > > > > 	* gcc.target/i386/pr107432-1.c: New test.
> > > > > > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > > > > > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > > > > > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > > > > > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > > > > > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > > > > > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > > > > > ---
> > > > > > diff --git a/gcc/tree-vect-generic.cc
> > > > > > b/gcc/tree-vect-generic.cc index
> > > > > > ab640096ca2..0bedb53d9f9 100644
> > > > > > --- a/gcc/tree-vect-generic.cc
> > > > > > +++ b/gcc/tree-vect-generic.cc
> > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If
> > > > > > not see #include "gimple-match.h"
> > > > > >  #include "recog.h"		/* FIXME: for insn_data */
> > > > > >  #include "optabs-libfuncs.h"
> > > > > > +#include "cfgloop.h"
> > > > > > +#include "tree-vectorizer.h"
> > > > > >
> > > > > >
> > > > > >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> > > > > >    return gimplify_build2 (gsi, code, outer_type, b, c);  }
> > > > > >
> > > > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > > > +conversion
> > > > > for
> > > > > > +   float <-> int, like double -> char.  */ bool
> > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> > > > > > +					 enum tree_code code,
> > > > > > +					 tree lhs,
> > > > > > +					 tree arg)
> > > > > > +{
> > > > > > +  gimple *g;
> > > > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > > > +  tree arg_type = TREE_TYPE (arg);
> > > > > > +  tree new_rhs;
> > > > > > +
> > > > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);
> > > > > > + if (code != FIX_TRUNC_EXPR || flag_trapping_math ||
> > > > > > + ret_elt_bits >=
> > > > > arg_elt_bits)
> > > > > > +    return false;
> > > > > > +
> > > > > > +  unsigned short target_size;  scalar_mode tmp_cvt_mode;
> > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE
> > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER
> > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE;
> > > > > > + tmp_cvt_mode = lhs_mode; target_size = GET_MODE_SIZE
> > > > > > + (rhs_mode);
> > > > > > +
> > > > > > +  opt_scalar_mode mode_iter;
> > > > > > +  enum tree_code tc1, tc2;
> > > > > > +  unsigned HOST_WIDE_INT nelts
> > > > > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > > > +
> > > > > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > > > > +    {
> > > > > > +      tmp_cvt_mode = mode_iter.require ();
> > > > > > +
> > > > > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > > > > +	break;
> > > > > > +
> > > > > > +      scalar_mode cvt_mode;
> > > > > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > > > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > > > > +	break;
> > > > > > +
> > > > > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > > > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) ||
> > > > > > + TYPE_UNSIGNED
> > > > > (arg_type);
> > > > > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > > > > + isUnsigned);
> > > > > > +
> > > > > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > > > > +      if (cvt_type == NULL_TREE
> > > > > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > > > > +					     ret_type,
> > > > > > +					     cvt_type, &tc1)
> > > > > > +	  || !supportable_convert_operation ((tree_code) code,
> > > > > > +					     cvt_type,
> > > > > > +					     arg_type, &tc2))
> > > > > > +	continue;
> > > > > > +
> > > > > > +      new_rhs = make_ssa_name (cvt_type);
> > > > > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > > > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > > > > +      gsi_replace (gsi, g, false);
> > > > > > +      return true;
> > > > > > +    }
> > > > > > +  return false;
> > > > > > +}
> > > > > > +
> > > > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > > > +conversion
> > > > > for
> > > > > > +   float <-> int, like char -> double.  */ bool
> > > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> > > > > > +					 enum tree_code code,
> > > > > > +					 tree lhs,
> > > > > > +					 tree arg)
> > > > > > +{
> > > > > > +  gimple *g;
> > > > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > > > +  tree arg_type = TREE_TYPE (arg);
> > > > > > +  tree new_rhs;
> > > > > > +
> > > > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);
> > > > > > + if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> > > > > > +    return false;
> > > > > > +
> > > > > > +  unsigned short target_size;  scalar_mode tmp_cvt_mode;
> > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE
> > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER
> > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE;
> > > > > > + target_size = GET_MODE_SIZE (lhs_mode);  int rhs_size =
> > > > > > + GET_MODE_BITSIZE (rhs_mode);  if (!int_mode_for_size (rhs_size,
> 0).exists (&tmp_cvt_mode))
> > > > > > +    return false;
> > > > > > +
> > > > > > +  opt_scalar_mode mode_iter;
> > > > > > +  enum tree_code tc1, tc2;
> > > > > > +  unsigned HOST_WIDE_INT nelts
> > > > > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > > > +
> > > > > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > > > > +    {
> > > > > > +      tmp_cvt_mode = mode_iter.require ();
> > > > > > +
> > > > > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > > > > +	break;
> > > > > > +
> > > > > > +      scalar_mode cvt_mode;
> > > > > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > > > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > > > > +	break;
> > > > > > +
> > > > > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > > > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) ||
> > > > > > + TYPE_UNSIGNED
> > > > > (arg_type);
> > > > > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > > > > + isUnsigned);
> > > > > > +
> > > > > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > > > > +      if (cvt_type == NULL_TREE
> > > > > > +	  || !supportable_convert_operation ((tree_code) code,
> > > > > > +					     ret_type,
> > > > > > +					     cvt_type, &tc1)
> > > > > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > > > > +					     cvt_type,
> > > > > > +					     arg_type, &tc2))
> > > > > > +	continue;
> > > > > > +
> > > > > > +      new_rhs = make_ssa_name (cvt_type);
> > > > > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > > > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > > > > +      gsi_replace (gsi, g, false);
> > > > > > +      return true;
> > > > > > +    }
> > > > > > +  return false;
> > > > > > +}
> > > > > > +
> > > > >
> > > > > So the above improve the situation where the target can handle
> > > > > the two-step conversion.  It doesn't really allow this to work
> > > > > for too large vectors AFAICS (nor does it try pack/unpack for
> > > > > any of the conversions).  It also still duplicates code that's
> > > > > in the vectorizer.  I think you should be able to use
> > > > > supportable_narrowing_operation and possibly even
> > > > > supportable_widening_operation (though that needs refatoring to
> > > > > avoid the vectorizer internal stmt_vec_info type - possibly
> > > > > simply by gating
> > > the respective code on a non-NULL vinfo).  Both support multi-step
> conversions.
> > > > >
> > > >
> > > > I tried to use supportable_narrowing_operation and I met two questions:
> > > >
> > > > 1) supportable_narrowing_operation support v2df->v16qi, but I don't know
> > > >    which optab can help me convert v16qi to v2qi.
> > >
> > > It's API is a bit tricky but for v2df -> v2qi (I expect you'll have
> > > an equal number of lanes in/out for .CONVERT_VECTOR) it likely
> > > outputs a multi-step conversion where you have to look into
> > > *INTERM_TYPES and second-guess the operation code to use for the
> > > intermediate steps (IIRC the intermediate steps all use either PACK/UNPACK
> or CONVERT, never FLOAT/FIX).
> > >
> >
> > I made a mistake in what I said before. I think
> > supportable_narrowing_operation doesn't support v2df->v2qi, it only
> > use VEC_PACK_TRUNC_EXPRT in its intermediate steps. This makes it
> > require that vectype_in and vectype_out have the same size to return
> > true. I want to make sure I'm doing the right thing, I can build a
> > tmp_type by build_nonstandard_integer_type and get_same_sized_vectype.
> > And use tree_vec_extract to extract v2qi from v16qi after
> > supportable_narrowing_operation.
> 
> Yes.  It looks like the vectorizer, when the vector types number of lanes agree
> goes the 'NONE' conversion path, checks supportable_convert_operation and
> then has open-coded handling for
> 
>       /* For conversions between float and integer types try whether
>          we can use intermediate signed integer types to support the
>          conversion.  */
> 
> that means I was wrong in indicating supportable_narrowing_operation was for
> element narrowing, it is for number-of-lane "narrowing".
> 
> That said, vectorizable_conversion, in the NONE case has handling that should
> be split out into a function that's usable also from vector lowering then so that
> both vectorization and lowering handle the same cases.  The interface would be
> similar to supportable_narrowing_operation.
> 
> > >
> > > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is
> > > >    not what I expected, because it only use vec_pack_trunc. I expect it
> > > >    can use vcvttpd2dq + vpmovdw.
> > >
> > > With -O3 -fno-tree-loop-vectorize that's what you get.  What you see
> > > is because of the restriction of the loop vectorizer to work on a single vector
> size only.
> > >
> >
> > Yes, it works, but the program runs the NONE part
> > (tree-vect-stmts.cc:5357) instead of the NARROW_DST part
> > (tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the
> > code from line:5373 to line:5455 as a function. This avoids
> > duplicating the wheel, and I get the results I'm looking for.
> 
> Yeah.
> 
> > In addition to wrapping the function. If you are motivated by the fact
> > that our modifications are not generalized enough, I think we can add
> > supportable_narrowing/widening_operation after the current single step
> > VEC_CONVERT (line 1972 and line 2078). It should try to use a single
> > step and then use multiple steps. If you agree, I'd like to remove my
> > changes about indirect conversions for now, and keep only the direct
> > conversions, so that I can merge the three current patches into the
> > trunk first, and then add the change about indirect conversions later.
> 
> I think it should go like finding the largest compute_vectype pair
> (source/destination) that we can handle either directly or indirectly via the new
> function.
> 
> Richard.
> 

Thanks, I will wrap the code into the new function and send out a new
version of this patch. I have a small question: what does "finding the
largest compute_vectype pair" mean? Is there some existing piece of code in
GCC that does this?

BRs,
Lin

>
> >
> > >
> > > > If I can solve the first question and the function be better
> > > > (maybe support trunc<vectype_in><vectype_out>), I'd be happy to
> > > > use it directly. I prefer my scheme for now. My functions is more
> > > > like supportable_convert_operation. Perhaps, we can modify
> > > > supportable_narrowing_operation, but I think it should be another
> > > > patch, it will influence vectorizer.
> > >
> > > But since you are doing a multi-step conversion this is really what
> > > supportable_narrowing_operation is about.  I don't think we want to
> > > re-invent the wheel here.  Likewise your approach won't get you to
> > > use VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current
> > > single- step .CONVERT_VECTOR lowering).
> > > supportable_narrowing_operation also checks for this.
> > >
> > > Richard.
> > >
> > >
> > > > BRs,
> > > > Lin
> > > >
> > > > >
> > > > > >  /* Expand VEC_CONVERT ifn call.  */
> > > > > >
> > > > > >  static void
> > > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion
> > > > > (gimple_stmt_iterator *gsi)
> > > > > >    else if (ret_elt_bits > arg_elt_bits)
> > > > > >      modifier = WIDEN;
> > > > > >
> > > > > > +  if (supportable_convert_operation (code, ret_type, arg_type,
> &code1))
> > > > > > +    {
> > > > > > +      g = gimple_build_assign (lhs, code1, arg);
> > > > > > +      gsi_replace (gsi, g, false);
> > > > > > +      return;
> > > > > > +    }
> > > > > > +
> > > > > > +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> > > > > > +    return;
> > > > > > +
> > > > > > +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> > > > > > +    return;
> > > > > > +
> > > > > >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> > > > > FLOAT_EXPR))
> > > > > >      {
> > > > > > -      if (supportable_convert_operation (code, ret_type, arg_type,
> &code1))
> > > > > > -	{
> > > > > > -	  g = gimple_build_assign (lhs, code1, arg);
> > > > > > -	  gsi_replace (gsi, g, false);
> > > > > > -	  return;
> > > > > > -	}
> > > > > >        /* Can't use get_compute_type here, as
> > > supportable_convert_operation
> > > > > >  	 doesn't necessarily use an optab and needs two arguments.  */
> > > > > >        tree vec_compute_type
> > > > > >
> > > > >
> > > > > --
> > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > Nuernberg)
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
Richard Biener June 3, 2024, 9:30 a.m. UTC | #7
On Mon, 3 Jun 2024, Hu, Lin1 wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Monday, June 3, 2024 5:03 PM
> > To: Hu, Lin1 <lin1.hu@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> > ubizjak@gmail.com
> > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float
> > -> float and int <-> float.
> > 
> > On Mon, 3 Jun 2024, Hu, Lin1 wrote:
> > 
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Friday, May 31, 2024 8:41 PM
> > > > To: Hu, Lin1 <lin1.hu@intel.com>
> > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> > > > ubizjak@gmail.com
> > > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for
> > > > int -> int, float
> > > > -> float and int <-> float.
> > > >
> > > > On Fri, 31 May 2024, Hu, Lin1 wrote:
> > > >
> > > > > > -----Original Message-----
> > > > > > From: Richard Biener <rguenther@suse.de>
> > > > > > Sent: Wednesday, May 29, 2024 5:41 PM
> > > > > > To: Hu, Lin1 <lin1.hu@intel.com>
> > > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao
> > > > > > <hongtao.liu@intel.com>; ubizjak@gmail.com
> > > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn
> > > > > > for int -> int, float
> > > > > > -> float and int <-> float.
> > > > > >
> > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote:
> > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > > 	PR target/107432
> > > > > > > 	* tree-vect-generic.cc
> > > > > > > 	(supportable_indirect_narrowing_operation): New function for
> > > > > > > 	support indirect narrowing convert.
> > > > > > > 	(supportable_indirect_widening_operation): New function for
> > > > > > > 	support indirect widening convert.
> > > > > > > 	(expand_vector_conversion): Support convert for int -> int,
> > > > > > > 	float -> float and int <-> float.
> > > > > > >
> > > > > > > gcc/testsuite/ChangeLog:
> > > > > > >
> > > > > > > 	PR target/107432
> > > > > > > 	* gcc.target/i386/pr107432-1.c: New test.
> > > > > > > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > > > > > > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > > > > > > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > > > > > > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > > > > > > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > > > > > > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > > > > > > ---
> > > > > > > diff --git a/gcc/tree-vect-generic.cc
> > > > > > > b/gcc/tree-vect-generic.cc index
> > > > > > > ab640096ca2..0bedb53d9f9 100644
> > > > > > > --- a/gcc/tree-vect-generic.cc
> > > > > > > +++ b/gcc/tree-vect-generic.cc
> > > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If
> > > > > > > not see #include "gimple-match.h"
> > > > > > >  #include "recog.h"		/* FIXME: for insn_data */
> > > > > > >  #include "optabs-libfuncs.h"
> > > > > > > +#include "cfgloop.h"
> > > > > > > +#include "tree-vectorizer.h"
> > > > > > >
> > > > > > >
> > > > > > >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> > > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> > > > > > >    return gimplify_build2 (gsi, code, outer_type, b, c);  }
> > > > > > >
> > > > > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > > > > +conversion
> > > > > > for
> > > > > > > +   float <-> int, like double -> char.  */ bool
> > > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> > > > > > > +					 enum tree_code code,
> > > > > > > +					 tree lhs,
> > > > > > > +					 tree arg)
> > > > > > > +{
> > > > > > > +  gimple *g;
> > > > > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > > > > +  tree arg_type = TREE_TYPE (arg);
> > > > > > > +  tree new_rhs;
> > > > > > > +
> > > > > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);
> > > > > > > + if (code != FIX_TRUNC_EXPR || flag_trapping_math ||
> > > > > > > + ret_elt_bits >=
> > > > > > arg_elt_bits)
> > > > > > > +    return false;
> > > > > > > +
> > > > > > > +  unsigned short target_size;  scalar_mode tmp_cvt_mode;
> > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE
> > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER
> > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE;
> > > > > > > + tmp_cvt_mode = lhs_mode; target_size = GET_MODE_SIZE
> > > > > > > + (rhs_mode);
> > > > > > > +
> > > > > > > +  opt_scalar_mode mode_iter;
> > > > > > > +  enum tree_code tc1, tc2;
> > > > > > > +  unsigned HOST_WIDE_INT nelts
> > > > > > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > > > > +
> > > > > > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > > > > > +    {
> > > > > > > +      tmp_cvt_mode = mode_iter.require ();
> > > > > > > +
> > > > > > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > > > > > +	break;
> > > > > > > +
> > > > > > > +      scalar_mode cvt_mode;
> > > > > > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > > > > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > > > > > +	break;
> > > > > > > +
> > > > > > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > > > > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) ||
> > > > > > > + TYPE_UNSIGNED
> > > > > > (arg_type);
> > > > > > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > > > > > + isUnsigned);
> > > > > > > +
> > > > > > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > > > > > +      if (cvt_type == NULL_TREE
> > > > > > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > > > > > +					     ret_type,
> > > > > > > +					     cvt_type, &tc1)
> > > > > > > +	  || !supportable_convert_operation ((tree_code) code,
> > > > > > > +					     cvt_type,
> > > > > > > +					     arg_type, &tc2))
> > > > > > > +	continue;
> > > > > > > +
> > > > > > > +      new_rhs = make_ssa_name (cvt_type);
> > > > > > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > > > > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > > > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > > > > > +      gsi_replace (gsi, g, false);
> > > > > > > +      return true;
> > > > > > > +    }
> > > > > > > +  return false;
> > > > > > > +}
> > > > > > > +
> > > > > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > > > > +conversion
> > > > > > for
> > > > > > > +   float <-> int, like char -> double.  */ bool
> > > > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> > > > > > > +					 enum tree_code code,
> > > > > > > +					 tree lhs,
> > > > > > > +					 tree arg)
> > > > > > > +{
> > > > > > > +  gimple *g;
> > > > > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > > > > +  tree arg_type = TREE_TYPE (arg);
> > > > > > > +  tree new_rhs;
> > > > > > > +
> > > > > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);
> > > > > > > + if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> > > > > > > +    return false;
> > > > > > > +
> > > > > > > +  unsigned short target_size;  scalar_mode tmp_cvt_mode;
> > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE
> > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER
> > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE;
> > > > > > > + target_size = GET_MODE_SIZE (lhs_mode);  int rhs_size =
> > > > > > > + GET_MODE_BITSIZE (rhs_mode);  if (!int_mode_for_size (rhs_size,
> > 0).exists (&tmp_cvt_mode))
> > > > > > > +    return false;
> > > > > > > +
> > > > > > > +  opt_scalar_mode mode_iter;
> > > > > > > +  enum tree_code tc1, tc2;
> > > > > > > +  unsigned HOST_WIDE_INT nelts
> > > > > > > +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > > > > +
> > > > > > > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > > > > > > +    {
> > > > > > > +      tmp_cvt_mode = mode_iter.require ();
> > > > > > > +
> > > > > > > +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > > > > > > +	break;
> > > > > > > +
> > > > > > > +      scalar_mode cvt_mode;
> > > > > > > +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > > > > > +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > > > > > > +	break;
> > > > > > > +
> > > > > > > +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > > > > > > +      bool isUnsigned = TYPE_UNSIGNED (ret_type) ||
> > > > > > > + TYPE_UNSIGNED
> > > > > > (arg_type);
> > > > > > > +      cvt_type = build_nonstandard_integer_type (cvt_size,
> > > > > > > + isUnsigned);
> > > > > > > +
> > > > > > > +      cvt_type = build_vector_type (cvt_type, nelts);
> > > > > > > +      if (cvt_type == NULL_TREE
> > > > > > > +	  || !supportable_convert_operation ((tree_code) code,
> > > > > > > +					     ret_type,
> > > > > > > +					     cvt_type, &tc1)
> > > > > > > +	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > > > > > +					     cvt_type,
> > > > > > > +					     arg_type, &tc2))
> > > > > > > +	continue;
> > > > > > > +
> > > > > > > +      new_rhs = make_ssa_name (cvt_type);
> > > > > > > +      g = vect_gimple_build (new_rhs, tc2, arg);
> > > > > > > +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > > > > +      g = gimple_build_assign (lhs, tc1, new_rhs);
> > > > > > > +      gsi_replace (gsi, g, false);
> > > > > > > +      return true;
> > > > > > > +    }
> > > > > > > +  return false;
> > > > > > > +}
> > > > > > > +
> > > > > >
> > > > > > So the above improves the situation where the target can handle
> > > > > > the two-step conversion.  It doesn't really allow this to work
> > > > > > for too large vectors AFAICS (nor does it try pack/unpack for
> > > > > > any of the conversions).  It also still duplicates code that's
> > > > > > in the vectorizer.  I think you should be able to use
> > > > > > supportable_narrowing_operation and possibly even
> > > > > > supportable_widening_operation (though that needs refactoring to
> > > > > > avoid the vectorizer-internal stmt_vec_info type - possibly
> > > > > > simply by gating the respective code on a non-NULL vinfo).
> > > > > > Both support multi-step conversions.
> > > > > >
> > > > >
> > > > > I tried to use supportable_narrowing_operation and ran into two questions:
> > > > >
> > > > > 1) supportable_narrowing_operation supports v2df->v16qi, but I don't know
> > > > >    which optab can help me convert v16qi to v2qi.
> > > >
> > > > Its API is a bit tricky but for v2df -> v2qi (I expect you'll have
> > > > an equal number of lanes in/out for .CONVERT_VECTOR) it likely
> > > > outputs a multi-step conversion where you have to look into
> > > > *INTERM_TYPES and second-guess the operation code to use for the
> > > > intermediate steps (IIRC the intermediate steps all use either
> > > > PACK/UNPACK or CONVERT, never FLOAT/FIX).
> > > >
> > >
> > > I made a mistake in what I said before. I think
> > > supportable_narrowing_operation doesn't support v2df->v2qi; it only
> > > uses VEC_PACK_TRUNC_EXPR in its intermediate steps, which requires
> > > vectype_in and vectype_out to have the same size for it to return
> > > true. I want to make sure I'm doing the right thing: I can build a
> > > tmp_type via build_nonstandard_integer_type and get_same_sized_vectype,
> > > and use tree_vec_extract to extract a v2qi from the v16qi after
> > > supportable_narrowing_operation.
> > 
> > Yes.  It looks like the vectorizer, when the vector types' number of lanes
> > agree, goes down the 'NONE' conversion path, checks
> > supportable_convert_operation and then has open-coded handling for
> > 
> >       /* For conversions between float and integer types try whether
> >          we can use intermediate signed integer types to support the
> >          conversion.  */
> > 
> > That means I was wrong in indicating supportable_narrowing_operation was for
> > element narrowing; it is for number-of-lane "narrowing".
> > 
> > That said, vectorizable_conversion, in the NONE case, has handling that should
> > be split out into a function that's also usable from vector lowering, so that
> > both vectorization and lowering handle the same cases.  The interface would be
> > similar to supportable_narrowing_operation.
> > 
> > > >
> > > > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e); the result is
> > > > >    not what I expected, because it only uses vec_pack_trunc. I expect it
> > > > >    could use vcvttpd2dq + vpmovdw.
> > > >
> > > > With -O3 -fno-tree-loop-vectorize that's what you get.  What you see
> > > > is because of the restriction of the loop vectorizer to work on a
> > > > single vector size only.
> > > >
> > >
> > > Yes, it works, but the code takes the NONE path
> > > (tree-vect-stmts.cc:5357) instead of the NARROW_DST path
> > > (tree-vect-stmts.cc:5545). I think we can wrap the code from
> > > line 5373 to line 5455 into a function. This avoids reinventing
> > > the wheel, and I get the results I'm looking for.
> > 
> > Yeah.
> > 
> > > In addition to wrapping the code into a function: if your concern is
> > > that our modifications are not general enough, I think we can add
> > > supportable_narrowing/widening_operation after the current single-step
> > > VEC_CONVERT handling (line 1972 and line 2078), so it first tries a
> > > single step and then falls back to multiple steps. If you agree, I'd
> > > like to drop my changes for indirect conversions for now and keep only
> > > the direct conversions, so that I can merge the three current patches
> > > into trunk first, and then add the indirect-conversion change later.
> > 
> > I think it should go like finding the largest compute_vectype pair
> > (source/destination) that we can handle either directly or indirectly via the new
> > function.
> > 
> > Richard.
> > 
> 
> Thanks, I will wrap the code into the new function and send out a new
> version of this patch. I have a small question: what does "finding the
> largest compute_vectype pair" mean? Is it some piece of code from GCC?

No, I mean what vector lowering does for .VEC_CONVERT right now,
it uses

      /* Can't use get_compute_type here, as supportable_convert_operation
         doesn't necessarily use an optab and needs two arguments.  */
      tree vec_compute_type
        = type_for_widest_vector_mode (arg_type, mov_optab);

or

      if (optab1)
        compute_type = get_compute_type (code1, optab1, arg_type);

and then expand_vector_piecewise to emit code for say V4SF -> V4QI
from V16SF -> V16QI .VEC_CONVERT.
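
As an illustration of that case, a minimal source-level sketch (the
typedefs here are assumptions for the example, not taken from the patch):

  typedef float v16sf __attribute__ ((__vector_size__ (64)));
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  v16qi
  cvt_v16sf_v16qi (v16sf a)
  {
    return __builtin_convertvector (a, v16qi);
  }

Lowering can then use expand_vector_piecewise to emit the conversion as
several narrower pieces (e.g. V4SF -> V4QI) as described above.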

Richard.

> BRs,
> Lin
> 
> >
> > >
> > > >
> > > > > If I can solve the first question and the function is improved
> > > > > (maybe to support trunc<vectype_in><vectype_out>), I'd be happy to
> > > > > use it directly. I prefer my scheme for now. My functions are more
> > > > > like supportable_convert_operation. Perhaps we can modify
> > > > > supportable_narrowing_operation, but I think that should be a separate
> > > > > patch, since it would affect the vectorizer.
> > > >
> > > > But since you are doing a multi-step conversion this is really what
> > > > supportable_narrowing_operation is about.  I don't think we want to
> > > > re-invent the wheel here.  Likewise your approach won't get you to
> > > > use VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current
> > > > single-step .CONVERT_VECTOR lowering).
> > > > supportable_narrowing_operation also checks for this.
> > > >
> > > > Richard.
> > > >
> > > >
> > > > > BRs,
> > > > > Lin
> > > > >
> > > > > >
> > > > > > >  /* Expand VEC_CONVERT ifn call.  */
> > > > > > >
> > > > > > >  static void
> > > > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion
> > > > > > (gimple_stmt_iterator *gsi)
> > > > > > >    else if (ret_elt_bits > arg_elt_bits)
> > > > > > >      modifier = WIDEN;
> > > > > > >
> > > > > > > +  if (supportable_convert_operation (code, ret_type, arg_type,
> > &code1))
> > > > > > > +    {
> > > > > > > +      g = gimple_build_assign (lhs, code1, arg);
> > > > > > > +      gsi_replace (gsi, g, false);
> > > > > > > +      return;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> > > > > > > +    return;
> > > > > > > +
> > > > > > > +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> > > > > > > +    return;
> > > > > > > +
> > > > > > >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> > > > > > FLOAT_EXPR))
> > > > > > >      {
> > > > > > > -      if (supportable_convert_operation (code, ret_type, arg_type,
> > &code1))
> > > > > > > -	{
> > > > > > > -	  g = gimple_build_assign (lhs, code1, arg);
> > > > > > > -	  gsi_replace (gsi, g, false);
> > > > > > > -	  return;
> > > > > > > -	}
> > > > > > >        /* Can't use get_compute_type here, as
> > > > supportable_convert_operation
> > > > > > >  	 doesn't necessarily use an optab and needs two arguments.  */
> > > > > > >        tree vec_compute_type
> > > > > > >
> > > > > >
> > > > > > --
> > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > > Nuernberg)
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de>
> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 00000000000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2si);
+}
+
+__m128i	mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
+}
+
+__m256i	mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
+}
+
+__v2hi	mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi);
+}
+
+__v4hi	mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi);
+}
+
+__m128i	mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
+}
+
+__v2qi	mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi);
+}
+
+__v4qi	mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4qi);
+}
+
+__v8qi	mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8di)a, __v8qi);
+}
+
+__v2hi	mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2hi);
+}
+
+__v4hi	mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4hi);
+}
+
+__m128i	mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v8si)a, __v8hi);
+}
+
+__m256i	mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v16si)a, __v16hi);
+}
+
+__v2qi	mm64_cvtepi32_epi8_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2qi);
+}
+
+__v4qi	mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4qi);
+}
+
+__v8qi	mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v8si)a, __v8qi);
+}
+
+__m128i	mm512_cvtepi32_epi8_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v16si)a, __v16qi);
+}
+
+__v2qi	mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a)
+{
+  return __builtin_convertvector((__v2hi)a, __v2qi);
+}
+
+__v8qi	mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v8hi)a, __v8qi);
+}
+
+__m128i	mm256_cvtepi16_epi8_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi);
+}
+
+__m256i	mm512_cvtepi16_epi8_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi);
+}
+
+__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2du)a, __v2su);
+}
+
+__m128i	mm256_cvtepu64_epu32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4du)a, __v4su);
+}
+
+__m256i	mm512_cvtepu64_epu32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8du)a, __v8su);
+}
+
+__v2hu	mm_cvtepu64_epu16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2du)a, __v2hu);
+}
+
+__v4hu	mm256_cvtepu64_epu16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4du)a, __v4hu);
+}
+
+__m128i	mm512_cvtepu64_epu16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8du)a, __v8hu);
+}
+
+__v2qu	mm_cvtepu64_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2du)a, __v2qu);
+}
+
+__v4qu	mm256_cvtepu64_epu8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4du)a, __v4qu);
+}
+
+__v8qu	mm512_cvtepu64_epu8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8du)a, __v8qu);
+}
+
+__v2hu	mm32_cvtepu32_epu16_builtin_convertvector(__v2su a)
+{
+  return __builtin_convertvector((__v2su)a, __v2hu);
+}
+
+__v4hu	mm_cvtepu32_epu16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4su)a, __v4hu);
+}
+
+__m128i	mm256_cvtepu32_epu16_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v8su)a, __v8hu);
+}
+
+__m256i	mm512_cvtepu32_epu16_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v16su)a, __v16hu);
+}
+
+__v2qu	mm32_cvtepu32_epu8_builtin_convertvector(__v2su a)
+{
+  return __builtin_convertvector((__v2su)a, __v2qu);
+}
+
+__v4qu	mm_cvtepu32_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4su)a, __v4qu);
+}
+
+__v8qu	mm256_cvtepu32_epu8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v8su)a, __v8qu);
+}
+
+__m128i	mm512_cvtepu32_epu8_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v16su)a, __v16qu);
+}
+
+__v2qu	mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a)
+{
+  return __builtin_convertvector((__v2hu)a, __v2qu);
+}
+
+__v8qu	mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v8hu)a, __v8qu);
+}
+
+__m128i	mm256_cvtepu16_epu8_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu);
+}
+
+__m256i	mm512_cvtepu16_epu8_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c
new file mode 100644
index 00000000000..02ffd811cb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c
@@ -0,0 +1,105 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__m256i	mm256_cvtepi32_epi64_builtin_convertvector(__v4si a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di);
+}
+
+__m512i	mm512_cvtepi32_epi64_builtin_convertvector(__v8si a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di);
+}
+
+__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__m256i	mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di);
+}
+
+__m512i	mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di);
+}
+
+__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__m256i	mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di);
+}
+
+__m512i	mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di);
+}
+
+__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a)
+{
+  return (__m128i)__builtin_convertvector(a, __v4si);
+}
+
+__m256i	mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v8si);
+}
+
+__m512i	mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v16si);
+}
+
+__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a)
+{
+  return (__m128i)__builtin_convertvector(a, __v4si);
+}
+
+__m256i	mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v8si);
+}
+
+__m512i	mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v16si);
+}
+
+__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a)
+{
+  return (__m128i)__builtin_convertvector(a, __v8hi);
+}
+
+__m256i	mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v16hi);
+}
+
+__v32hi	mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a)
+{
+  return __builtin_convertvector(a, __v32hi);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c
new file mode 100644
index 00000000000..30dc947b6dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c
@@ -0,0 +1,55 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector(a, __v2sf);
+}
+
+__v4sf	mm256_cvtpd_ps_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4sf);
+}
+
+__v8sf	mm512_cvtpd_ps_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8sf);
+}
+
+__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector(a, __v2hf);
+}
+
+__v4hf	mm256_cvtpd_ph_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4hf);
+}
+
+__v8hf	mm512_cvtpd_ph_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8hf);
+}
+
+__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector(a, __v4hf);
+}
+
+__v8hf	mm256_cvtps_ph_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8hf);
+}
+
+__v16hf	mm512_cvtps_ph_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector(a, __v16hf);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c
new file mode 100644
index 00000000000..e537e7349e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector(a, __v2df);
+}
+
+__v4df	mm256_cvtps_pd_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector(a, __v4df);
+}
+
+__v8df	mm512_cvtps_pd_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8df);
+}
+
+__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector(a, __v2df);
+}
+
+__v4df	mm256_cvtph_pd_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4df);
+}
+
+__v8df	mm512_cvtph_pd_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8df);
+}
+
+__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4sf);
+}
+
+__v8sf	mm256_cvtph_ps_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8sf);
+}
+
+__v16sf	mm512_cvtph_ps_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector(a, __v16sf);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c
new file mode 100644
index 00000000000..5a44ef9f3b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c
@@ -0,0 +1,72 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */
+/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector(a, __v2si);
+}
+
+__v4si	mm256_cvtpd_epi32_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4si);
+}
+
+__v8si	mm512_cvtpd_epi32_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8si);
+}
+
+__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__v4di	mm256_cvtps_epi64_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector(a, __v4di);
+}
+
+__v8di	mm512_cvtps_epi64_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8di);
+}
+
+__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4si);
+}
+
+__v8si	mm256_cvtph_epi32_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8si);
+}
+
+__v16si	mm512_cvtph_epi32_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector(a, __v16si);
+}
+
+__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__v4di	mm256_cvtph_epi64_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4di);
+}
+
+__v8di	mm512_cvtph_epi64_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8di);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c
new file mode 100644
index 00000000000..4a68a10b089
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
@@ -0,0 +1,139 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
+typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
+typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
+typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
+
+__v2qi	mm_cvtpd_epi8_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector((__v2df)a, __v2qi);
+}
+
+__v4qi	mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector((__v4df)a, __v4qi);
+}
+
+__v8qi	mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector((__v8df)a, __v8qi);
+}
+
+__v2qu	mm_cvtpd_epu8_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector((__v2df)a, __v2qu);
+}
+
+__v4qu	mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector((__v4df)a, __v4qu);
+}
+
+__v8qu	mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector((__v8df)a, __v8qu);
+}
+
+__v2qi	mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector((__v2sf)a, __v2qi);
+}
+
+__v4qi	mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector((__v4sf)a, __v4qi);
+}
+
+__v8qi	mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector((__v8sf)a, __v8qi);
+}
+
+__v16qi	mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector((__v16sf)a, __v16qi);
+}
+
+__v2qu	mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector((__v2sf)a, __v2qu);
+}
+
+__v4qu	mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector((__v4sf)a, __v4qu);
+}
+
+__v8qu	mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector((__v8sf)a, __v8qu);
+}
+
+__v16qu	mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector((__v16sf)a, __v16qu);
+}
+
+__v2qi	mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector((__v2hf)a, __v2qi);
+}
+
+__v8qi	mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector((__v8hf)a, __v8qi);
+}
+
+__v16qi	mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector((__v16hf)a, __v16qi);
+}
+
+__v32qi	mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
+{
+  return __builtin_convertvector((__v32hf)a, __v32qi);
+}
+
+__v2qu	mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector((__v2hf)a, __v2qu);
+}
+
+__v8qu	mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector((__v8hf)a, __v8qu);
+}
+
+__v16qu	mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector((__v16hf)a, __v16qu);
+}
+
+__v32qu	mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
+{
+  return __builtin_convertvector((__v32hf)a, __v32qu);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
new file mode 100644
index 00000000000..0ff5a97ed1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
@@ -0,0 +1,156 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
+typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
+typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
+typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
+
+__v2df	mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2df);
+}
+
+__v4df	mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4df);
+}
+
+__v8df	mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8df);
+}
+
+__v2df	mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2df);
+}
+
+__v4df	mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4df);
+}
+
+__v8df	mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8df);
+}
+
+__v2sf	mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2sf);
+}
+
+__v4sf	mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4sf);
+}
+
+__v8sf	mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8sf);
+}
+
+__v16sf	mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
+{
+  return __builtin_convertvector((__v16qi)a, __v16sf);
+}
+
+__v2sf	mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2sf);
+}
+
+__v4sf	mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4sf);
+}
+
+__v8sf	mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8sf);
+}
+
+__v16sf	mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
+{
+  return __builtin_convertvector((__v16qu)a, __v16sf);
+}
+
+__v2hf	mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2hf);
+}
+
+__v4hf	mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4hf);
+}
+
+__v8hf	mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8hf);
+}
+
+__v16hf	mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
+{
+  return __builtin_convertvector((__v16qi)a, __v16hf);
+}
+
+__v32hf	mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
+{
+  return __builtin_convertvector((__v32qi)a, __v32hf);
+}
+
+__v2hf	mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2hf);
+}
+
+__v4hf	mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4hf);
+}
+
+__v8hf	mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8hf);
+}
+
+__v16hf	mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
+{
+  return __builtin_convertvector((__v16qu)a, __v16hf);
+}
+
+__v32hf	mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
+{
+  return __builtin_convertvector((__v32qu)a, __v32hf);
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index ab640096ca2..0bedb53d9f9 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -45,6 +45,8 @@  along with GCC; see the file COPYING3.  If not see
 #include "gimple-match.h"
 #include "recog.h"		/* FIXME: for insn_data */
 #include "optabs-libfuncs.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
 
 
 /* Build a ternary operation and gimplify it.  Emit code before GSI.
@@ -1834,6 +1836,142 @@  do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
   return gimplify_build2 (gsi, code, outer_type, b, c);
 }
 
+/* A subroutine of expand_vector_conversion.  Support indirect narrowing
+   conversions from float to int, like double -> char.  */
+bool
+supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
+					 enum tree_code code,
+					 tree lhs,
+					 tree arg)
+{
+  gimple *g;
+  tree ret_type = TREE_TYPE (lhs);
+  tree arg_type = TREE_TYPE (arg);
+  tree new_rhs;
+
+  unsigned int ret_elt_bits = vector_element_bits (ret_type);
+  unsigned int arg_elt_bits = vector_element_bits (arg_type);
+  if (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= arg_elt_bits)
+    return false;
+
+  unsigned short target_size;
+  scalar_mode tmp_cvt_mode;
+  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
+  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
+  tree cvt_type = NULL_TREE;
+  tmp_cvt_mode = lhs_mode;
+  target_size = GET_MODE_SIZE (rhs_mode);
+
+  opt_scalar_mode mode_iter;
+  enum tree_code tc1, tc2;
+  unsigned HOST_WIDE_INT nelts
+    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
+
+  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
+    {
+      tmp_cvt_mode = mode_iter.require ();
+
+      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
+	break;
+
+      scalar_mode cvt_mode;
+      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
+      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
+	break;
+
+      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
+      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
+      cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
+
+      cvt_type = build_vector_type (cvt_type, nelts);
+      if (cvt_type == NULL_TREE
+	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
+					     ret_type,
+					     cvt_type, &tc1)
+	  || !supportable_convert_operation ((tree_code) code,
+					     cvt_type,
+					     arg_type, &tc2))
+	continue;
+
+      new_rhs = make_ssa_name (cvt_type);
+      g = vect_gimple_build (new_rhs, tc2, arg);
+      gsi_insert_before (gsi, g, GSI_SAME_STMT);
+      g = gimple_build_assign (lhs, tc1, new_rhs);
+      gsi_replace (gsi, g, false);
+      return true;
+    }
+  return false;
+}
+
+/* A subroutine of expand_vector_conversion.  Support indirect widening
+   conversions from int to float, like char -> double.  */
+bool
+supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
+					 enum tree_code code,
+					 tree lhs,
+					 tree arg)
+{
+  gimple *g;
+  tree ret_type = TREE_TYPE (lhs);
+  tree arg_type = TREE_TYPE (arg);
+  tree new_rhs;
+
+  unsigned int ret_elt_bits = vector_element_bits (ret_type);
+  unsigned int arg_elt_bits = vector_element_bits (arg_type);
+  if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
+    return false;
+
+  unsigned short target_size;
+  scalar_mode tmp_cvt_mode;
+  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
+  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
+  tree cvt_type = NULL_TREE;
+  target_size = GET_MODE_SIZE (lhs_mode);
+  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
+  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
+    return false;
+
+  opt_scalar_mode mode_iter;
+  enum tree_code tc1, tc2;
+  unsigned HOST_WIDE_INT nelts
+    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
+
+  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
+    {
+      tmp_cvt_mode = mode_iter.require ();
+
+      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
+	break;
+
+      scalar_mode cvt_mode;
+      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
+      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
+	break;
+
+      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
+      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
+      cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
+
+      cvt_type = build_vector_type (cvt_type, nelts);
+      if (cvt_type == NULL_TREE
+	  || !supportable_convert_operation ((tree_code) code,
+					     ret_type,
+					     cvt_type, &tc1)
+	  || !supportable_convert_operation ((tree_code) NOP_EXPR,
+					     cvt_type,
+					     arg_type, &tc2))
+	continue;
+
+      new_rhs = make_ssa_name (cvt_type);
+      g = vect_gimple_build (new_rhs, tc2, arg);
+      gsi_insert_before (gsi, g, GSI_SAME_STMT);
+      g = gimple_build_assign (lhs, tc1, new_rhs);
+      gsi_replace (gsi, g, false);
+      return true;
+    }
+  return false;
+}
+
 /* Expand VEC_CONVERT ifn call.  */
 
 static void
@@ -1871,14 +2009,21 @@  expand_vector_conversion (gimple_stmt_iterator *gsi)
   else if (ret_elt_bits > arg_elt_bits)
     modifier = WIDEN;
 
+  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
+    {
+      g = gimple_build_assign (lhs, code1, arg);
+      gsi_replace (gsi, g, false);
+      return;
+    }
+
+  if (supportable_indirect_narrowing_operation (gsi, code, lhs, arg))
+    return;
+
+  if (supportable_indirect_widening_operation (gsi, code, lhs, arg))
+    return;
+
   if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
     {
-      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
-	{
-	  g = gimple_build_assign (lhs, code1, arg);
-	  gsi_replace (gsi, g, false);
-	  return;
-	}
       /* Can't use get_compute_type here, as supportable_convert_operation
 	 doesn't necessarily use an optab and needs two arguments.  */
       tree vec_compute_type