diff mbox series

vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

Message ID 20240508013823.3599051-1-lin1.hu@intel.com
State New
Headers show
Series vect: generate suitable convert insn for int -> int, float -> float and int <-> float. | expand

Commit Message

Hu, Lin1 May 8, 2024, 1:38 a.m. UTC
Hi, all

This patch aims to optimize __builtin_convertvector. We want the function
can generate more efficient insn for some situations. Like v2si -> v2di.

The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for
trunk?

BRs,
Lin

gcc/ChangeLog:

	PR target/107432
	* tree-vect-generic.cc (expand_vector_conversion): Support
	convert for int -> int, float -> float and int <-> float.
	(expand_vector_conversion_no_vec_pack): Check if can convert
	int <-> int, float <-> float and int <-> float, directly.
	Support indirect convert, when direct optab is not supported.

gcc/testsuite/ChangeLog:

	PR target/107432
	* gcc.target/i386/pr107432-1.c: New test.
	* gcc.target/i386/pr107432-2.c: Ditto.
	* gcc.target/i386/pr107432-3.c: Ditto.
	* gcc.target/i386/pr107432-4.c: Ditto.
	* gcc.target/i386/pr107432-5.c: Ditto.
	* gcc.target/i386/pr107432-6.c: Ditto.
	* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
 gcc/tree-vect-generic.cc                   | 107 +++++++++-
 8 files changed, 918 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

Comments

Hu, Lin1 May 14, 2024, 2:25 a.m. UTC | #1
Do you have any advice?

BRs,
Lin

-----Original Message-----
From: Hu, Lin1 <lin1.hu@intel.com> 
Sent: Wednesday, May 8, 2024 9:38 AM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com
Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

Hi, all

This patch aims to optimize __builtin_convertvector. We want the function can generate more efficient insn for some situations. Like v2si -> v2di.

The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?

BRs,
Lin

gcc/ChangeLog:

	PR target/107432
	* tree-vect-generic.cc (expand_vector_conversion): Support
	convert for int -> int, float -> float and int <-> float.
	(expand_vector_conversion_no_vec_pack): Check if can convert
	int <-> int, float <-> float and int <-> float, directly.
	Support indirect convert, when direct optab is not supported.

gcc/testsuite/ChangeLog:

	PR target/107432
	* gcc.target/i386/pr107432-1.c: New test.
	* gcc.target/i386/pr107432-2.c: Ditto.
	* gcc.target/i386/pr107432-3.c: Ditto.
	* gcc.target/i386/pr107432-4.c: Ditto.
	* gcc.target/i386/pr107432-5.c: Ditto.
	* gcc.target/i386/pr107432-6.c: Ditto.
	* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++  gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++  gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++  gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++  gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++  gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++  gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
 gcc/tree-vect-generic.cc                   | 107 +++++++++-
 8 files changed, 918 insertions(+), 6 deletions(-)  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 00000000000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } 
+} */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } 
+} */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef 
+char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi 
+__attribute__ ((__vector_size__ (4))); typedef char __v8qi 
+__attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); 
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); 
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); 
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); 
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); 
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) {
+  return __builtin_convertvector((__v2di)a, __v2si); }
+
+__m128i	mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si); }
+
+__m256i	mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si); }
+
+__v2hi	mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi); }
+
+__v4hi	mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi); }
+
+__m128i	mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); }
+
+__v2qi	mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi); }
+
+__v4qi	mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4qi); }
+
+__v8qi	mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8di)a, __v8qi); }
+
+__v2hi	mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2hi); }
+
+__v4hi	mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4hi); }
+
+__m128i	mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); }
+
+__m256i	mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); }
+
+__v2qi	mm64_cvtepi32_epi8_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2qi); }
+
+__v4qi	mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4qi); }
+
+__v8qi	mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v8si)a, __v8qi); }
+
+__m128i	mm512_cvtepi32_epi8_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); }
+
+__v2qi	mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a)
+{
+  return __builtin_convertvector((__v2hi)a, __v2qi); }
+
+__v8qi	mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v8hi)a, __v8qi); }
+
+__m128i	mm256_cvtepi16_epi8_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); }
+
+__m256i	mm512_cvtepi16_epi8_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); }
+
+__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) {
+  return __builtin_convertvector((__v2du)a, __v2su); }
+
+__m128i	mm256_cvtepu64_epu32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4du)a, __v4su); }
+
+__m256i	mm512_cvtepu64_epu32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8du)a, __v8su); }
+
+__v2hu	mm_cvtepu64_epu16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2du)a, __v2hu); }
+
+__v4hu	mm256_cvtepu64_epu16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4du)a, __v4hu); }
+
+__m128i	mm512_cvtepu64_epu16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); }
+
+__v2qu	mm_cvtepu64_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2du)a, __v2qu); }
+
+__v4qu	mm256_cvtepu64_epu8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4du)a, __v4qu); }
+
+__v8qu	mm512_cvtepu64_epu8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8du)a, __v8qu); }
+
+__v2hu	mm32_cvtepu32_epu16_builtin_convertvector(__v2su a)
+{
+  return __builtin_convertvector((__v2su)a, __v2hu); }
+
+__v4hu	mm_cvtepu32_epu16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4su)a, __v4hu); }
+
+__m128i	mm256_cvtepu32_epu16_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); }
+
+__m256i	mm512_cvtepu32_epu16_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); }
+
+__v2qu	mm32_cvtepu32_epu8_builtin_convertvector(__v2su a)
+{
+  return __builtin_convertvector((__v2su)a, __v2qu); }
+
+__v4qu	mm_cvtepu2_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4su)a, __v4qu); }
+
+__v8qu	mm256_cvtepu32_epu8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v8su)a, __v8qu); }
+
+__m128i	mm512_cvtepu32_epu8_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); }
+
+__v2qu	mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a)
+{
+  return __builtin_convertvector((__v2hu)a, __v2qu); }
+
+__v8qu	mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v8hu)a, __v8qu); }
+
+__m128i	mm256_cvtepu16_epu8_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); }
+
+__m256i	mm512_cvtepu16_epu8_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c
new file mode 100644
index 00000000000..02ffd811cb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c
@@ -0,0 +1,105 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef 
+char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi 
+__attribute__ ((__vector_size__ (4))); typedef char __v8qi 
+__attribute__ ((__vector_size__ (8)));
+
+__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) {
+  return __builtin_convertvector(a, __v2di); }
+
+__m256i	mm256_cvtepi32_epi64_builtin_convertvector(__v4si a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di); }
+
+__m512i	mm512_cvtepi32_epi64_builtin_convertvector(__v8si a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di); }
+
+__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) {
+  return __builtin_convertvector(a, __v2di); }
+
+__m256i	mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di); }
+
+__m512i	mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di); }
+
+__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) {
+  return __builtin_convertvector(a, __v2di); }
+
+__m256i	mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di); }
+
+__m512i	mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di); }
+
+__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) {
+  return (__m128i)__builtin_convertvector(a, __v4si); }
+
+__m256i	mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v8si); }
+
+__m512i	mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v16si); }
+
+__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) {
+  return (__m128i)__builtin_convertvector(a, __v4si); }
+
+__m256i	mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v8si); }
+
+__m512i	mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v16si); }
+
+__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) {
+  return (__m128i)__builtin_convertvector(a, __v8hi); }
+
+__m256i	mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v16hi); }
+
+__v32hi	mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a)
+{
+  return __builtin_convertvector(a, __v32hi); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c
new file mode 100644
index 00000000000..30dc947b6dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef 
+_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) {
+  return __builtin_convertvector(a, __v2sf); }
+
+__v4sf	mm256_cvtpd_ps_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4sf); }
+
+__v8sf	mm512_cvtpd_ps_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8sf); }
+
+__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) {
+  return __builtin_convertvector(a, __v2hf); }
+
+__v4hf	mm256_cvtpd_ph_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4hf); }
+
+__v8hf	mm512_cvtpd_ph_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8hf); }
+
+__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) {
+  return __builtin_convertvector(a, __v4hf); }
+
+__v8hf	mm256_cvtps_ph_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8hf); }
+
+__v16hf	mm512_cvtps_ph_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector(a, __v16hf); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c
new file mode 100644
index 00000000000..e537e7349e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef 
+_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) {
+  return __builtin_convertvector(a, __v2df); }
+
+__v4df	mm256_cvtps_pd_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector(a, __v4df); }
+
+__v8df	mm512_cvtps_pd_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8df); }
+
+__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) {
+  return __builtin_convertvector(a, __v2df); }
+
+__v4df	mm256_cvtph_pd_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4df); }
+
+__v8df	mm512_cvtph_pd_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8df); }
+
+__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) {
+  return __builtin_convertvector(a, __v4sf); }
+
+__v8sf	mm256_cvtph_ps_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8sf); }
+
+__v16sf	mm512_cvtph_ps_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector(a, __v16sf); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c
new file mode 100644
index 00000000000..5a44ef9f3b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" 
+} */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */
+/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef 
+_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) {
+  return __builtin_convertvector(a, __v2si); }
+
+__v4si	mm256_cvtpd_epi32_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4si); }
+
+__v8si	mm512_cvtpd_epi32_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8si); }
+
+__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) {
+  return __builtin_convertvector(a, __v2di); }
+
+__v4di	mm256_cvtps_epi64_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector(a, __v4di); }
+
+__v8di	mm512_cvtps_epi64_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8di); }
+
+__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) {
+  return __builtin_convertvector(a, __v4si); }
+
+__v8si	mm256_cvtph_epi32_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8si); }
+
+__v16si	mm512_cvtph_epi32_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector(a, __v16si); }
+
+__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) {
+  return __builtin_convertvector(a, __v2di); }
+
+__v4di	mm256_cvtph_epi64_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4di); }
+
+__v8di	mm512_cvtph_epi64_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8di); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c
new file mode 100644
index 00000000000..4a68a10b089
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
@@ -0,0 +1,139 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq 
+-fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 
+} } } } */
+/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 
+} } } } */
+/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } 
+} */
+/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char 
+__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi 
+__attribute__ ((__vector_size__ (8))); typedef char __v16qi 
+__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu 
+__attribute__ ((vector_size (2))); typedef unsigned char __v4qu 
+__attribute__ ((vector_size (4))); typedef unsigned char __v8qu 
+__attribute__ ((vector_size (8))); typedef unsigned char __v16qu 
+__attribute__ ((vector_size (16))); typedef _Float16 __v2hf 
+__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf 
+__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf 
+__attribute__ ((__vector_size__ (16)));
+
+__v2qi	mm_cvtpd_epi8_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector((__v2df)a, __v2qi); }
+
+__v4qi	mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector((__v4df)a, __v4qi); }
+
+__v8qi	mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector((__v8df)a, __v8qi); }
+
+__v2qu	mm_cvtpd_epu8_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector((__v2df)a, __v2qu); }
+
+__v4qu	mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector((__v4df)a, __v4qu); }
+
+__v8qu	mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector((__v8df)a, __v8qu); }
+
+__v2qi	mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector((__v2sf)a, __v2qi); }
+
+__v4qi	mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector((__v4sf)a, __v4qi); }
+
+__v8qi	mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector((__v8sf)a, __v8qi); }
+
+__v16qi	mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector((__v16sf)a, __v16qi); }
+
+__v2qu	mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector((__v2sf)a, __v2qu); }
+
+__v4qu	mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector((__v4sf)a, __v4qu); }
+
+__v8qu	mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector((__v8sf)a, __v8qu); }
+
+__v16qu	mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector((__v16sf)a, __v16qu); }
+
+__v2qi	mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector((__v2hf)a, __v2qi); }
+
+__v8qi	mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector((__v8hf)a, __v8qi); }
+
+__v16qi	mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector((__v16hf)a, __v16qi); }
+
+__v32qi	mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
+{
+  return __builtin_convertvector((__v32hf)a, __v32qi); }
+
+__v2qu	mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector((__v2hf)a, __v2qu); }
+
+__v8qu	mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector((__v8hf)a, __v8qu); }
+
+__v16qu	mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector((__v16hf)a, __v16qu); }
+
+__v32qu	mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
+{
+  return __builtin_convertvector((__v32hf)a, __v32qu); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
new file mode 100644
index 00000000000..0ff5a97ed1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
@@ -0,0 +1,156 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq 
+-fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } 
+} */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } 
+} } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } 
+} } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char 
+__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi 
+__attribute__ ((__vector_size__ (8))); typedef char __v16qi 
+__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu 
+__attribute__ ((vector_size (2))); typedef unsigned char __v4qu 
+__attribute__ ((vector_size (4))); typedef unsigned char __v8qu 
+__attribute__ ((vector_size (8))); typedef unsigned char __v16qu 
+__attribute__ ((vector_size (16))); typedef _Float16 __v2hf 
+__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf 
+__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf 
+__attribute__ ((__vector_size__ (16)));
+
+__v2df	mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2df); }
+
+__v4df	mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4df); }
+
+__v8df	mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8df); }
+
+__v2df	mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2df); }
+
+__v4df	mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4df); }
+
+__v8df	mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8df); }
+
+__v2sf	mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2sf); }
+
+__v4sf	mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4sf); }
+
+__v8sf	mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8sf); }
+
+__v16sf	mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
+{
+  return __builtin_convertvector((__v16qi)a, __v16sf); }
+
+__v2sf	mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2sf); }
+
+__v4sf	mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4sf); }
+
+__v8sf	mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8sf); }
+
+__v16sf	mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
+{
+  return __builtin_convertvector((__v16qu)a, __v16sf); }
+
+__v2hf	mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2hf); }
+
+__v4hf	mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4hf); }
+
+__v8hf	mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8hf); }
+
+__v16hf	mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
+{
+  return __builtin_convertvector((__v16qi)a, __v16hf); }
+
+__v32hf	mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
+{
+  return __builtin_convertvector((__v32qi)a, __v32hf); }
+
+__v2hf	mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2hf); }
+
+__v4hf	mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4hf); }
+
+__v8hf	mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8hf); }
+
+__v16hf	mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
+{
+  return __builtin_convertvector((__v16qu)a, __v16hf); }
+
+__v32hf	mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
+{
+  return __builtin_convertvector((__v32qu)a, __v32hf); }
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ab640096ca2..e14fac9f179 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see  #include "gimple-match.h"
 #include "recog.h"		/* FIXME: for insn_data */
 #include "optabs-libfuncs.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
 
 
 /* Build a ternary operation and gimplify it.  Emit code before GSI.
@@ -1834,6 +1836,102 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
   return gimplify_build2 (gsi, code, outer_type, b, c);  }
 
+/* A subroutine of expand_vector_conversion, support indirect conversion for
+   float <-> int, like char -> double.  */ bool 
+expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi,
+				      enum tree_code code,
+				      tree lhs,
+				      tree arg)
+{
+  gimple *g;
+  tree ret_type = TREE_TYPE (lhs);
+  tree arg_type = TREE_TYPE (arg);
+  tree new_rhs;
+  enum {NARROW, NONE, WIDEN} modifier = NONE;
+  enum tree_code code1 = ERROR_MARK;
+  enum tree_code codecvt1 = ERROR_MARK;
+  bool float_expr_p = code == FLOAT_EXPR;
+
+  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
+    {
+      g = gimple_build_assign (lhs, code1, arg);
+      gsi_replace (gsi, g, false);
+      return true;
+    }
+
+  unsigned int ret_elt_bits = vector_element_bits (ret_type);  unsigned 
+ int arg_elt_bits = vector_element_bits (arg_type);  if (ret_elt_bits < 
+ arg_elt_bits)
+    modifier = NARROW;
+  else if (ret_elt_bits > arg_elt_bits)
+    modifier = WIDEN;
+
+  if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier == NARROW)
+       || (code == FLOAT_EXPR && modifier == WIDEN)))
+    {
+      unsigned short target_size;
+      scalar_mode tmp_cvt_mode;
+      scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
+      scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
+      tree cvt_type = NULL_TREE;
+      if (modifier == NARROW)
+	{
+	  tmp_cvt_mode = lhs_mode;
+	  target_size = GET_MODE_SIZE (rhs_mode);
+	}
+      else
+	{
+	  target_size = GET_MODE_SIZE (lhs_mode);
+	  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
+	  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
+	    return false;
+	}
+
+      code1 = float_expr_p ? code : NOP_EXPR;
+      codecvt1 = float_expr_p ? NOP_EXPR : code;
+      opt_scalar_mode mode_iter;
+      enum tree_code tc1, tc2;
+      unsigned HOST_WIDE_INT nelts
+	= constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
+
+      FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
+	{
+	  tmp_cvt_mode = mode_iter.require ();
+
+	  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
+	    break;
+
+	  scalar_mode cvt_mode;
+	  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
+	  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
+	    break;
+
+	  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
+	  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
+	  cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
+
+	  cvt_type = build_vector_type (cvt_type, nelts);
+	  if (cvt_type == NULL_TREE
+	      || !supportable_convert_operation ((tree_code) code1,
+						 ret_type,
+						 cvt_type, &tc1)
+	      || !supportable_convert_operation ((tree_code) codecvt1,
+						 cvt_type,
+						 arg_type, &tc2))
+	    continue;
+
+	  new_rhs = make_ssa_name (cvt_type);
+	  g = vect_gimple_build (new_rhs, tc2, arg);
+	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
+	  g = gimple_build_assign (lhs, tc1, new_rhs);
+	  gsi_replace (gsi, g, false);
+	  return true;
+	}
+    }
+  return false;
+}
+
 /* Expand VEC_CONVERT ifn call.  */
 
 static void
@@ -1871,14 +1969,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
   else if (ret_elt_bits > arg_elt_bits)
     modifier = WIDEN;
 
+  if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg))
+    return;
+
   if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
     {
-      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
-	{
-	  g = gimple_build_assign (lhs, code1, arg);
-	  gsi_replace (gsi, g, false);
-	  return;
-	}
       /* Can't use get_compute_type here, as supportable_convert_operation
 	 doesn't necessarily use an optab and needs two arguments.  */
       tree vec_compute_type
--
2.31.1
Richard Biener May 14, 2024, 12:23 p.m. UTC | #2
On Tue, 14 May 2024, Hu, Lin1 wrote:

> Do you have any advice?
> 
> BRs,
> Lin
> 
> -----Original Message-----
> From: Hu, Lin1 <lin1.hu@intel.com> 
> Sent: Wednesday, May 8, 2024 9:38 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com
> Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
> 
> Hi, all
> 
> This patch aims to optimize __builtin_convertvector. We want the function can generate more efficient insn for some situations. Like v2si -> v2di.
> 
> The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?

I don't like the new code to be in a separate function, not integrated
with the existing handling.  Note the existing handling should get, say,
V8DF -> V8SI correct for SSE by splitting the operation into smaller
vectors but your code seems to just handle the cases the vectors are
already properly sized.

Without checking it seems you are basing the code on what the
vectorizer does?  Maybe we should have some common code that
computes intermediate conversion steps supported by the HW
unifying what for example supportable_widening_operation or
supportable_narrowing_operation can do to also cover int <-> float
conversions.

That said, if you don't want to do that please still think about
the core part of tree-vect-generic.cc which is breaking down large
emulated vectors into small supported vectors.

Richard.

> BRs,
> Lin
> 
> gcc/ChangeLog:
> 
> 	PR target/107432
> 	* tree-vect-generic.cc (expand_vector_conversion): Support
> 	convert for int -> int, float -> float and int <-> float.
> 	(expand_vector_conversion_no_vec_pack): Check if can convert
> 	int <-> int, float <-> float and int <-> float, directly.
> 	Support indirect convert, when direct optab is not supported.
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR target/107432
> 	* gcc.target/i386/pr107432-1.c: New test.
> 	* gcc.target/i386/pr107432-2.c: Ditto.
> 	* gcc.target/i386/pr107432-3.c: Ditto.
> 	* gcc.target/i386/pr107432-4.c: Ditto.
> 	* gcc.target/i386/pr107432-5.c: Ditto.
> 	* gcc.target/i386/pr107432-6.c: Ditto.
> 	* gcc.target/i386/pr107432-7.c: Ditto.
> ---
>  gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++  gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++  gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++  gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++  gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++  gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++  gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
>  gcc/tree-vect-generic.cc                   | 107 +++++++++-
>  8 files changed, 918 insertions(+), 6 deletions(-)  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
> 
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> new file mode 100644
> index 00000000000..a4f37447eb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> @@ -0,0 +1,234 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } 
> +} */
> +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } 
> +} */
> +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef 
> +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi 
> +__attribute__ ((__vector_size__ (4))); typedef char __v8qi 
> +__attribute__ ((__vector_size__ (8)));
> +
> +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); 
> +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); 
> +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); 
> +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); 
> +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); 
> +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
> +
> +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) {
> +  return __builtin_convertvector((__v2di)a, __v2si); }
> +
> +__m128i	mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v4di)a, __v4si); }
> +
> +__m256i	mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v8di)a, __v8si); }
> +
> +__v2hi	mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2hi); }
> +
> +__v4hi	mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4hi); }
> +
> +__m128i	mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); }
> +
> +__v2qi	mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2qi); }
> +
> +__v4qi	mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4qi); }
> +
> +__v8qi	mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
> +{
> +  return __builtin_convertvector((__v8di)a, __v8qi); }
> +
> +__v2hi	mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
> +{
> +  return __builtin_convertvector((__v2si)a, __v2hi); }
> +
> +__v4hi	mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4si)a, __v4hi); }
> +
> +__m128i	mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); }
> +
> +__m256i	mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); }
> +
> +__v2qi	mm64_cvtepi32_epi8_builtin_convertvector(__v2si a)
> +{
> +  return __builtin_convertvector((__v2si)a, __v2qi); }
> +
> +__v4qi	mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4si)a, __v4qi); }
> +
> +__v8qi	mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v8si)a, __v8qi); }
> +
> +__m128i	mm512_cvtepi32_epi8_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); }
> +
> +__v2qi	mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a)
> +{
> +  return __builtin_convertvector((__v2hi)a, __v2qi); }
> +
> +__v8qi	mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v8hi)a, __v8qi); }
> +
> +__m128i	mm256_cvtepi16_epi8_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); }
> +
> +__m256i	mm512_cvtepi16_epi8_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); }
> +
> +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) {
> +  return __builtin_convertvector((__v2du)a, __v2su); }
> +
> +__m128i	mm256_cvtepu64_epu32_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v4du)a, __v4su); }
> +
> +__m256i	mm512_cvtepu64_epu32_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v8du)a, __v8su); }
> +
> +__v2hu	mm_cvtepu64_epu16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2du)a, __v2hu); }
> +
> +__v4hu	mm256_cvtepu64_epu16_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4du)a, __v4hu); }
> +
> +__m128i	mm512_cvtepu64_epu16_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); }
> +
> +__v2qu	mm_cvtepu64_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2du)a, __v2qu); }
> +
> +__v4qu	mm256_cvtepu64_epu8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4du)a, __v4qu); }
> +
> +__v8qu	mm512_cvtepu64_epu8_builtin_convertvector(__m512i a)
> +{
> +  return __builtin_convertvector((__v8du)a, __v8qu); }
> +
> +__v2hu	mm32_cvtepu32_epu16_builtin_convertvector(__v2su a)
> +{
> +  return __builtin_convertvector((__v2su)a, __v2hu); }
> +
> +__v4hu	mm_cvtepu32_epu16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4su)a, __v4hu); }
> +
> +__m128i	mm256_cvtepu32_epu16_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); }
> +
> +__m256i	mm512_cvtepu32_epu16_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); }
> +
> +__v2qu	mm32_cvtepu32_epu8_builtin_convertvector(__v2su a)
> +{
> +  return __builtin_convertvector((__v2su)a, __v2qu); }
> +
> +__v4qu	mm_cvtepu2_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4su)a, __v4qu); }
> +
> +__v8qu	mm256_cvtepu32_epu8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v8su)a, __v8qu); }
> +
> +__m128i	mm512_cvtepu32_epu8_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); }
> +
> +__v2qu	mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a)
> +{
> +  return __builtin_convertvector((__v2hu)a, __v2qu); }
> +
> +__v8qu	mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v8hu)a, __v8qu); }
> +
> +__m128i	mm256_cvtepu16_epu8_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); }
> +
> +__m256i	mm512_cvtepu16_epu8_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); }
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c
> new file mode 100644
> index 00000000000..02ffd811cb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c
> @@ -0,0 +1,105 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef 
> +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi 
> +__attribute__ ((__vector_size__ (4))); typedef char __v8qi 
> +__attribute__ ((__vector_size__ (8)));
> +
> +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) {
> +  return __builtin_convertvector(a, __v2di); }
> +
> +__m256i	mm256_cvtepi32_epi64_builtin_convertvector(__v4si a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di); }
> +
> +__m512i	mm512_cvtepi32_epi64_builtin_convertvector(__v8si a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di); }
> +
> +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) {
> +  return __builtin_convertvector(a, __v2di); }
> +
> +__m256i	mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di); }
> +
> +__m512i	mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di); }
> +
> +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) {
> +  return __builtin_convertvector(a, __v2di); }
> +
> +__m256i	mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di); }
> +
> +__m512i	mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di); }
> +
> +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) {
> +  return (__m128i)__builtin_convertvector(a, __v4si); }
> +
> +__m256i	mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v8si); }
> +
> +__m512i	mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v16si); }
> +
> +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) {
> +  return (__m128i)__builtin_convertvector(a, __v4si); }
> +
> +__m256i	mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v8si); }
> +
> +__m512i	mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v16si); }
> +
> +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) {
> +  return (__m128i)__builtin_convertvector(a, __v8hi); }
> +
> +__m256i	mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v16hi); }
> +
> +__v32hi	mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a)
> +{
> +  return __builtin_convertvector(a, __v32hi); }
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c
> new file mode 100644
> index 00000000000..30dc947b6dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c
> @@ -0,0 +1,55 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef 
> +_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) {
> +  return __builtin_convertvector(a, __v2sf); }
> +
> +__v4sf	mm256_cvtpd_ps_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4sf); }
> +
> +__v8sf	mm512_cvtpd_ps_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8sf); }
> +
> +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) {
> +  return __builtin_convertvector(a, __v2hf); }
> +
> +__v4hf	mm256_cvtpd_ph_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4hf); }
> +
> +__v8hf	mm512_cvtpd_ph_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8hf); }
> +
> +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) {
> +  return __builtin_convertvector(a, __v4hf); }
> +
> +__v8hf	mm256_cvtps_ph_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8hf); }
> +
> +__v16hf	mm512_cvtps_ph_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector(a, __v16hf); }
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c
> new file mode 100644
> index 00000000000..e537e7349e4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef 
> +_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) {
> +  return __builtin_convertvector(a, __v2df); }
> +
> +__v4df	mm256_cvtps_pd_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector(a, __v4df); }
> +
> +__v8df	mm512_cvtps_pd_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8df); }
> +
> +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) {
> +  return __builtin_convertvector(a, __v2df); }
> +
> +__v4df	mm256_cvtph_pd_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4df); }
> +
> +__v8df	mm512_cvtph_pd_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8df); }
> +
> +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) {
> +  return __builtin_convertvector(a, __v4sf); }
> +
> +__v8sf	mm256_cvtph_ps_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8sf); }
> +
> +__v16sf	mm512_cvtph_ps_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector(a, __v16sf); }
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c
> new file mode 100644
> index 00000000000..5a44ef9f3b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c
> @@ -0,0 +1,72 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" 
> +} */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef 
> +_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) {
> +  return __builtin_convertvector(a, __v2si); }
> +
> +__v4si	mm256_cvtpd_epi32_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4si); }
> +
> +__v8si	mm512_cvtpd_epi32_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8si); }
> +
> +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) {
> +  return __builtin_convertvector(a, __v2di); }
> +
> +__v4di	mm256_cvtps_epi64_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector(a, __v4di); }
> +
> +__v8di	mm512_cvtps_epi64_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8di); }
> +
> +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) {
> +  return __builtin_convertvector(a, __v4si); }
> +
> +__v8si	mm256_cvtph_epi32_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8si); }
> +
> +__v16si	mm512_cvtph_epi32_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector(a, __v16si); }
> +
> +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) {
> +  return __builtin_convertvector(a, __v2di); }
> +
> +__v4di	mm256_cvtph_epi64_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4di); }
> +
> +__v8di	mm512_cvtph_epi64_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8di); }
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c
> new file mode 100644
> index 00000000000..4a68a10b089
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
> @@ -0,0 +1,139 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq 
> +-fno-trapping-math" } */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 
> +} } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 
> +} } } } */
> +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } 
> +} */
> +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char 
> +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi 
> +__attribute__ ((__vector_size__ (8))); typedef char __v16qi 
> +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu 
> +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu 
> +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu 
> +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu 
> +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf 
> +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf 
> +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf 
> +__attribute__ ((__vector_size__ (16)));
> +
> +__v2qi	mm_cvtpd_epi8_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector((__v2df)a, __v2qi); }
> +
> +__v4qi	mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector((__v4df)a, __v4qi); }
> +
> +__v8qi	mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector((__v8df)a, __v8qi); }
> +
> +__v2qu	mm_cvtpd_epu8_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector((__v2df)a, __v2qu); }
> +
> +__v4qu	mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector((__v4df)a, __v4qu); }
> +
> +__v8qu	mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector((__v8df)a, __v8qu); }
> +
> +__v2qi	mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector((__v2sf)a, __v2qi); }
> +
> +__v4qi	mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector((__v4sf)a, __v4qi); }
> +
> +__v8qi	mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector((__v8sf)a, __v8qi); }
> +
> +__v16qi	mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector((__v16sf)a, __v16qi); }
> +
> +__v2qu	mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector((__v2sf)a, __v2qu); }
> +
> +__v4qu	mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector((__v4sf)a, __v4qu); }
> +
> +__v8qu	mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector((__v8sf)a, __v8qu); }
> +
> +__v16qu	mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector((__v16sf)a, __v16qu); }
> +
> +__v2qi	mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector((__v2hf)a, __v2qi); }
> +
> +__v8qi	mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector((__v8hf)a, __v8qi); }
> +
> +__v16qi	mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector((__v16hf)a, __v16qi); }
> +
> +__v32qi	mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
> +{
> +  return __builtin_convertvector((__v32hf)a, __v32qi); }
> +
> +__v2qu	mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector((__v2hf)a, __v2qu); }
> +
> +__v8qu	mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector((__v8hf)a, __v8qu); }
> +
> +__v16qu	mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector((__v16hf)a, __v16qu); }
> +
> +__v32qu	mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
> +{
> +  return __builtin_convertvector((__v32hf)a, __v32qu); }
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> new file mode 100644
> index 00000000000..0ff5a97ed1a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> @@ -0,0 +1,156 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq 
> +-fno-trapping-math" } */
> +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } 
> +} */
> +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } 
> +} } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } 
> +} } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } 
> +} } } */
> +
> +#include <x86intrin.h>
> +
> +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char 
> +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi 
> +__attribute__ ((__vector_size__ (8))); typedef char __v16qi 
> +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu 
> +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu 
> +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu 
> +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu 
> +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf 
> +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf 
> +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf 
> +__attribute__ ((__vector_size__ (16)));
> +
> +__v2df	mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2df); }
> +
> +__v4df	mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4df); }
> +
> +__v8df	mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8df); }
> +
> +__v2df	mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2df); }
> +
> +__v4df	mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4df); }
> +
> +__v8df	mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8df); }
> +
> +__v2sf	mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2sf); }
> +
> +__v4sf	mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4sf); }
> +
> +__v8sf	mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8sf); }
> +
> +__v16sf	mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
> +{
> +  return __builtin_convertvector((__v16qi)a, __v16sf); }
> +
> +__v2sf	mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2sf); }
> +
> +__v4sf	mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4sf); }
> +
> +__v8sf	mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8sf); }
> +
> +__v16sf	mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
> +{
> +  return __builtin_convertvector((__v16qu)a, __v16sf); }
> +
> +__v2hf	mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2hf); }
> +
> +__v4hf	mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4hf); }
> +
> +__v8hf	mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8hf); }
> +
> +__v16hf	mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
> +{
> +  return __builtin_convertvector((__v16qi)a, __v16hf); }
> +
> +__v32hf	mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
> +{
> +  return __builtin_convertvector((__v32qi)a, __v32hf); }
> +
> +__v2hf	mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2hf); }
> +
> +__v4hf	mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4hf); }
> +
> +__v8hf	mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8hf); }
> +
> +__v16hf	mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
> +{
> +  return __builtin_convertvector((__v16qu)a, __v16hf); }
> +
> +__v32hf	mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
> +{
> +  return __builtin_convertvector((__v32qu)a, __v32hf); }
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ab640096ca2..e14fac9f179 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see  #include "gimple-match.h"
>  #include "recog.h"		/* FIXME: for insn_data */
>  #include "optabs-libfuncs.h"
> +#include "cfgloop.h"
> +#include "tree-vectorizer.h"
>  
>  
>  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> @@ -1834,6 +1836,102 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
>    return gimplify_build2 (gsi, code, outer_type, b, c);  }
>  
> +/* A subroutine of expand_vector_conversion, support indirect conversion for
> +   float <-> int, like char -> double.  */ bool 
> +expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi,
> +				      enum tree_code code,
> +				      tree lhs,
> +				      tree arg)
> +{
> +  gimple *g;
> +  tree ret_type = TREE_TYPE (lhs);
> +  tree arg_type = TREE_TYPE (arg);
> +  tree new_rhs;
> +  enum {NARROW, NONE, WIDEN} modifier = NONE;
> +  enum tree_code code1 = ERROR_MARK;
> +  enum tree_code codecvt1 = ERROR_MARK;
> +  bool float_expr_p = code == FLOAT_EXPR;
> +
> +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> +    {
> +      g = gimple_build_assign (lhs, code1, arg);
> +      gsi_replace (gsi, g, false);
> +      return true;
> +    }
> +
> +  unsigned int ret_elt_bits = vector_element_bits (ret_type);  unsigned 
> + int arg_elt_bits = vector_element_bits (arg_type);  if (ret_elt_bits < 
> + arg_elt_bits)
> +    modifier = NARROW;
> +  else if (ret_elt_bits > arg_elt_bits)
> +    modifier = WIDEN;
> +
> +  if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier == NARROW)
> +       || (code == FLOAT_EXPR && modifier == WIDEN)))
> +    {
> +      unsigned short target_size;
> +      scalar_mode tmp_cvt_mode;
> +      scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> +      scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> +      tree cvt_type = NULL_TREE;
> +      if (modifier == NARROW)
> +	{
> +	  tmp_cvt_mode = lhs_mode;
> +	  target_size = GET_MODE_SIZE (rhs_mode);
> +	}
> +      else
> +	{
> +	  target_size = GET_MODE_SIZE (lhs_mode);
> +	  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
> +	  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> +	    return false;
> +	}
> +
> +      code1 = float_expr_p ? code : NOP_EXPR;
> +      codecvt1 = float_expr_p ? NOP_EXPR : code;
> +      opt_scalar_mode mode_iter;
> +      enum tree_code tc1, tc2;
> +      unsigned HOST_WIDE_INT nelts
> +	= constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> +
> +      FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> +	{
> +	  tmp_cvt_mode = mode_iter.require ();
> +
> +	  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> +	    break;
> +
> +	  scalar_mode cvt_mode;
> +	  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> +	  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> +	    break;
> +
> +	  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> +	  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> +	  cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> +
> +	  cvt_type = build_vector_type (cvt_type, nelts);
> +	  if (cvt_type == NULL_TREE
> +	      || !supportable_convert_operation ((tree_code) code1,
> +						 ret_type,
> +						 cvt_type, &tc1)
> +	      || !supportable_convert_operation ((tree_code) codecvt1,
> +						 cvt_type,
> +						 arg_type, &tc2))
> +	    continue;
> +
> +	  new_rhs = make_ssa_name (cvt_type);
> +	  g = vect_gimple_build (new_rhs, tc2, arg);
> +	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +	  g = gimple_build_assign (lhs, tc1, new_rhs);
> +	  gsi_replace (gsi, g, false);
> +	  return true;
> +	}
> +    }
> +  return false;
> +}
> +
>  /* Expand VEC_CONVERT ifn call.  */
>  
>  static void
> @@ -1871,14 +1969,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
>    else if (ret_elt_bits > arg_elt_bits)
>      modifier = WIDEN;
>  
> +  if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg))
> +    return;
> +
>    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
>      {
> -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> -	{
> -	  g = gimple_build_assign (lhs, code1, arg);
> -	  gsi_replace (gsi, g, false);
> -	  return;
> -	}
>        /* Can't use get_compute_type here, as supportable_convert_operation
>  	 doesn't necessarily use an optab and needs two arguments.  */
>        tree vec_compute_type
> --
> 2.31.1
> 
>
Hu, Lin1 May 15, 2024, 2:30 a.m. UTC | #3
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Tuesday, May 14, 2024 8:23 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> ubizjak@gmail.com
> Subject: RE: [PATCH] vect: generate suitable convert insn for int -> int, float ->
> float and int <-> float.
> 
> On Tue, 14 May 2024, Hu, Lin1 wrote:
> 
> > Do you have any advice?
> >
> > BRs,
> > Lin
> >
> > -----Original Message-----
> > From: Hu, Lin1 <lin1.hu@intel.com>
> > Sent: Wednesday, May 8, 2024 9:38 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com
> > Subject: [PATCH] vect: generate suitable convert insn for int -> int, float ->
> float and int <-> float.
> >
> > Hi, all
> >
> > This patch aims to optimize __builtin_convertvector. We want the function
> can generate more efficient insn for some situations. Like v2si -> v2di.
> >
> > The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK
> for trunk?
> 
> I don't like the new code to be in a separate function, not integrated with the
> existing handling.  Note the existing handling should get, say, V8DF -> V8SI
> correct for SSE by splitting the operation into smaller vectors but your code
> seems to just handle the cases the vectors are already properly sized.
>

Yes, my code only handles some cases, but others are handled by the core part of 
tree-vect-generic.cc. I just take care of some special cases up front. So, V8DF -> V8SI
is still split into smaller vectors for SSE.

And for SSE, I have
another patch to expand the available direct optab environment with 
ix86_expand_vec_perm_const_1 (...). This patch hasn't been sent yet. 
I will sending it out together after I modify this patch. This gives an overall view
of my changes to this function.

> 
> Without checking it seems you are basing the code on what the vectorizer does?
> Maybe we should have some common code that computes intermediate
> conversion steps supported by the HW unifying what for example
> supportable_widening_operation or supportable_narrowing_operation can do
> to also cover int <-> float conversions.
>

Yes, my code is based on vectorizable_conversion(...). I will consider to split the function
and  define some new function like your advises to make my code more common.

BRs,
Lin
 
>
> That said, if you don't want to do that please still think about the core part of
> tree-vect-generic.cc which is breaking down large emulated vectors into small
> supported vectors.
> 
> Richard.
> 
> > BRs,
> > Lin
> >
> > gcc/ChangeLog:
> >
> > 	PR target/107432
> > 	* tree-vect-generic.cc (expand_vector_conversion): Support
> > 	convert for int -> int, float -> float and int <-> float.
> > 	(expand_vector_conversion_no_vec_pack): Check if can convert
> > 	int <-> int, float <-> float and int <-> float, directly.
> > 	Support indirect convert, when direct optab is not supported.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR target/107432
> > 	* gcc.target/i386/pr107432-1.c: New test.
> > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > ---
> >  gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
> gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
> gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
> gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++
> gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++
> gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
> >  gcc/tree-vect-generic.cc                   | 107 +++++++++-
> >  8 files changed, 918 insertions(+), 6 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> > new file mode 100644
> > index 00000000000..a4f37447eb4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> > @@ -0,0 +1,234 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> > +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
> > +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
> > +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
> > +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } }
> > +} } */
> > +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } }
> > +} } */
> > +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef
> > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char
> > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi
> > +__attribute__ ((__vector_size__ (8)));
> > +
> > +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
> > +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
> > +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
> > +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
> > +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
> > +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
> > +
> > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) {
> > +  return __builtin_convertvector((__v2di)a, __v2si); }
> > +
> > +__m128i	mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v4di)a, __v4si); }
> > +
> > +__m256i	mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m256i)__builtin_convertvector((__v8di)a, __v8si); }
> > +
> > +__v2hi	mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v2di)a, __v2hi); }
> > +
> > +__v4hi	mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
> > +{
> > +  return __builtin_convertvector((__v4di)a, __v4hi); }
> > +
> > +__m128i	mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); }
> > +
> > +__v2qi	mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v2di)a, __v2qi); }
> > +
> > +__v4qi	mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
> > +{
> > +  return __builtin_convertvector((__v4di)a, __v4qi); }
> > +
> > +__v8qi	mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
> > +{
> > +  return __builtin_convertvector((__v8di)a, __v8qi); }
> > +
> > +__v2hi	mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
> > +{
> > +  return __builtin_convertvector((__v2si)a, __v2hi); }
> > +
> > +__v4hi	mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v4si)a, __v4hi); }
> > +
> > +__m128i	mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); }
> > +
> > +__m256i	mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); }
> > +
> > +__v2qi	mm64_cvtepi32_epi8_builtin_convertvector(__v2si a)
> > +{
> > +  return __builtin_convertvector((__v2si)a, __v2qi); }
> > +
> > +__v4qi	mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v4si)a, __v4qi); }
> > +
> > +__v8qi	mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
> > +{
> > +  return __builtin_convertvector((__v8si)a, __v8qi); }
> > +
> > +__m128i	mm512_cvtepi32_epi8_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); }
> > +
> > +__v2qi	mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a)
> > +{
> > +  return __builtin_convertvector((__v2hi)a, __v2qi); }
> > +
> > +__v8qi	mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v8hi)a, __v8qi); }
> > +
> > +__m128i	mm256_cvtepi16_epi8_builtin_convertvector(__m256i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); }
> > +
> > +__m256i	mm512_cvtepi16_epi8_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); }
> > +
> > +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) {
> > +  return __builtin_convertvector((__v2du)a, __v2su); }
> > +
> > +__m128i	mm256_cvtepu64_epu32_builtin_convertvector(__m256i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v4du)a, __v4su); }
> > +
> > +__m256i	mm512_cvtepu64_epu32_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m256i)__builtin_convertvector((__v8du)a, __v8su); }
> > +
> > +__v2hu	mm_cvtepu64_epu16_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v2du)a, __v2hu); }
> > +
> > +__v4hu	mm256_cvtepu64_epu16_builtin_convertvector(__m256i a)
> > +{
> > +  return __builtin_convertvector((__v4du)a, __v4hu); }
> > +
> > +__m128i	mm512_cvtepu64_epu16_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); }
> > +
> > +__v2qu	mm_cvtepu64_epu8_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v2du)a, __v2qu); }
> > +
> > +__v4qu	mm256_cvtepu64_epu8_builtin_convertvector(__m256i a)
> > +{
> > +  return __builtin_convertvector((__v4du)a, __v4qu); }
> > +
> > +__v8qu	mm512_cvtepu64_epu8_builtin_convertvector(__m512i a)
> > +{
> > +  return __builtin_convertvector((__v8du)a, __v8qu); }
> > +
> > +__v2hu	mm32_cvtepu32_epu16_builtin_convertvector(__v2su a)
> > +{
> > +  return __builtin_convertvector((__v2su)a, __v2hu); }
> > +
> > +__v4hu	mm_cvtepu32_epu16_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v4su)a, __v4hu); }
> > +
> > +__m128i	mm256_cvtepu32_epu16_builtin_convertvector(__m256i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); }
> > +
> > +__m256i	mm512_cvtepu32_epu16_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); }
> > +
> > +__v2qu	mm32_cvtepu32_epu8_builtin_convertvector(__v2su a)
> > +{
> > +  return __builtin_convertvector((__v2su)a, __v2qu); }
> > +
> > +__v4qu	mm_cvtepu2_epu8_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v4su)a, __v4qu); }
> > +
> > +__v8qu	mm256_cvtepu32_epu8_builtin_convertvector(__m256i a)
> > +{
> > +  return __builtin_convertvector((__v8su)a, __v8qu); }
> > +
> > +__m128i	mm512_cvtepu32_epu8_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); }
> > +
> > +__v2qu	mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a)
> > +{
> > +  return __builtin_convertvector((__v2hu)a, __v2qu); }
> > +
> > +__v8qu	mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
> > +{
> > +  return __builtin_convertvector((__v8hu)a, __v8qu); }
> > +
> > +__m128i	mm256_cvtepu16_epu8_builtin_convertvector(__m256i a)
> > +{
> > +  return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); }
> > +
> > +__m256i	mm512_cvtepu16_epu8_builtin_convertvector(__m512i a)
> > +{
> > +  return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-2.c
> > new file mode 100644
> > index 00000000000..02ffd811cb4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c
> > @@ -0,0 +1,105 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> > +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef
> > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char
> > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi
> > +__attribute__ ((__vector_size__ (8)));
> > +
> > +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) {
> > +  return __builtin_convertvector(a, __v2di); }
> > +
> > +__m256i	mm256_cvtepi32_epi64_builtin_convertvector(__v4si a)
> > +{
> > +  return (__m256i)__builtin_convertvector(a, __v4di); }
> > +
> > +__m512i	mm512_cvtepi32_epi64_builtin_convertvector(__v8si a)
> > +{
> > +  return (__m512i)__builtin_convertvector(a, __v8di); }
> > +
> > +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) {
> > +  return __builtin_convertvector(a, __v2di); }
> > +
> > +__m256i	mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a)
> > +{
> > +  return (__m256i)__builtin_convertvector(a, __v4di); }
> > +
> > +__m512i	mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a)
> > +{
> > +  return (__m512i)__builtin_convertvector(a, __v8di); }
> > +
> > +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) {
> > +  return __builtin_convertvector(a, __v2di); }
> > +
> > +__m256i	mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a)
> > +{
> > +  return (__m256i)__builtin_convertvector(a, __v4di); }
> > +
> > +__m512i	mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a)
> > +{
> > +  return (__m512i)__builtin_convertvector(a, __v8di); }
> > +
> > +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) {
> > +  return (__m128i)__builtin_convertvector(a, __v4si); }
> > +
> > +__m256i	mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a)
> > +{
> > +  return (__m256i)__builtin_convertvector(a, __v8si); }
> > +
> > +__m512i	mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a)
> > +{
> > +  return (__m512i)__builtin_convertvector(a, __v16si); }
> > +
> > +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) {
> > +  return (__m128i)__builtin_convertvector(a, __v4si); }
> > +
> > +__m256i	mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a)
> > +{
> > +  return (__m256i)__builtin_convertvector(a, __v8si); }
> > +
> > +__m512i	mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a)
> > +{
> > +  return (__m512i)__builtin_convertvector(a, __v16si); }
> > +
> > +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) {
> > +  return (__m128i)__builtin_convertvector(a, __v8hi); }
> > +
> > +__m256i	mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a)
> > +{
> > +  return (__m256i)__builtin_convertvector(a, __v16hi); }
> > +
> > +__v32hi	mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a)
> > +{
> > +  return __builtin_convertvector(a, __v32hi); }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-3.c
> > new file mode 100644
> > index 00000000000..30dc947b6dd
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c
> > @@ -0,0 +1,55 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
> > +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */
> > +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */
> > +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> > +typedef
> > +_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> > +
> > +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) {
> > +  return __builtin_convertvector(a, __v2sf); }
> > +
> > +__v4sf	mm256_cvtpd_ps_builtin_convertvector(__v4df a)
> > +{
> > +  return __builtin_convertvector(a, __v4sf); }
> > +
> > +__v8sf	mm512_cvtpd_ps_builtin_convertvector(__v8df a)
> > +{
> > +  return __builtin_convertvector(a, __v8sf); }
> > +
> > +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) {
> > +  return __builtin_convertvector(a, __v2hf); }
> > +
> > +__v4hf	mm256_cvtpd_ph_builtin_convertvector(__v4df a)
> > +{
> > +  return __builtin_convertvector(a, __v4hf); }
> > +
> > +__v8hf	mm512_cvtpd_ph_builtin_convertvector(__v8df a)
> > +{
> > +  return __builtin_convertvector(a, __v8hf); }
> > +
> > +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) {
> > +  return __builtin_convertvector(a, __v4hf); }
> > +
> > +__v8hf	mm256_cvtps_ph_builtin_convertvector(__v8sf a)
> > +{
> > +  return __builtin_convertvector(a, __v8hf); }
> > +
> > +__v16hf	mm512_cvtps_ph_builtin_convertvector(__v16sf a)
> > +{
> > +  return __builtin_convertvector(a, __v16hf); }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-4.c
> > new file mode 100644
> > index 00000000000..e537e7349e4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c
> > @@ -0,0 +1,56 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
> > +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */
> > +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> > +typedef
> > +_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> > +
> > +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) {
> > +  return __builtin_convertvector(a, __v2df); }
> > +
> > +__v4df	mm256_cvtps_pd_builtin_convertvector(__v4sf a)
> > +{
> > +  return __builtin_convertvector(a, __v4df); }
> > +
> > +__v8df	mm512_cvtps_pd_builtin_convertvector(__v8sf a)
> > +{
> > +  return __builtin_convertvector(a, __v8df); }
> > +
> > +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) {
> > +  return __builtin_convertvector(a, __v2df); }
> > +
> > +__v4df	mm256_cvtph_pd_builtin_convertvector(__v4hf a)
> > +{
> > +  return __builtin_convertvector(a, __v4df); }
> > +
> > +__v8df	mm512_cvtph_pd_builtin_convertvector(__v8hf a)
> > +{
> > +  return __builtin_convertvector(a, __v8df); }
> > +
> > +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) {
> > +  return __builtin_convertvector(a, __v4sf); }
> > +
> > +__v8sf	mm256_cvtph_ps_builtin_convertvector(__v8hf a)
> > +{
> > +  return __builtin_convertvector(a, __v8sf); }
> > +
> > +__v16sf	mm512_cvtph_ps_builtin_convertvector(__v16hf a)
> > +{
> > +  return __builtin_convertvector(a, __v16sf); }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-5.c
> > new file mode 100644
> > index 00000000000..5a44ef9f3b9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c
> > @@ -0,0 +1,72 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3"
> > +} */
> > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */
> > +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> > +typedef
> > +_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> > +
> > +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) {
> > +  return __builtin_convertvector(a, __v2si); }
> > +
> > +__v4si	mm256_cvtpd_epi32_builtin_convertvector(__v4df a)
> > +{
> > +  return __builtin_convertvector(a, __v4si); }
> > +
> > +__v8si	mm512_cvtpd_epi32_builtin_convertvector(__v8df a)
> > +{
> > +  return __builtin_convertvector(a, __v8si); }
> > +
> > +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) {
> > +  return __builtin_convertvector(a, __v2di); }
> > +
> > +__v4di	mm256_cvtps_epi64_builtin_convertvector(__v4sf a)
> > +{
> > +  return __builtin_convertvector(a, __v4di); }
> > +
> > +__v8di	mm512_cvtps_epi64_builtin_convertvector(__v8sf a)
> > +{
> > +  return __builtin_convertvector(a, __v8di); }
> > +
> > +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) {
> > +  return __builtin_convertvector(a, __v4si); }
> > +
> > +__v8si	mm256_cvtph_epi32_builtin_convertvector(__v8hf a)
> > +{
> > +  return __builtin_convertvector(a, __v8si); }
> > +
> > +__v16si	mm512_cvtph_epi32_builtin_convertvector(__v16hf a)
> > +{
> > +  return __builtin_convertvector(a, __v16si); }
> > +
> > +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) {
> > +  return __builtin_convertvector(a, __v2di); }
> > +
> > +__v4di	mm256_cvtph_epi64_builtin_convertvector(__v4hf a)
> > +{
> > +  return __builtin_convertvector(a, __v4di); }
> > +
> > +__v8di	mm512_cvtph_epi64_builtin_convertvector(__v8hf a)
> > +{
> > +  return __builtin_convertvector(a, __v8di); }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-6.c
> > new file mode 100644
> > index 00000000000..4a68a10b089
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
> > @@ -0,0 +1,139 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq
> > +-fno-trapping-math" } */
> > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { !
> > +ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { !
> > +ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
> > +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } }
> > +} } */
> > +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef
> > +char __v4qi __attribute__ ((__vector_size__ (4))); typedef char
> > +__v8qi __attribute__ ((__vector_size__ (8))); typedef char __v16qi
> > +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu
> > +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu
> > +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu
> > +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu
> > +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf
> > +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf
> > +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf
> > +__attribute__ ((__vector_size__ (16)));
> > +
> > +__v2qi	mm_cvtpd_epi8_builtin_convertvector(__v2df a)
> > +{
> > +  return __builtin_convertvector((__v2df)a, __v2qi); }
> > +
> > +__v4qi	mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
> > +{
> > +  return __builtin_convertvector((__v4df)a, __v4qi); }
> > +
> > +__v8qi	mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
> > +{
> > +  return __builtin_convertvector((__v8df)a, __v8qi); }
> > +
> > +__v2qu	mm_cvtpd_epu8_builtin_convertvector(__v2df a)
> > +{
> > +  return __builtin_convertvector((__v2df)a, __v2qu); }
> > +
> > +__v4qu	mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
> > +{
> > +  return __builtin_convertvector((__v4df)a, __v4qu); }
> > +
> > +__v8qu	mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
> > +{
> > +  return __builtin_convertvector((__v8df)a, __v8qu); }
> > +
> > +__v2qi	mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
> > +{
> > +  return __builtin_convertvector((__v2sf)a, __v2qi); }
> > +
> > +__v4qi	mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
> > +{
> > +  return __builtin_convertvector((__v4sf)a, __v4qi); }
> > +
> > +__v8qi	mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
> > +{
> > +  return __builtin_convertvector((__v8sf)a, __v8qi); }
> > +
> > +__v16qi	mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
> > +{
> > +  return __builtin_convertvector((__v16sf)a, __v16qi); }
> > +
> > +__v2qu	mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
> > +{
> > +  return __builtin_convertvector((__v2sf)a, __v2qu); }
> > +
> > +__v4qu	mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
> > +{
> > +  return __builtin_convertvector((__v4sf)a, __v4qu); }
> > +
> > +__v8qu	mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
> > +{
> > +  return __builtin_convertvector((__v8sf)a, __v8qu); }
> > +
> > +__v16qu	mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
> > +{
> > +  return __builtin_convertvector((__v16sf)a, __v16qu); }
> > +
> > +__v2qi	mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
> > +{
> > +  return __builtin_convertvector((__v2hf)a, __v2qi); }
> > +
> > +__v8qi	mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
> > +{
> > +  return __builtin_convertvector((__v8hf)a, __v8qi); }
> > +
> > +__v16qi	mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
> > +{
> > +  return __builtin_convertvector((__v16hf)a, __v16qi); }
> > +
> > +__v32qi	mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
> > +{
> > +  return __builtin_convertvector((__v32hf)a, __v32qi); }
> > +
> > +__v2qu	mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
> > +{
> > +  return __builtin_convertvector((__v2hf)a, __v2qu); }
> > +
> > +__v8qu	mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
> > +{
> > +  return __builtin_convertvector((__v8hf)a, __v8qu); }
> > +
> > +__v16qu	mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
> > +{
> > +  return __builtin_convertvector((__v16hf)a, __v16qu); }
> > +
> > +__v32qu	mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
> > +{
> > +  return __builtin_convertvector((__v32hf)a, __v32qu); }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> > new file mode 100644
> > index 00000000000..0ff5a97ed1a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> > @@ -0,0 +1,156 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq
> > +-fno-trapping-math" } */
> > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } }
> > +} } */
> > +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32
> > +} } } } */
> > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 }
> > +} } } */
> > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32
> > +} } } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef
> > +char __v4qi __attribute__ ((__vector_size__ (4))); typedef char
> > +__v8qi __attribute__ ((__vector_size__ (8))); typedef char __v16qi
> > +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu
> > +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu
> > +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu
> > +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu
> > +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf
> > +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf
> > +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf
> > +__attribute__ ((__vector_size__ (16)));
> > +
> > +__v2df	mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
> > +{
> > +  return __builtin_convertvector((__v2qi)a, __v2df); }
> > +
> > +__v4df	mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
> > +{
> > +  return __builtin_convertvector((__v4qi)a, __v4df); }
> > +
> > +__v8df	mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
> > +{
> > +  return __builtin_convertvector((__v8qi)a, __v8df); }
> > +
> > +__v2df	mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
> > +{
> > +  return __builtin_convertvector((__v2qu)a, __v2df); }
> > +
> > +__v4df	mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
> > +{
> > +  return __builtin_convertvector((__v4qu)a, __v4df); }
> > +
> > +__v8df	mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
> > +{
> > +  return __builtin_convertvector((__v8qu)a, __v8df); }
> > +
> > +__v2sf	mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
> > +{
> > +  return __builtin_convertvector((__v2qi)a, __v2sf); }
> > +
> > +__v4sf	mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
> > +{
> > +  return __builtin_convertvector((__v4qi)a, __v4sf); }
> > +
> > +__v8sf	mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
> > +{
> > +  return __builtin_convertvector((__v8qi)a, __v8sf); }
> > +
> > +__v16sf	mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
> > +{
> > +  return __builtin_convertvector((__v16qi)a, __v16sf); }
> > +
> > +__v2sf	mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
> > +{
> > +  return __builtin_convertvector((__v2qu)a, __v2sf); }
> > +
> > +__v4sf	mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
> > +{
> > +  return __builtin_convertvector((__v4qu)a, __v4sf); }
> > +
> > +__v8sf	mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
> > +{
> > +  return __builtin_convertvector((__v8qu)a, __v8sf); }
> > +
> > +__v16sf	mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
> > +{
> > +  return __builtin_convertvector((__v16qu)a, __v16sf); }
> > +
> > +__v2hf	mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
> > +{
> > +  return __builtin_convertvector((__v2qi)a, __v2hf); }
> > +
> > +__v4hf	mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
> > +{
> > +  return __builtin_convertvector((__v4qi)a, __v4hf); }
> > +
> > +__v8hf	mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
> > +{
> > +  return __builtin_convertvector((__v8qi)a, __v8hf); }
> > +
> > +__v16hf	mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
> > +{
> > +  return __builtin_convertvector((__v16qi)a, __v16hf); }
> > +
> > +__v32hf	mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
> > +{
> > +  return __builtin_convertvector((__v32qi)a, __v32hf); }
> > +
> > +__v2hf	mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
> > +{
> > +  return __builtin_convertvector((__v2qu)a, __v2hf); }
> > +
> > +__v4hf	mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
> > +{
> > +  return __builtin_convertvector((__v4qu)a, __v4hf); }
> > +
> > +__v8hf	mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
> > +{
> > +  return __builtin_convertvector((__v8qu)a, __v8hf); }
> > +
> > +__v16hf	mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
> > +{
> > +  return __builtin_convertvector((__v16qu)a, __v16hf); }
> > +
> > +__v32hf	mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
> > +{
> > +  return __builtin_convertvector((__v32qu)a, __v32hf); }
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> > ab640096ca2..e14fac9f179 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
> #include "gimple-match.h"
> >  #include "recog.h"		/* FIXME: for insn_data */
> >  #include "optabs-libfuncs.h"
> > +#include "cfgloop.h"
> > +#include "tree-vectorizer.h"
> >
> >
> >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > @@ -1834,6 +1836,102 @@ do_vec_narrow_conversion
> (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> >    return gimplify_build2 (gsi, code, outer_type, b, c);  }
> >
> > +/* A subroutine of expand_vector_conversion, support indirect conversion
> for
> > +   float <-> int, like char -> double.  */ bool
> > +expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi,
> > +				      enum tree_code code,
> > +				      tree lhs,
> > +				      tree arg)
> > +{
> > +  gimple *g;
> > +  tree ret_type = TREE_TYPE (lhs);
> > +  tree arg_type = TREE_TYPE (arg);
> > +  tree new_rhs;
> > +  enum {NARROW, NONE, WIDEN} modifier = NONE;
> > +  enum tree_code code1 = ERROR_MARK;
> > +  enum tree_code codecvt1 = ERROR_MARK;
> > +  bool float_expr_p = code == FLOAT_EXPR;
> > +
> > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > +    {
> > +      g = gimple_build_assign (lhs, code1, arg);
> > +      gsi_replace (gsi, g, false);
> > +      return true;
> > +    }
> > +
> > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > + (ret_elt_bits <
> > + arg_elt_bits)
> > +    modifier = NARROW;
> > +  else if (ret_elt_bits > arg_elt_bits)
> > +    modifier = WIDEN;
> > +
> > +  if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier ==
> NARROW)
> > +       || (code == FLOAT_EXPR && modifier == WIDEN)))
> > +    {
> > +      unsigned short target_size;
> > +      scalar_mode tmp_cvt_mode;
> > +      scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > +      scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> > +      tree cvt_type = NULL_TREE;
> > +      if (modifier == NARROW)
> > +	{
> > +	  tmp_cvt_mode = lhs_mode;
> > +	  target_size = GET_MODE_SIZE (rhs_mode);
> > +	}
> > +      else
> > +	{
> > +	  target_size = GET_MODE_SIZE (lhs_mode);
> > +	  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
> > +	  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> > +	    return false;
> > +	}
> > +
> > +      code1 = float_expr_p ? code : NOP_EXPR;
> > +      codecvt1 = float_expr_p ? NOP_EXPR : code;
> > +      opt_scalar_mode mode_iter;
> > +      enum tree_code tc1, tc2;
> > +      unsigned HOST_WIDE_INT nelts
> > +	= constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > +
> > +      FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > +	{
> > +	  tmp_cvt_mode = mode_iter.require ();
> > +
> > +	  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > +	    break;
> > +
> > +	  scalar_mode cvt_mode;
> > +	  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > +	  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > +	    break;
> > +
> > +	  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > +	  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> (arg_type);
> > +	  cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> > +
> > +	  cvt_type = build_vector_type (cvt_type, nelts);
> > +	  if (cvt_type == NULL_TREE
> > +	      || !supportable_convert_operation ((tree_code) code1,
> > +						 ret_type,
> > +						 cvt_type, &tc1)
> > +	      || !supportable_convert_operation ((tree_code) codecvt1,
> > +						 cvt_type,
> > +						 arg_type, &tc2))
> > +	    continue;
> > +
> > +	  new_rhs = make_ssa_name (cvt_type);
> > +	  g = vect_gimple_build (new_rhs, tc2, arg);
> > +	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +	  g = gimple_build_assign (lhs, tc1, new_rhs);
> > +	  gsi_replace (gsi, g, false);
> > +	  return true;
> > +	}
> > +    }
> > +  return false;
> > +}
> > +
> >  /* Expand VEC_CONVERT ifn call.  */
> >
> >  static void
> > @@ -1871,14 +1969,11 @@ expand_vector_conversion
> (gimple_stmt_iterator *gsi)
> >    else if (ret_elt_bits > arg_elt_bits)
> >      modifier = WIDEN;
> >
> > +  if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg))
> > +    return;
> > +
> >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> FLOAT_EXPR))
> >      {
> > -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > -	{
> > -	  g = gimple_build_assign (lhs, code1, arg);
> > -	  gsi_replace (gsi, g, false);
> > -	  return;
> > -	}
> >        /* Can't use get_compute_type here, as supportable_convert_operation
> >  	 doesn't necessarily use an optab and needs two arguments.  */
> >        tree vec_compute_type
> > --
> > 2.31.1
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 00000000000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2si);
+}
+
+__m128i	mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
+}
+
+__m256i	mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
+}
+
+__v2hi	mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi);
+}
+
+__v4hi	mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi);
+}
+
+__m128i	mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
+}
+
+__v2qi	mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi);
+}
+
+__v4qi	mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4qi);
+}
+
+__v8qi	mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8di)a, __v8qi);
+}
+
+__v2hi	mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2hi);
+}
+
+__v4hi	mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4hi);
+}
+
+__m128i	mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v8si)a, __v8hi);
+}
+
+__m256i	mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v16si)a, __v16hi);
+}
+
+__v2qi	mm64_cvtepi32_epi8_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2qi);
+}
+
+__v4qi	mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4qi);
+}
+
+__v8qi	mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v8si)a, __v8qi);
+}
+
+__m128i	mm512_cvtepi32_epi8_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v16si)a, __v16qi);
+}
+
+__v2qi	mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a)
+{
+  return __builtin_convertvector((__v2hi)a, __v2qi);
+}
+
+__v8qi	mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v8hi)a, __v8qi);
+}
+
+__m128i	mm256_cvtepi16_epi8_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi);
+}
+
+__m256i	mm512_cvtepi16_epi8_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi);
+}
+
+__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2du)a, __v2su);
+}
+
+__m128i	mm256_cvtepu64_epu32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4du)a, __v4su);
+}
+
+__m256i	mm512_cvtepu64_epu32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8du)a, __v8su);
+}
+
+__v2hu	mm_cvtepu64_epu16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2du)a, __v2hu);
+}
+
+__v4hu	mm256_cvtepu64_epu16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4du)a, __v4hu);
+}
+
+__m128i	mm512_cvtepu64_epu16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8du)a, __v8hu);
+}
+
+__v2qu	mm_cvtepu64_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2du)a, __v2qu);
+}
+
+__v4qu	mm256_cvtepu64_epu8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4du)a, __v4qu);
+}
+
+__v8qu	mm512_cvtepu64_epu8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8du)a, __v8qu);
+}
+
+__v2hu	mm32_cvtepu32_epu16_builtin_convertvector(__v2su a)
+{
+  return __builtin_convertvector((__v2su)a, __v2hu);
+}
+
+__v4hu	mm_cvtepu32_epu16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4su)a, __v4hu);
+}
+
+__m128i	mm256_cvtepu32_epu16_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v8su)a, __v8hu);
+}
+
+__m256i	mm512_cvtepu32_epu16_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v16su)a, __v16hu);
+}
+
+__v2qu	mm32_cvtepu32_epu8_builtin_convertvector(__v2su a)
+{
+  return __builtin_convertvector((__v2su)a, __v2qu);
+}
+
+__v4qu	mm_cvtepu2_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4su)a, __v4qu);
+}
+
+__v8qu	mm256_cvtepu32_epu8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v8su)a, __v8qu);
+}
+
+__m128i	mm512_cvtepu32_epu8_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v16su)a, __v16qu);
+}
+
+__v2qu	mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a)
+{
+  return __builtin_convertvector((__v2hu)a, __v2qu);
+}
+
+__v8qu	mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v8hu)a, __v8qu);
+}
+
+__m128i	mm256_cvtepu16_epu8_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu);
+}
+
+__m256i	mm512_cvtepu16_epu8_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c
new file mode 100644
index 00000000000..02ffd811cb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c
@@ -0,0 +1,105 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__m256i	mm256_cvtepi32_epi64_builtin_convertvector(__v4si a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di);
+}
+
+__m512i	mm512_cvtepi32_epi64_builtin_convertvector(__v8si a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di);
+}
+
+__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__m256i	mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di);
+}
+
+__m512i	mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di);
+}
+
+__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__m256i	mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v4di);
+}
+
+__m512i	mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v8di);
+}
+
+__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a)
+{
+  return (__m128i)__builtin_convertvector(a, __v4si);
+}
+
+__m256i	mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v8si);
+}
+
+__m512i	mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v16si);
+}
+
+__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a)
+{
+  return (__m128i)__builtin_convertvector(a, __v4si);
+}
+
+__m256i	mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v8si);
+}
+
+__m512i	mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a)
+{
+  return (__m512i)__builtin_convertvector(a, __v16si);
+}
+
+__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a)
+{
+  return (__m128i)__builtin_convertvector(a, __v8hi);
+}
+
+__m256i	mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a)
+{
+  return (__m256i)__builtin_convertvector(a, __v16hi);
+}
+
+__v32hi	mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a)
+{
+  return __builtin_convertvector(a, __v32hi);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c
new file mode 100644
index 00000000000..30dc947b6dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c
@@ -0,0 +1,55 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector(a, __v2sf);
+}
+
+__v4sf	mm256_cvtpd_ps_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4sf);
+}
+
+__v8sf	mm512_cvtpd_ps_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8sf);
+}
+
+__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector(a, __v2hf);
+}
+
+__v4hf	mm256_cvtpd_ph_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4hf);
+}
+
+__v8hf	mm512_cvtpd_ph_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8hf);
+}
+
+__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector(a, __v4hf);
+}
+
+__v8hf	mm256_cvtps_ph_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8hf);
+}
+
+__v16hf	mm512_cvtps_ph_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector(a, __v16hf);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c
new file mode 100644
index 00000000000..e537e7349e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector(a, __v2df);
+}
+
+__v4df	mm256_cvtps_pd_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector(a, __v4df);
+}
+
+__v8df	mm512_cvtps_pd_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8df);
+}
+
+__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector(a, __v2df);
+}
+
+__v4df	mm256_cvtph_pd_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4df);
+}
+
+__v8df	mm512_cvtph_pd_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8df);
+}
+
+__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4sf);
+}
+
+__v8sf	mm256_cvtph_ps_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8sf);
+}
+
+__v16sf	mm512_cvtph_ps_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector(a, __v16sf);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c
new file mode 100644
index 00000000000..5a44ef9f3b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c
@@ -0,0 +1,72 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */
+/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector(a, __v2si);
+}
+
+__v4si	mm256_cvtpd_epi32_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector(a, __v4si);
+}
+
+__v8si	mm512_cvtpd_epi32_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector(a, __v8si);
+}
+
+__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__v4di	mm256_cvtps_epi64_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector(a, __v4di);
+}
+
+__v8di	mm512_cvtps_epi64_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector(a, __v8di);
+}
+
+__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4si);
+}
+
+__v8si	mm256_cvtph_epi32_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8si);
+}
+
+__v16si	mm512_cvtph_epi32_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector(a, __v16si);
+}
+
+__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector(a, __v2di);
+}
+
+__v4di	mm256_cvtph_epi64_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector(a, __v4di);
+}
+
+__v8di	mm512_cvtph_epi64_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector(a, __v8di);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c
new file mode 100644
index 00000000000..4a68a10b089
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
@@ -0,0 +1,139 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
+typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
+typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
+typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
+
+__v2qi	mm_cvtpd_epi8_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector((__v2df)a, __v2qi);
+}
+
+__v4qi	mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector((__v4df)a, __v4qi);
+}
+
+__v8qi	mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector((__v8df)a, __v8qi);
+}
+
+__v2qu	mm_cvtpd_epu8_builtin_convertvector(__v2df a)
+{
+  return __builtin_convertvector((__v2df)a, __v2qu);
+}
+
+__v4qu	mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
+{
+  return __builtin_convertvector((__v4df)a, __v4qu);
+}
+
+__v8qu	mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
+{
+  return __builtin_convertvector((__v8df)a, __v8qu);
+}
+
+__v2qi	mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector((__v2sf)a, __v2qi);
+}
+
+__v4qi	mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector((__v4sf)a, __v4qi);
+}
+
+__v8qi	mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector((__v8sf)a, __v8qi);
+}
+
+__v16qi	mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector((__v16sf)a, __v16qi);
+}
+
+__v2qu	mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
+{
+  return __builtin_convertvector((__v2sf)a, __v2qu);
+}
+
+__v4qu	mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
+{
+  return __builtin_convertvector((__v4sf)a, __v4qu);
+}
+
+__v8qu	mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
+{
+  return __builtin_convertvector((__v8sf)a, __v8qu);
+}
+
+__v16qu	mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
+{
+  return __builtin_convertvector((__v16sf)a, __v16qu);
+}
+
+__v2qi	mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector((__v2hf)a, __v2qi);
+}
+
+__v8qi	mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector((__v8hf)a, __v8qi);
+}
+
+__v16qi	mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector((__v16hf)a, __v16qi);
+}
+
+__v32qi	mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
+{
+  return __builtin_convertvector((__v32hf)a, __v32qi);
+}
+
+__v2qu	mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
+{
+  return __builtin_convertvector((__v2hf)a, __v2qu);
+}
+
+__v8qu	mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
+{
+  return __builtin_convertvector((__v8hf)a, __v8qu);
+}
+
+__v16qu	mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector((__v16hf)a, __v16qu);
+}
+
+__v32qu	mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
+{
+  return __builtin_convertvector((__v32hf)a, __v32qu);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
new file mode 100644
index 00000000000..0ff5a97ed1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
@@ -0,0 +1,156 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
+typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
+typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
+typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
+
+__v2df	mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2df);
+}
+
+__v4df	mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4df);
+}
+
+__v8df	mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8df);
+}
+
+__v2df	mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2df);
+}
+
+__v4df	mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4df);
+}
+
+__v8df	mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8df);
+}
+
+__v2sf	mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2sf);
+}
+
+__v4sf	mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4sf);
+}
+
+__v8sf	mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8sf);
+}
+
+__v16sf	mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
+{
+  return __builtin_convertvector((__v16qi)a, __v16sf);
+}
+
+__v2sf	mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2sf);
+}
+
+__v4sf	mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4sf);
+}
+
+__v8sf	mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8sf);
+}
+
+__v16sf	mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
+{
+  return __builtin_convertvector((__v16qu)a, __v16sf);
+}
+
+__v2hf	mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2hf);
+}
+
+__v4hf	mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4hf);
+}
+
+__v8hf	mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8hf);
+}
+
+__v16hf	mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
+{
+  return __builtin_convertvector((__v16qi)a, __v16hf);
+}
+
+__v32hf	mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
+{
+  return __builtin_convertvector((__v32qi)a, __v32hf);
+}
+
+__v2hf	mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2hf);
+}
+
+__v4hf	mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4hf);
+}
+
+__v8hf	mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8hf);
+}
+
+__v16hf	mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
+{
+  return __builtin_convertvector((__v16qu)a, __v16hf);
+}
+
+__v32hf	mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
+{
+  return __builtin_convertvector((__v32qu)a, __v32hf);
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index ab640096ca2..e14fac9f179 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -45,6 +45,8 @@  along with GCC; see the file COPYING3.  If not see
 #include "gimple-match.h"
 #include "recog.h"		/* FIXME: for insn_data */
 #include "optabs-libfuncs.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
 
 
 /* Build a ternary operation and gimplify it.  Emit code before GSI.
@@ -1834,6 +1836,102 @@  do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
   return gimplify_build2 (gsi, code, outer_type, b, c);
 }
 
+/* A subroutine of expand_vector_conversion, support indirect conversion for
+   float <-> int, like char -> double.  */
+bool
+expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi,
+				      enum tree_code code,
+				      tree lhs,
+				      tree arg)
+{
+  gimple *g;
+  tree ret_type = TREE_TYPE (lhs);
+  tree arg_type = TREE_TYPE (arg);
+  tree new_rhs;
+  enum {NARROW, NONE, WIDEN} modifier = NONE;
+  enum tree_code code1 = ERROR_MARK;
+  enum tree_code codecvt1 = ERROR_MARK;
+  bool float_expr_p = code == FLOAT_EXPR;
+
+  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
+    {
+      g = gimple_build_assign (lhs, code1, arg);
+      gsi_replace (gsi, g, false);
+      return true;
+    }
+
+  unsigned int ret_elt_bits = vector_element_bits (ret_type);
+  unsigned int arg_elt_bits = vector_element_bits (arg_type);
+  if (ret_elt_bits < arg_elt_bits)
+    modifier = NARROW;
+  else if (ret_elt_bits > arg_elt_bits)
+    modifier = WIDEN;
+
+  if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier == NARROW)
+       || (code == FLOAT_EXPR && modifier == WIDEN)))
+    {
+      unsigned short target_size;
+      scalar_mode tmp_cvt_mode;
+      scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
+      scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
+      tree cvt_type = NULL_TREE;
+      if (modifier == NARROW)
+	{
+	  tmp_cvt_mode = lhs_mode;
+	  target_size = GET_MODE_SIZE (rhs_mode);
+	}
+      else
+	{
+	  target_size = GET_MODE_SIZE (lhs_mode);
+	  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
+	  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
+	    return false;
+	}
+
+      code1 = float_expr_p ? code : NOP_EXPR;
+      codecvt1 = float_expr_p ? NOP_EXPR : code;
+      opt_scalar_mode mode_iter;
+      enum tree_code tc1, tc2;
+      unsigned HOST_WIDE_INT nelts
+	= constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
+
+      FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
+	{
+	  tmp_cvt_mode = mode_iter.require ();
+
+	  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
+	    break;
+
+	  scalar_mode cvt_mode;
+	  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
+	  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
+	    break;
+
+	  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
+	  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
+	  cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
+
+	  cvt_type = build_vector_type (cvt_type, nelts);
+	  if (cvt_type == NULL_TREE
+	      || !supportable_convert_operation ((tree_code) code1,
+						 ret_type,
+						 cvt_type, &tc1)
+	      || !supportable_convert_operation ((tree_code) codecvt1,
+						 cvt_type,
+						 arg_type, &tc2))
+	    continue;
+
+	  new_rhs = make_ssa_name (cvt_type);
+	  g = vect_gimple_build (new_rhs, tc2, arg);
+	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
+	  g = gimple_build_assign (lhs, tc1, new_rhs);
+	  gsi_replace (gsi, g, false);
+	  return true;
+	}
+    }
+  return false;
+}
+
 /* Expand VEC_CONVERT ifn call.  */
 
 static void
@@ -1871,14 +1969,11 @@  expand_vector_conversion (gimple_stmt_iterator *gsi)
   else if (ret_elt_bits > arg_elt_bits)
     modifier = WIDEN;
 
+  if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg))
+    return;
+
   if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
     {
-      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
-	{
-	  g = gimple_build_assign (lhs, code1, arg);
-	  gsi_replace (gsi, g, false);
-	  return;
-	}
       /* Can't use get_compute_type here, as supportable_convert_operation
 	 doesn't necessarily use an optab and needs two arguments.  */
       tree vec_compute_type