diff mbox series

[59/62] AVX512FP16: Support load/store/abs intrinsics.

Message ID 20210701061648.9447-60-hongtao.liu@intel.com
State New
Headers show
Series Support all AVX512FP16 intrinsics. | expand

Commit Message

liuhongt July 1, 2021, 6:16 a.m. UTC
From: dianhong xu <dianhong.xu@intel.com>

gcc/ChangeLog:

	* config/i386/avx512fp16intrin.h (__m512h_u, __m256h_u,
	__m128h_u): New typedef.
	(_mm512_load_ph): New intrinsic.
	(_mm256_load_ph): Ditto.
	(_mm_load_ph): Ditto.
	(_mm512_loadu_ph): Ditto.
	(_mm256_loadu_ph): Ditto.
	(_mm_loadu_ph): Ditto.
	(_mm512_store_ph): Ditto.
	(_mm256_store_ph): Ditto.
	(_mm_store_ph): Ditto.
	(_mm512_storeu_ph): Ditto.
	(_mm256_storeu_ph): Ditto.
	(_mm_storeu_ph): Ditto.
	(_mm512_abs_ph): Ditto.
	* config/i386/avx512fp16vlintrin.h
	(_mm_abs_ph): Ditto.
	(_mm256_abs_ph): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-13.c: New test.
---
 gcc/config/i386/avx512fp16intrin.h            |  97 ++++++++++++
 gcc/config/i386/avx512fp16vlintrin.h          |  16 ++
 gcc/testsuite/gcc.target/i386/avx512fp16-13.c | 143 ++++++++++++++++++
 3 files changed, 256 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-13.c

Comments

Hongtao Liu Sept. 22, 2021, 10:30 a.m. UTC | #1
I'm going to check in 4 patches.

[PATCH 59/62] AVX512FP16: Support load/store/abs intrinsics.
[PATCH 60/62] AVX512FP16: Add reduce operators(add/mul/min/max).
[PATCH 61/62] AVX512FP16: Add complex conjugation intrinsic instructions.
[PATCH 62/62] AVX512FP16: Add permutation and mask blend intrinsics.

  Bootstrapped and regtest on x86_64-pc-linux-gnu{-m32,}.
  Newly added runtime tests passed on sde{-m32,}.

On Thu, Jul 1, 2021 at 2:18 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> From: dianhong xu <dianhong.xu@intel.com>
>
> gcc/ChangeLog:
>
>         * config/i386/avx512fp16intrin.h (__m512h_u, __m256h_u,
>         __m128h_u): New typedef.
>         (_mm512_load_ph): New intrinsic.
>         (_mm256_load_ph): Ditto.
>         (_mm_load_ph): Ditto.
>         (_mm512_loadu_ph): Ditto.
>         (_mm256_loadu_ph): Ditto.
>         (_mm_loadu_ph): Ditto.
>         (_mm512_store_ph): Ditto.
>         (_mm256_store_ph): Ditto.
>         (_mm_store_ph): Ditto.
>         (_mm512_storeu_ph): Ditto.
>         (_mm256_storeu_ph): Ditto.
>         (_mm_storeu_ph): Ditto.
>         (_mm512_abs_ph): Ditto.
>         * config/i386/avx512fp16vlintrin.h
>         (_mm_abs_ph): Ditto.
>         (_mm256_abs_ph): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx512fp16-13.c: New test.
> ---
>  gcc/config/i386/avx512fp16intrin.h            |  97 ++++++++++++
>  gcc/config/i386/avx512fp16vlintrin.h          |  16 ++
>  gcc/testsuite/gcc.target/i386/avx512fp16-13.c | 143 ++++++++++++++++++
>  3 files changed, 256 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-13.c
>
> diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
> index 39c10beb1de..b8ca9201828 100644
> --- a/gcc/config/i386/avx512fp16intrin.h
> +++ b/gcc/config/i386/avx512fp16intrin.h
> @@ -45,6 +45,11 @@ typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__));
>  typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__));
>  typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__));
>
> +/* Unaligned version of the same type.  */
> +typedef _Float16 __m128h_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
> +typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1)));
> +typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
> +
>  extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5,
> @@ -362,6 +367,48 @@ _mm_load_sh (void const *__P)
>                      *(_Float16 const *) __P);
>  }
>
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_load_ph (void const *__P)
> +{
> +  return *(const __m512h *) __P;
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_load_ph (void const *__P)
> +{
> +  return *(const __m256h *) __P;
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_load_ph (void const *__P)
> +{
> +  return *(const __m128h *) __P;
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_loadu_ph (void const *__P)
> +{
> +  return *(const __m512h_u *) __P;
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_loadu_ph (void const *__P)
> +{
> +  return *(const __m256h_u *) __P;
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_loadu_ph (void const *__P)
> +{
> +  return *(const __m128h_u *) __P;
> +}
> +
>  /* Stores the lower _Float16 value.  */
>  extern __inline void
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> @@ -370,6 +417,56 @@ _mm_store_sh (void *__P, __m128h __A)
>    *(_Float16 *) __P = ((__v8hf)__A)[0];
>  }
>
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_store_ph (void *__P, __m512h __A)
> +{
> +   *(__m512h *) __P = __A;
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_store_ph (void *__P, __m256h __A)
> +{
> +   *(__m256h *) __P = __A;
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_store_ph (void *__P, __m128h __A)
> +{
> +   *(__m128h *) __P = __A;
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_storeu_ph (void *__P, __m512h __A)
> +{
> +   *(__m512h_u *) __P = __A;
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_storeu_ph (void *__P, __m256h __A)
> +{
> +   *(__m256h_u *) __P = __A;
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_storeu_ph (void *__P, __m128h __A)
> +{
> +   *(__m128h_u *) __P = __A;
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_abs_ph(__m512h __A)
> +{
> +  return (__m512h) _mm512_and_epi32 ( _mm512_set1_epi32(0x7FFF7FFF),
> +                                    (__m512i) __A);
> +}
> +
>  /* Intrinsics v[add,sub,mul,div]ph.  */
>  extern __inline __m512h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h
> index c7bdfbc0517..d4aa9928406 100644
> --- a/gcc/config/i386/avx512fp16vlintrin.h
> +++ b/gcc/config/i386/avx512fp16vlintrin.h
> @@ -425,6 +425,22 @@ _mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C)
>                                            _mm256_setzero_ph (), __A);
>  }
>
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_abs_ph (__m128h __A)
> +{
> +  return (__m128h) _mm_and_si128 ( _mm_set1_epi32(0x7FFF7FFF),
> +                                 (__m128i) __A);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_abs_ph (__m256h __A)
> +{
> +  return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32(0x7FFF7FFF),
> +                                    (__m256i) __A);
> +}
> +
>  /* vcmpph */
>  #ifdef __OPTIMIZE
>  extern __inline __mmask8
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-13.c b/gcc/testsuite/gcc.target/i386/avx512fp16-13.c
> new file mode 100644
> index 00000000000..3b6219e493f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-13.c
> @@ -0,0 +1,143 @@
> +/* { dg-do compile} */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> +
> +#include <immintrin.h>
> +void
> +__attribute__ ((noinline, noclone))
> +store512_ph (void *p, __m512h a)
> +{
> +  _mm512_store_ph (p, a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)" 1 } } */
> +
> +void
> +__attribute__ ((noinline, noclone))
> +store256_ph (void *p, __m256h a)
> +{
> +  _mm256_store_ph (p, a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)" 1 } } */
> +
> +void
> +__attribute__ ((noinline, noclone))
> +store_ph (void *p, __m128h a)
> +{
> +  _mm_store_ph (p, a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)" 1 } } */
> +
> +__m512h
> +__attribute__ ((noinline, noclone))
> +load512_ph (void const *p)
> +{
> +  return _mm512_load_ph (p);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)" 1 } } */
> +
> +__m256h
> +__attribute__ ((noinline, noclone))
> +load256_ph (void const *p)
> +{
> +  return _mm256_load_ph (p);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)" 1 } } */
> +
> +__m128h
> +__attribute__ ((noinline, noclone))
> +load_ph (void const *p)
> +{
> +  return _mm_load_ph (p);
> +}
> +/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)" 1 } } */
> +
> +__m512h
> +__attribute__ ((noinline, noclone))
> +load512u_ph (void const *p)
> +{
> +  return _mm512_loadu_ph (p);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^,\]*,\[^\{\n\]*%zmm\[0-9\]" 1 } } */
> +
> +__m256h
> +__attribute__ ((noinline, noclone))
> +load256u_ph (void const *p)
> +{
> +  return _mm256_loadu_ph (p);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^,\]*,\[^\{\n\]*%ymm\[0-9\]" 1 } } */
> +
> +__m128h
> +__attribute__ ((noinline, noclone))
> +load128u_ph (void const *p)
> +{
> +  return _mm_loadu_ph (p);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^,\]*,\[^\{\n\]*%xmm\[0-9\]" 1 } } */
> +
> +void
> +__attribute__ ((noinline, noclone))
> +store512u_ph (void *p, __m512h a)
> +{
> +  return _mm512_storeu_ph (p, a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^\{\n\]*%zmm\[0-9\], *\[^,\]*" 1 } } */
> +
> +void
> +__attribute__ ((noinline, noclone))
> +store256u_ph (void *p, __m256h a)
> +{
> +  return _mm256_storeu_ph (p, a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^\{\n\]*%ymm\[0-9\], *\[^,\]*" 1 } } */
> +
> +void
> +__attribute__ ((noinline, noclone))
> +storeu_ph (void *p, __m128h a)
> +{
> +  return _mm_storeu_ph (p, a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^\{\n\]*%xmm\[0-9\], *\[^,\]*" 1 } } */
> +
> +__m512h
> +__attribute__ ((noinline, noclone))
> +abs512_ph (__m512h a)
> +{
> +  return _mm512_abs_ph (a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vpandd\[ \\t\]+\[^\n\]*\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 { target {! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastd\[^\n\]*%zmm\[0-9\]+" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "vpandd\[^\n\]*%zmm\[0-9\]+" 1 { target ia32 } } } */
> +
> +__m256h
> +__attribute__ ((noinline, noclone))
> +abs256_ph (__m256h a)
> +{
> +  return _mm256_abs_ph (a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vpandq\[ \\t\]+\[^\n\]*\\\{1to\[1-4\]+\\\}, %ymm\[0-9\]+, %ymm0" 1 { target {! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastq\[^\n\]*%ymm\[0-9\]+" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%ymm\[0-9\]+" 1 { target ia32 } } } */
> +
> +__m128h
> +__attribute__ ((noinline, noclone))
> +abs_ph (__m128h a)
> +{
> +  return _mm_abs_ph (a);
> +}
> +
> +/* { dg-final { scan-assembler-times "vpandq\[ \\t\]+\[^\n\]*\\\{1to\[1-2\]+\\\}, %xmm\[0-9\]+, %xmm0" 1 { target {! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastq\[^\n\]*%xmm\[0-9\]+" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%xmm\[0-9\]+" 1 { target ia32 } } } */
> --
> 2.18.1
>
diff mbox series

Patch

diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
index 39c10beb1de..b8ca9201828 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -45,6 +45,11 @@  typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__));
 typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__));
 typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__));
 
+/* Unaligned version of the same type.  */
+typedef _Float16 __m128h_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1)));
+typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+
 extern __inline __m128h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5,
@@ -362,6 +367,48 @@  _mm_load_sh (void const *__P)
 		     *(_Float16 const *) __P);
 }
 
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_ph (void const *__P)
+{
+  return *(const __m512h *) __P;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_ph (void const *__P)
+{
+  return *(const __m256h *) __P;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ph (void const *__P)
+{
+  return *(const __m128h *) __P;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_ph (void const *__P)
+{
+  return *(const __m512h_u *) __P;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_ph (void const *__P)
+{
+  return *(const __m256h_u *) __P;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ph (void const *__P)
+{
+  return *(const __m128h_u *) __P;
+}
+
 /* Stores the lower _Float16 value.  */
 extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -370,6 +417,56 @@  _mm_store_sh (void *__P, __m128h __A)
   *(_Float16 *) __P = ((__v8hf)__A)[0];
 }
 
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_ph (void *__P, __m512h __A)
+{
+   *(__m512h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_ph (void *__P, __m256h __A)
+{
+   *(__m256h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ph (void *__P, __m128h __A)
+{
+   *(__m128h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_ph (void *__P, __m512h __A)
+{
+   *(__m512h_u *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_ph (void *__P, __m256h __A)
+{
+   *(__m256h_u *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ph (void *__P, __m128h __A)
+{
+   *(__m128h_u *) __P = __A;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_ph(__m512h __A)
+{
+  return (__m512h) _mm512_and_epi32 ( _mm512_set1_epi32(0x7FFF7FFF),
+				     (__m512i) __A);
+}
+
 /* Intrinsics v[add,sub,mul,div]ph.  */
 extern __inline __m512h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h
index c7bdfbc0517..d4aa9928406 100644
--- a/gcc/config/i386/avx512fp16vlintrin.h
+++ b/gcc/config/i386/avx512fp16vlintrin.h
@@ -425,6 +425,22 @@  _mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C)
 					   _mm256_setzero_ph (), __A);
 }
 
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_ph (__m128h __A)
+{
+  return (__m128h) _mm_and_si128 ( _mm_set1_epi32(0x7FFF7FFF),
+				  (__m128i) __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_ph (__m256h __A)
+{
+  return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32(0x7FFF7FFF),
+				     (__m256i) __A);
+}
+
 /* vcmpph */
 #ifdef __OPTIMIZE
 extern __inline __mmask8
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-13.c b/gcc/testsuite/gcc.target/i386/avx512fp16-13.c
new file mode 100644
index 00000000000..3b6219e493f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-13.c
@@ -0,0 +1,143 @@ 
+/* { dg-do compile} */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+#include <immintrin.h>
+void
+__attribute__ ((noinline, noclone))
+store512_ph (void *p, __m512h a)
+{
+  _mm512_store_ph (p, a);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)" 1 } } */
+
+void
+__attribute__ ((noinline, noclone))
+store256_ph (void *p, __m256h a)
+{
+  _mm256_store_ph (p, a);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)" 1 } } */
+
+void
+__attribute__ ((noinline, noclone))
+store_ph (void *p, __m128h a)
+{
+  _mm_store_ph (p, a);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)" 1 } } */
+
+__m512h
+__attribute__ ((noinline, noclone))
+load512_ph (void const *p)
+{
+  return _mm512_load_ph (p);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)" 1 } } */
+
+__m256h
+__attribute__ ((noinline, noclone))
+load256_ph (void const *p)
+{
+  return _mm256_load_ph (p);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)" 1 } } */
+
+__m128h
+__attribute__ ((noinline, noclone))
+load_ph (void const *p)
+{
+  return _mm_load_ph (p);
+}
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)" 1 } } */
+
+__m512h
+__attribute__ ((noinline, noclone))
+load512u_ph (void const *p)
+{
+  return _mm512_loadu_ph (p);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^,\]*,\[^\{\n\]*%zmm\[0-9\]" 1 } } */
+
+__m256h
+__attribute__ ((noinline, noclone))
+load256u_ph (void const *p)
+{
+  return _mm256_loadu_ph (p);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^,\]*,\[^\{\n\]*%ymm\[0-9\]" 1 } } */
+
+__m128h
+__attribute__ ((noinline, noclone))
+load128u_ph (void const *p)
+{
+  return _mm_loadu_ph (p);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^,\]*,\[^\{\n\]*%xmm\[0-9\]" 1 } } */
+
+void
+__attribute__ ((noinline, noclone))
+store512u_ph (void *p, __m512h a)
+{
+  return _mm512_storeu_ph (p, a);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^\{\n\]*%zmm\[0-9\], *\[^,\]*" 1 } } */
+
+void
+__attribute__ ((noinline, noclone))
+store256u_ph (void *p, __m256h a)
+{
+  return _mm256_storeu_ph (p, a);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^\{\n\]*%ymm\[0-9\], *\[^,\]*" 1 } } */
+
+void
+__attribute__ ((noinline, noclone))
+storeu_ph (void *p, __m128h a)
+{
+  return _mm_storeu_ph (p, a);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]*\[^\{\n\]*%xmm\[0-9\], *\[^,\]*" 1 } } */
+
+__m512h
+__attribute__ ((noinline, noclone))
+abs512_ph (__m512h a)
+{
+  return _mm512_abs_ph (a);
+}
+
+/* { dg-final { scan-assembler-times "vpandd\[ \\t\]+\[^\n\]*\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 { target {! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpbroadcastd\[^\n\]*%zmm\[0-9\]+" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "vpandd\[^\n\]*%zmm\[0-9\]+" 1 { target ia32 } } } */
+
+__m256h
+__attribute__ ((noinline, noclone))
+abs256_ph (__m256h a)
+{
+  return _mm256_abs_ph (a);
+}
+
+/* { dg-final { scan-assembler-times "vpandq\[ \\t\]+\[^\n\]*\\\{1to\[1-4\]+\\\}, %ymm\[0-9\]+, %ymm0" 1 { target {! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[^\n\]*%ymm\[0-9\]+" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%ymm\[0-9\]+" 1 { target ia32 } } } */
+
+__m128h
+__attribute__ ((noinline, noclone))
+abs_ph (__m128h a)
+{
+  return _mm_abs_ph (a);
+}
+
+/* { dg-final { scan-assembler-times "vpandq\[ \\t\]+\[^\n\]*\\\{1to\[1-2\]+\\\}, %xmm\[0-9\]+, %xmm0" 1 { target {! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[^\n\]*%xmm\[0-9\]+" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%xmm\[0-9\]+" 1 { target ia32 } } } */