Message ID | 20210701061648.9447-8-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Support all AVX512FP16 intrinsics. | expand |
On Thu, Jul 1, 2021 at 2:17 PM liuhongt <hongtao.liu@intel.com> wrote: > > gcc/ChangeLog: > > * config.gcc: Add avx512fp16vlintrin.h. > * config/i386/avx512fp16intrin.h: (_mm512_add_ph): New intrinsic. > (_mm512_mask_add_ph): Likewise. > (_mm512_maskz_add_ph): Likewise. > (_mm512_sub_ph): Likewise. > (_mm512_mask_sub_ph): Likewise. > (_mm512_maskz_sub_ph): Likewise. > (_mm512_mul_ph): Likewise. > (_mm512_mask_mul_ph): Likewise. > (_mm512_maskz_mul_ph): Likewise. > (_mm512_div_ph): Likewise. > (_mm512_mask_div_ph): Likewise. > (_mm512_maskz_div_ph): Likewise. > (_mm512_add_round_ph): Likewise. > (_mm512_mask_add_round_ph): Likewise. > (_mm512_maskz_add_round_ph): Likewise. > (_mm512_sub_round_ph): Likewise. > (_mm512_mask_sub_round_ph): Likewise. > (_mm512_maskz_sub_round_ph): Likewise. > (_mm512_mul_round_ph): Likewise. > (_mm512_mask_mul_round_ph): Likewise. > (_mm512_maskz_mul_round_ph): Likewise. > (_mm512_div_round_ph): Likewise. > (_mm512_mask_div_round_ph): Likewise. > (_mm512_maskz_div_round_ph): Likewise. > * config/i386/avx512fp16vlintrin.h: New header. > * config/i386/i386-builtin-types.def (V16HF, V8HF, V32HF): > Add new builtin types. > * config/i386/i386-builtin.def: Add corresponding builtins. > * config/i386/i386-expand.c > (ix86_expand_args_builtin): Handle new builtin types. > (ix86_expand_round_builtin): Likewise. > * config/i386/immintrin.h: Include avx512fp16vlintrin.h > * config/i386/sse.md (VFH): New mode_iterator. > (VF2H): Likewise. > (avx512fmaskmode): Add HF vector modes. > (avx512fmaskhalfmode): Likewise. > (<plusminus_insn><mode>3<mask_name><round_name>): Adjust to for > HF vector modes. > (*<plusminus_insn><mode>3<mask_name><round_name>): Likewise. > (mul<mode>3<mask_name><round_name>): Likewise. > (*mul<mode>3<mask_name><round_name>): Likewise. > (div<mode>3): Likewise. > (<sse>_div<mode>3<mask_name><round_name>): Likewise. > * config/i386/subst.md (SUBST_V): Add HF vector modes. > (SUBST_A): Likewise. > (round_mode512bit_condition): Adjust for V32HFmode. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx-1.c: Add -mavx512vl and test for new intrinsics. > * gcc.target/i386/avx-2.c: Add -mavx512vl. > * gcc.target/i386/avx512fp16-11a.c: New test. > * gcc.target/i386/avx512fp16-11b.c: Ditto. > * gcc.target/i386/avx512vlfp16-11a.c: Ditto. > * gcc.target/i386/avx512vlfp16-11b.c: Ditto. > * gcc.target/i386/sse-13.c: Add test for new builtins. > * gcc.target/i386/sse-23.c: Ditto. > * gcc.target/i386/sse-14.c: Add test for new intrinsics. > * gcc.target/i386/sse-22.c: Ditto. I'm going to check in 2 patches: this patch and [1] which contains testcase for this patch. Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. Newly added runtime tests passed under sde. 
[1]https://gcc.gnu.org/pipermail/gcc-patches/2021-July/574125.html > --- > gcc/config.gcc | 2 +- > gcc/config/i386/avx512fp16intrin.h | 251 ++++++++++++++++++ > gcc/config/i386/avx512fp16vlintrin.h | 219 +++++++++++++++ > gcc/config/i386/i386-builtin-types.def | 7 + > gcc/config/i386/i386-builtin.def | 20 ++ > gcc/config/i386/i386-expand.c | 5 + > gcc/config/i386/immintrin.h | 2 + > gcc/config/i386/sse.md | 62 +++-- > gcc/config/i386/subst.md | 6 +- > gcc/testsuite/gcc.target/i386/avx-1.c | 8 +- > gcc/testsuite/gcc.target/i386/avx-2.c | 2 +- > .../gcc.target/i386/avx512fp16-11a.c | 36 +++ > .../gcc.target/i386/avx512fp16-11b.c | 75 ++++++ > .../gcc.target/i386/avx512vlfp16-11a.c | 68 +++++ > .../gcc.target/i386/avx512vlfp16-11b.c | 96 +++++++ > gcc/testsuite/gcc.target/i386/sse-13.c | 6 + > gcc/testsuite/gcc.target/i386/sse-14.c | 14 + > gcc/testsuite/gcc.target/i386/sse-22.c | 14 + > gcc/testsuite/gcc.target/i386/sse-23.c | 6 + > 19 files changed, 872 insertions(+), 27 deletions(-) > create mode 100644 gcc/config/i386/avx512fp16vlintrin.h > create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-11a.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-11b.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c > > diff --git a/gcc/config.gcc b/gcc/config.gcc > index 5b4f894185a..d64a8b9407e 100644 > --- a/gcc/config.gcc > +++ b/gcc/config.gcc > @@ -416,7 +416,7 @@ i[34567]86-*-* | x86_64-*-*) > tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h > amxbf16intrin.h x86gprintrin.h uintrintrin.h > hresetintrin.h keylockerintrin.h avxvnniintrin.h > - mwaitintrin.h avx512fp16intrin.h" > + mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h" > ;; > ia64-*-*) > extra_headers=ia64intrin.h > diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h > index 3fc0770986e..3e9d676dc39 100644 > --- a/gcc/config/i386/avx512fp16intrin.h > +++ b/gcc/config/i386/avx512fp16intrin.h > @@ -217,6 +217,257 @@ _mm_store_sh (void *__P, __m128h __A) > *(_Float16 *) __P = ((__v8hf)__A)[0]; > } > > +/* Intrinsics v[add,sub,mul,div]ph. 
*/ > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_add_ph (__m512h __A, __m512h __B) > +{ > + return (__m512h) ((__v32hf) __A + (__v32hf) __B); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) > +{ > + return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C) > +{ > + return __builtin_ia32_vaddph_v32hf_mask (__B, __C, > + _mm512_setzero_ph (), __A); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_sub_ph (__m512h __A, __m512h __B) > +{ > + return (__m512h) ((__v32hf) __A - (__v32hf) __B); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) > +{ > + return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C) > +{ > + return __builtin_ia32_vsubph_v32hf_mask (__B, __C, > + _mm512_setzero_ph (), __A); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mul_ph (__m512h __A, __m512h __B) > +{ > + return (__m512h) ((__v32hf) __A * (__v32hf) __B); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) > +{ > + return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C) > +{ > + return __builtin_ia32_vmulph_v32hf_mask (__B, __C, > + _mm512_setzero_ph (), __A); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_div_ph (__m512h __A, __m512h __B) > +{ > + return (__m512h) ((__v32hf) __A / (__v32hf) __B); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) > +{ > + return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C) > +{ > + return __builtin_ia32_vdivph_v32hf_mask (__B, __C, > + _mm512_setzero_ph (), __A); > +} > + > +#ifdef __OPTIMIZE__ > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C) > +{ > + return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B, > + _mm512_setzero_ph (), > + (__mmask32) -1, __C); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C, > + __m512h __D, const int __E) > +{ > + return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E); > +} > + > +extern __inline __m512h > 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C, > + const int __D) > +{ > + return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C, > + _mm512_setzero_ph (), > + __A, __D); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C) > +{ > + return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B, > + _mm512_setzero_ph (), > + (__mmask32) -1, __C); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C, > + __m512h __D, const int __E) > +{ > + return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C, > + const int __D) > +{ > + return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C, > + _mm512_setzero_ph (), > + __A, __D); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C) > +{ > + return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B, > + _mm512_setzero_ph (), > + (__mmask32) -1, __C); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C, > + __m512h __D, const int __E) > +{ > + return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C, > + const int __D) > +{ > + return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C, > + _mm512_setzero_ph (), > + __A, __D); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C) > +{ > + return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B, > + _mm512_setzero_ph (), > + (__mmask32) -1, __C); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C, > + __m512h __D, const int __E) > +{ > + return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, > + const int __D) > +{ > + return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C, > + _mm512_setzero_ph (), > + __A, __D); > +} > +#else > +#define _mm512_add_round_ph(A, B, C) \ > + ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B), \ > + _mm512_setzero_ph (),\ > + (__mmask32)-1, (C))) > + > +#define _mm512_mask_add_round_ph(A, B, C, D, E) \ > + ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E))) > + > +#define _mm512_maskz_add_round_ph(A, B, C, D) \ > + ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C), \ > + _mm512_setzero_ph (),\ > + (A), (D))) > + > +#define _mm512_sub_round_ph(A, B, C) \ > + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B), \ > + _mm512_setzero_ph (),\ > + (__mmask32)-1, (C))) 
> + > +#define _mm512_mask_sub_round_ph(A, B, C, D, E) \ > + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E))) > + > +#define _mm512_maskz_sub_round_ph(A, B, C, D) \ > + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C), \ > + _mm512_setzero_ph (),\ > + (A), (D))) > + > +#define _mm512_mul_round_ph(A, B, C) \ > + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B), \ > + _mm512_setzero_ph (),\ > + (__mmask32)-1, (C))) > + > +#define _mm512_mask_mul_round_ph(A, B, C, D, E) \ > + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E))) > + > +#define _mm512_maskz_mul_round_ph(A, B, C, D) \ > + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C), \ > + _mm512_setzero_ph (),\ > + (A), (D))) > + > +#define _mm512_div_round_ph(A, B, C) \ > + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B), \ > + _mm512_setzero_ph (),\ > + (__mmask32)-1, (C))) > + > +#define _mm512_mask_div_round_ph(A, B, C, D, E) \ > + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E))) > + > +#define _mm512_maskz_div_round_ph(A, B, C, D) \ > + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C), \ > + _mm512_setzero_ph (),\ > + (A), (D))) > +#endif /* __OPTIMIZE__ */ > + > #ifdef __DISABLE_AVX512FP16__ > #undef __DISABLE_AVX512FP16__ > #pragma GCC pop_options > diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h > new file mode 100644 > index 00000000000..75fa9eb29e7 > --- /dev/null > +++ b/gcc/config/i386/avx512fp16vlintrin.h > @@ -0,0 +1,219 @@ > +/* Copyright (C) 2019 Free Software Foundation, Inc. > + > + This file is part of GCC. > + > + GCC is free software; you can redistribute it and/or modify > + it under the terms of the GNU General Public License as published by > + the Free Software Foundation; either version 3, or (at your option) > + any later version. > + > + GCC is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + GNU General Public License for more details. > + > + Under Section 7 of GPL version 3, you are granted additional > + permissions described in the GCC Runtime Library Exception, version > + 3.1, as published by the Free Software Foundation. > + > + You should have received a copy of the GNU General Public License and > + a copy of the GCC Runtime Library Exception along with this program; > + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see > + <http://www.gnu.org/licenses/>. */ > + > +#ifndef _IMMINTRIN_H_INCLUDED > +#error "Never use <avx512fp16vlintrin.h> directly; include <immintrin.h> instead." > +#endif > + > +#ifndef __AVX512FP16VLINTRIN_H_INCLUDED > +#define __AVX512FP16VLINTRIN_H_INCLUDED > + > +#if !defined(__AVX512VL__) || !defined(__AVX512FP16__) > +#pragma GCC push_options > +#pragma GCC target("avx512fp16,avx512vl") > +#define __DISABLE_AVX512FP16VL__ > +#endif /* __AVX512FP16VL__ */ > + > +/* Intrinsics v[add,sub,mul,div]ph. 
*/ > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_add_ph (__m128h __A, __m128h __B) > +{ > + return (__m128h) ((__v8hf) __A + (__v8hf) __B); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_add_ph (__m256h __A, __m256h __B) > +{ > + return (__m256h) ((__v16hf) __A + (__v16hf) __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) > +{ > + return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) > +{ > + return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C) > +{ > + return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (), > + __A); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C) > +{ > + return __builtin_ia32_vaddph_v16hf_mask (__B, __C, > + _mm256_setzero_ph (), __A); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_sub_ph (__m128h __A, __m128h __B) > +{ > + return (__m128h) ((__v8hf) __A - (__v8hf) __B); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_sub_ph (__m256h __A, __m256h __B) > +{ > + return (__m256h) ((__v16hf) __A - (__v16hf) __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) > +{ > + return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) > +{ > + return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C) > +{ > + return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (), > + __A); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C) > +{ > + return __builtin_ia32_vsubph_v16hf_mask (__B, __C, > + _mm256_setzero_ph (), __A); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mul_ph (__m128h __A, __m128h __B) > +{ > + return (__m128h) ((__v8hf) __A * (__v8hf) __B); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mul_ph (__m256h __A, __m256h __B) > +{ > + return (__m256h) ((__v16hf) __A * (__v16hf) __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) > +{ > + return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B); > +} > + > 
+extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) > +{ > + return __builtin_ia32_vmulph_v16hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C) > +{ > + return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (), > + __A); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C) > +{ > + return __builtin_ia32_vmulph_v16hf_mask (__B, __C, > + _mm256_setzero_ph (), __A); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_div_ph (__m128h __A, __m128h __B) > +{ > + return (__m128h) ((__v8hf) __A / (__v8hf) __B); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_div_ph (__m256h __A, __m256h __B) > +{ > + return (__m256h) ((__v16hf) __A / (__v16hf) __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) > +{ > + return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) > +{ > + return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C) > +{ > + return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (), > + __A); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C) > +{ > + return __builtin_ia32_vdivph_v16hf_mask (__B, __C, > + _mm256_setzero_ph (), __A); > +} > + > +#ifdef __DISABLE_AVX512FP16VL__ > +#undef __DISABLE_AVX512FP16VL__ > +#pragma GCC pop_options > +#endif /* __DISABLE_AVX512FP16VL__ */ > + > +#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */ > diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def > index eb5153002ae..ee3b8c30589 100644 > --- a/gcc/config/i386/i386-builtin-types.def > +++ b/gcc/config/i386/i386-builtin-types.def > @@ -98,6 +98,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI) > # AVX vectors > DEF_VECTOR_TYPE (V4DF, DOUBLE) > DEF_VECTOR_TYPE (V8SF, FLOAT) > +DEF_VECTOR_TYPE (V16HF, FLOAT16) > DEF_VECTOR_TYPE (V4DI, DI) > DEF_VECTOR_TYPE (V8SI, SI) > DEF_VECTOR_TYPE (V16HI, HI) > @@ -108,6 +109,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI) > > # AVX512F vectors > DEF_VECTOR_TYPE (V32SF, FLOAT) > +DEF_VECTOR_TYPE (V32HF, FLOAT16) > DEF_VECTOR_TYPE (V16SF, FLOAT) > DEF_VECTOR_TYPE (V8DF, DOUBLE) > DEF_VECTOR_TYPE (V8DI, DI) > @@ -1302,3 +1304,8 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) > > # FP16 builtins > DEF_FUNCTION_TYPE (V8HF, V8HI) > +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI) > +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) > +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) > +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) > +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) > diff --git 
a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def > index 1cc0cc6968c..b783d266dd8 100644 > --- a/gcc/config/i386/i386-builtin.def > +++ b/gcc/config/i386/i386-builtin.def > @@ -2774,6 +2774,20 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builti > BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) > BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) > > +/* AVX512FP16. */ > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) > + > /* Builtins with rounding support. 
*/ > BDESC_END (ARGS, ROUND_ARGS) > > @@ -2973,6 +2987,12 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fixuns_truncv8dfv8di2_mask_round, " > BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "__builtin_ia32_rangeps512_mask", IX86_BUILTIN_RANGEPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT) > BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT) > > +/* AVX512FP16. */ > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) > + > BDESC_END (ROUND_ARGS, MULTI_ARG) > > /* FMA4 and XOP. */ > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > index 5ce7163b241..39647eb2cf1 100644 > --- a/gcc/config/i386/i386-expand.c > +++ b/gcc/config/i386/i386-expand.c > @@ -9760,6 +9760,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, > case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: > case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: > case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: > + case V32HF_FTYPE_V32HF_V32HF_V32HF_USI: > case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: > case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: > case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: > @@ -9777,6 +9778,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, > case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: > case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: > case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: > + case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI: > case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: > case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: > case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: > @@ -9784,6 +9786,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, > case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: > case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: > case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: > + case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI: > case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: > case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: > case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: > @@ -10460,6 +10463,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, > case INT_FTYPE_V4SF_INT: > nargs = 2; > break; > + case V32HF_FTYPE_V32HF_V32HF_INT: > case V4SF_FTYPE_V4SF_UINT_INT: > case V4SF_FTYPE_V4SF_UINT64_INT: > case V2DF_FTYPE_V2DF_UINT64_INT: > @@ -10500,6 +10504,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, > case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: > case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: > case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: > + case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT: > case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: > case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: > case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT: > diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h > index 5344e22c9c8..e08efb9dff3 100644 > --- a/gcc/config/i386/immintrin.h > 
+++ b/gcc/config/i386/immintrin.h > @@ -96,6 +96,8 @@ > > #include <avx512fp16intrin.h> > > +#include <avx512fp16vlintrin.h> > + > #include <shaintrin.h> > > #include <fmaintrin.h> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 1009d656cbb..2c1b6fbcd86 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -295,6 +295,13 @@ (define_mode_iterator VF > [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) > > +(define_mode_iterator VFH > + [(V32HF "TARGET_AVX512FP16") > + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") > + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") > + (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) > + > ;; 128- and 256-bit float vector modes > (define_mode_iterator VF_128_256 > [(V8SF "TARGET_AVX") V4SF > @@ -318,6 +325,13 @@ (define_mode_iterator VF1_128_256VL > (define_mode_iterator VF2 > [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) > > +;; All DFmode & HFmode vector float modes > +(define_mode_iterator VF2H > + [(V32HF "TARGET_AVX512FP16") > + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") > + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") > + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) > + > ;; 128- and 256-bit DF vector modes > (define_mode_iterator VF2_128_256 > [(V4DF "TARGET_AVX") V2DF]) > @@ -824,6 +838,7 @@ (define_mode_attr avx512fmaskmode > (V32HI "SI") (V16HI "HI") (V8HI "QI") (V4HI "QI") > (V16SI "HI") (V8SI "QI") (V4SI "QI") > (V8DI "QI") (V4DI "QI") (V2DI "QI") > + (V32HF "SI") (V16HF "HI") (V8HF "QI") > (V16SF "HI") (V8SF "QI") (V4SF "QI") > (V8DF "QI") (V4DF "QI") (V2DF "QI")]) > > @@ -842,6 +857,7 @@ (define_mode_attr avx512fmaskhalfmode > (V32HI "HI") (V16HI "QI") (V8HI "QI") (V4HI "QI") > (V16SI "QI") (V8SI "QI") (V4SI "QI") > (V8DI "QI") (V4DI "QI") (V2DI "QI") > + (V32HF "HI") (V16HF "QI") (V8HF "QI") > (V16SF "QI") (V8SF "QI") (V4SF "QI") > (V8DF "QI") (V4DF "QI") (V2DF "QI")]) > > @@ -1940,18 +1956,18 @@ (define_insn_and_split "*nabs<mode>2" > [(set_attr "isa" "noavx,noavx,avx,avx")]) > > (define_expand "<insn><mode>3<mask_name><round_name>" > - [(set (match_operand:VF 0 "register_operand") > - (plusminus:VF > - (match_operand:VF 1 "<round_nimm_predicate>") > - (match_operand:VF 2 "<round_nimm_predicate>")))] > + [(set (match_operand:VFH 0 "register_operand") > + (plusminus:VFH > + (match_operand:VFH 1 "<round_nimm_predicate>") > + (match_operand:VFH 2 "<round_nimm_predicate>")))] > "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" > "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") > > (define_insn "*<insn><mode>3<mask_name><round_name>" > - [(set (match_operand:VF 0 "register_operand" "=x,v") > - (plusminus:VF > - (match_operand:VF 1 "<bcst_round_nimm_predicate>" "<comm>0,v") > - (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] > + [(set (match_operand:VFH 0 "register_operand" "=x,v") > + (plusminus:VFH > + (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "<comm>0,v") > + (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] > "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) > && <mask_mode512bit_condition> && <round_mode512bit_condition>" > "@ > @@ -2002,18 +2018,18 @@ (define_insn "<sse>_vm<insn><mode>3<mask_scalar_name><round_scalar_name>" > (set_attr "mode" "<ssescalarmode>")]) > > (define_expand 
"mul<mode>3<mask_name><round_name>" > - [(set (match_operand:VF 0 "register_operand") > - (mult:VF > - (match_operand:VF 1 "<round_nimm_predicate>") > - (match_operand:VF 2 "<round_nimm_predicate>")))] > + [(set (match_operand:VFH 0 "register_operand") > + (mult:VFH > + (match_operand:VFH 1 "<round_nimm_predicate>") > + (match_operand:VFH 2 "<round_nimm_predicate>")))] > "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" > "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);") > > (define_insn "*mul<mode>3<mask_name><round_name>" > - [(set (match_operand:VF 0 "register_operand" "=x,v") > - (mult:VF > - (match_operand:VF 1 "<bcst_round_nimm_predicate>" "%0,v") > - (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] > + [(set (match_operand:VFH 0 "register_operand" "=x,v") > + (mult:VFH > + (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "%0,v") > + (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] > "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands) > && <mask_mode512bit_condition> && <round_mode512bit_condition>" > "@ > @@ -2067,9 +2083,9 @@ (define_insn "<sse>_vm<multdiv_mnemonic><mode>3<mask_scalar_name><round_scalar_n > (set_attr "mode" "<ssescalarmode>")]) > > (define_expand "div<mode>3" > - [(set (match_operand:VF2 0 "register_operand") > - (div:VF2 (match_operand:VF2 1 "register_operand") > - (match_operand:VF2 2 "vector_operand")))] > + [(set (match_operand:VF2H 0 "register_operand") > + (div:VF2H (match_operand:VF2H 1 "register_operand") > + (match_operand:VF2H 2 "vector_operand")))] > "TARGET_SSE2") > > (define_expand "div<mode>3" > @@ -2090,10 +2106,10 @@ (define_expand "div<mode>3" > }) > > (define_insn "<sse>_div<mode>3<mask_name><round_name>" > - [(set (match_operand:VF 0 "register_operand" "=x,v") > - (div:VF > - (match_operand:VF 1 "register_operand" "0,v") > - (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] > + [(set (match_operand:VFH 0 "register_operand" "=x,v") > + (div:VFH > + (match_operand:VFH 1 "register_operand" "0,v") > + (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] > "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" > "@ > div<ssemodesuffix>\t{%2, %0|%0, %2} > diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md > index 477a89803fa..762383bfd11 100644 > --- a/gcc/config/i386/subst.md > +++ b/gcc/config/i386/subst.md > @@ -24,6 +24,7 @@ (define_mode_iterator SUBST_V > V32HI V16HI V8HI > V16SI V8SI V4SI > V8DI V4DI V2DI > + V32HF V16HF V8HF > V16SF V8SF V4SF > V8DF V4DF V2DF]) > > @@ -35,6 +36,7 @@ (define_mode_iterator SUBST_A > V32HI V16HI V8HI > V16SI V8SI V4SI > V8DI V4DI V2DI > + V32HF V16HF V8HF > V16SF V8SF V4SF > V8DF V4DF V2DF > QI HI SI DI SF DF]) > @@ -142,7 +144,9 @@ (define_subst_attr "round_prefix" "round" "vex" "evex") > (define_subst_attr "round_mode512bit_condition" "round" "1" "(<MODE>mode == V16SFmode > || <MODE>mode == V8DFmode > || <MODE>mode == V8DImode > - || <MODE>mode == V16SImode)") > + || <MODE>mode == V16SImode > + || <MODE>mode == V32HFmode)") > + > (define_subst_attr "round_modev8sf_condition" "round" "1" "(<MODE>mode == V8SFmode)") > (define_subst_attr "round_modev4sf_condition" "round" "1" "(<MODE>mode == V4SFmode)") > (define_subst_attr "round_codefor" "round" "*" "") > diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c > index f3676077743..1eaee861141 100644 
> --- a/gcc/testsuite/gcc.target/i386/avx-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx-1.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16" } */ > +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16 -mavx512vl" } */ > /* { dg-add-options bind_pic_locally } */ > > #include <mm_malloc.h> > @@ -685,6 +685,12 @@ > #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1) > #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E) __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E) > > +/* avx512fp16intrin.h */ > +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8) > + > /* vpclmulqdqintrin.h */ > #define __builtin_ia32_vpclmulqdq_v4di(A, B, C) __builtin_ia32_vpclmulqdq_v4di(A, B, 1) > #define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1) > diff --git a/gcc/testsuite/gcc.target/i386/avx-2.c b/gcc/testsuite/gcc.target/i386/avx-2.c > index 1751c52565c..642ae4d7bfb 100644 > --- a/gcc/testsuite/gcc.target/i386/avx-2.c > +++ b/gcc/testsuite/gcc.target/i386/avx-2.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16" } */ > +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16 -mavx512vl" } */ > /* { dg-add-options bind_pic_locally } */ > > #include <mm_malloc.h> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c > new file mode 100644 > index 00000000000..28492fa3f7b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c > @@ -0,0 +1,36 @@ > +/* { dg-do compile} */ > +/* { dg-options "-O2 -mavx512fp16" } */ > + > +#include <immintrin.h> > +__m512h > +__attribute__ ((noinline, noclone)) > +vadd512 (__m512h a, __m512h b) > +{ > + return a + b; > +} > + > +__m512h > +__attribute__ ((noinline, noclone)) > +vsub512 (__m512h a, __m512h b) > +{ > + return a - b; > +} > + > +__m512h > +__attribute__ ((noinline, noclone)) > +vmul512 (__m512h a, __m512h b) > +{ > + return a * b; > +} > + > +__m512h > +__attribute__ ((noinline, noclone)) > +vdiv512 (__m512h a, __m512h b) > +{ > + return a / b; > +} > + > +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */ > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c > new file mode 100644 > index 00000000000..fc105152d2f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c > @@ -0,0 +1,75 @@ > +/* { dg-do run { target avx512fp16 } } */ > +/* { 
dg-options "-O2 -mavx512fp16" } */ > + > +#include <string.h> > +#include <stdlib.h> > +static void do_test (void); > + > +#define DO_TEST do_test > +#define AVX512FP16 > +#include "avx512-check.h" > +#include "avx512fp16-11a.c" > + > +/* Get random float16 between -50.x to 50.x. */ > +_Float16 > +get_float16_noround() > +{ > + return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50) > + + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0)); > +} > + > +static void > +do_test (void) > +{ > + _Float16 x[32]; > + _Float16 y[32]; > + _Float16 res_add[32]; > + _Float16 res_sub[32]; > + _Float16 res_mul[32]; > + _Float16 res_div[32]; > + for (int i = 0 ; i != 32; i++) > + { > + x[i] = get_float16_noround (); > + y[i] = get_float16_noround (); > + if (y[i] == 0) > + y[i] = 1.0f; > + res_add[i] = x[i] + y[i]; > + res_sub[i] = x[i] - y[i]; > + res_mul[i] = x[i] * y[i]; > + res_div[i] = x[i] / y[i]; > + > + } > + > + union512h u512 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], > + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15], > + x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23], > + x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31] }; > + union512h u512_1 = {y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7], > + y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15], > + y[16], y[17], y[18], y[19], y[20], y[21], y[22], y[23], > + y[24], y[25], y[26], y[27], y[28], y[29], y[30], y[31] }; > + > + __m512h v512; > + union512h a512; > + > + memset (&v512, -1, sizeof (v512)); > + v512 = vadd512 (u512.x, u512_1.x); > + a512.x = v512; > + if (check_union512h (a512, res_add)) > + abort (); > + memset (&v512, -1, sizeof (v512)); > + v512 = vsub512 (u512.x, u512_1.x); > + a512.x = v512; > + if (check_union512h (a512, res_sub)) > + abort (); > + memset (&v512, -1, sizeof (v512)); > + v512 = vmul512 (u512.x, u512_1.x); > + a512.x = v512; > + if (check_union512h (a512, res_mul)) > + abort (); > + memset (&v512, -1, sizeof (v512)); > + v512 = vdiv512 (u512.x, u512_1.x); > + a512.x = v512; > + if (check_union512h (a512, res_div)) > + abort (); > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c > new file mode 100644 > index 00000000000..a8c6296f504 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c > @@ -0,0 +1,68 @@ > +/* { dg-do compile} */ > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ > + > +#include <immintrin.h> > +__m128h > +__attribute__ ((noinline, noclone)) > +vadd128 (__m128h a, __m128h b) > +{ > + return a + b; > +} > + > +__m256h > +__attribute__ ((noinline, noclone)) > +vadd256 (__m256h a, __m256h b) > +{ > + return a + b; > +} > + > +__m128h > +__attribute__ ((noinline, noclone)) > +vsub128 (__m128h a, __m128h b) > +{ > + return a - b; > +} > + > +__m256h > +__attribute__ ((noinline, noclone)) > +vsub256 (__m256h a, __m256h b) > +{ > + return a - b; > +} > + > +__m128h > +__attribute__ ((noinline, noclone)) > +vmul128 (__m128h a, __m128h b) > +{ > + return a * b; > +} > + > +__m256h > +__attribute__ ((noinline, noclone)) > +vmul256 (__m256h a, __m256h b) > +{ > + return a * b; > +} > + > +__m128h > +__attribute__ ((noinline, noclone)) > +vdiv128 (__m128h a, __m128h b) > +{ > + return a / b; > +} > + > +__m256h > +__attribute__ ((noinline, noclone)) > +vdiv256 (__m256h a, __m256h b) > +{ > + return a / b; > +} > + > +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */ > +/* 
{ dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */ > diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c > new file mode 100644 > index 00000000000..b8d3e8a4e96 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c > @@ -0,0 +1,96 @@ > +/* { dg-do run { target avx512fp16 } } */ > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ > + > +#include <string.h> > +#include <stdlib.h> > +static void do_test (void); > + > +#define DO_TEST do_test > +#define AVX512FP16 > +#include "avx512-check.h" > +#include "avx512vlfp16-11a.c" > + > +/* Get random float16 between -50.x to 50.x. */ > +_Float16 > +get_float16_noround() > +{ > + return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50) > + + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0)); > +} > + > +static void > +do_test (void) > +{ > + _Float16 x[16]; > + _Float16 y[16]; > + _Float16 res_add[16]; > + _Float16 res_sub[16]; > + _Float16 res_mul[16]; > + _Float16 res_div[16]; > + for (int i = 0 ; i != 16; i++) > + { > + x[i] = get_float16_noround (); > + y[i] = get_float16_noround (); > + if (y[i] == 0) > + y[i] = 1.0f; > + res_add[i] = x[i] + y[i]; > + res_sub[i] = x[i] - y[i]; > + res_mul[i] = x[i] * y[i]; > + res_div[i] = x[i] / y[i]; > + > + } > + > + union128h u128 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7] }; > + union128h u128_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7] }; > + union256h u256 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], > + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] }; > + union256h u256_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7], > + y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15]}; > + > + __m128h v128; > + __m256h v256; > + union128h a128; > + union256h a256; > + > + memset (&v128, -1, sizeof (v128)); > + v128 = vadd128 (u128.x, u128_1.x); > + a128.x = v128; > + if (check_union128h (a128, res_add)) > + abort (); > + memset (&v128, -1, sizeof (v128)); > + v128 = vsub128 (u128.x, u128_1.x); > + a128.x = v128; > + if (check_union128h (a128, res_sub)) > + abort (); > + memset (&v128, -1, sizeof (v128)); > + v128 = vmul128 (u128.x, u128_1.x); > + a128.x = v128; > + if (check_union128h (a128, res_mul)) > + abort (); > + memset (&v128, -1, sizeof (v128)); > + v128 = vdiv128 (u128.x, u128_1.x); > + a128.x = v128; > + if (check_union128h (a128, res_div)) > + abort (); > + > + memset (&v256, -1, sizeof (v256)); > + v256 = vadd256 (u256.x, u256_1.x); > + a256.x = v256; > + if (check_union256h (a256, res_add)) > + abort (); > + memset (&v256, -1, sizeof (v256)); > + v256 = vsub256 (u256.x, u256_1.x); > + a256.x = v256; > + if (check_union256h (a256, res_sub)) > + abort (); > + memset (&v256, -1, sizeof (v256)); > + v256 = vmul256 (u256.x, u256_1.x); > + a256.x = v256; > + if (check_union256h (a256, res_mul)) > + abort (); > + memset (&v256, -1, sizeof (v256)); > + v256 = vdiv256 (u256.x, u256_1.x); > + a256.x = v256; > + if (check_union256h (a256, res_div)) > + abort (); > +} > diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c 
b/gcc/testsuite/gcc.target/i386/sse-13.c > index f5f5c113612..50ed74cd6d6 100644 > --- a/gcc/testsuite/gcc.target/i386/sse-13.c > +++ b/gcc/testsuite/gcc.target/i386/sse-13.c > @@ -702,6 +702,12 @@ > #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1) > #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E) __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E) > > +/* avx512fp16intrin.h */ > +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8) > + > /* vpclmulqdqintrin.h */ > #define __builtin_ia32_vpclmulqdq_v4di(A, B, C) __builtin_ia32_vpclmulqdq_v4di(A, B, 1) > #define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1) > diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c > index 747d504cedb..26a5e94c7ca 100644 > --- a/gcc/testsuite/gcc.target/i386/sse-14.c > +++ b/gcc/testsuite/gcc.target/i386/sse-14.c > @@ -667,6 +667,20 @@ test_3 (_mm512_mask_rcp28_round_ps, __m512, __m512, __mmask16, __m512, 8) > test_3 (_mm512_mask_rsqrt28_round_pd, __m512d, __m512d, __mmask8, __m512d, 8) > test_3 (_mm512_mask_rsqrt28_round_ps, __m512, __m512, __mmask16, __m512, 8) > > +/* avx512fp16intrin.h */ > +test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8) > +test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8) > +test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8) > +test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8) > +test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) > +test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) > +test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) > +test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) > +test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) > +test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) > +test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) > +test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) > + > /* shaintrin.h */ > test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1) > > diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c > index 33411969901..8d25effd724 100644 > --- a/gcc/testsuite/gcc.target/i386/sse-22.c > +++ b/gcc/testsuite/gcc.target/i386/sse-22.c > @@ -772,6 +772,20 @@ test_2 (_mm_rcp28_round_ss, __m128, __m128, __m128, 8) > test_2 (_mm_rsqrt28_round_sd, __m128d, __m128d, __m128d, 8) > test_2 (_mm_rsqrt28_round_ss, __m128, __m128, __m128, 8) > > +/* avx512fp16intrin.h */ > +test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8) > +test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8) > +test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8) > +test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8) > +test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) > +test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) > +test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 
8) > +test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) > +test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) > +test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) > +test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) > +test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) > + > /* shaintrin.h */ > test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1) > > diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c > index 86590ca5ffb..f7dd5d7495c 100644 > --- a/gcc/testsuite/gcc.target/i386/sse-23.c > +++ b/gcc/testsuite/gcc.target/i386/sse-23.c > @@ -703,6 +703,12 @@ > #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1) > #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E) __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E) > > +/* avx512fp16intrin.h */ > +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8) > +#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8) > + > /* vpclmulqdqintrin.h */ > #define __builtin_ia32_vpclmulqdq_v4di(A, B, C) __builtin_ia32_vpclmulqdq_v4di(A, B, 1) > #define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1) > -- > 2.18.1 > -- BR, Hongtao
diff --git a/gcc/config.gcc b/gcc/config.gcc index 5b4f894185a..d64a8b9407e 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -416,7 +416,7 @@ i[34567]86-*-* | x86_64-*-*) tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h amxbf16intrin.h x86gprintrin.h uintrintrin.h hresetintrin.h keylockerintrin.h avxvnniintrin.h - mwaitintrin.h avx512fp16intrin.h" + mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h" ;; ia64-*-*) extra_headers=ia64intrin.h diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 3fc0770986e..3e9d676dc39 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -217,6 +217,257 @@ _mm_store_sh (void *__P, __m128h __A) *(_Float16 *) __P = ((__v8hf)__A)[0]; } +/* Intrinsics v[add,sub,mul,div]ph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A + (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vaddph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A - (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vsubph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A * (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vmulph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A / (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vdivph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +#ifdef 
__OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} +#else +#define _mm512_add_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B), \ + _mm512_setzero_ph (),\ + (__mmask32)-1, (C))) + +#define _mm512_mask_add_round_ph(A, B, C, D, E) \ + 
((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_add_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C), \ + _mm512_setzero_ph (),\ + (A), (D))) + +#define _mm512_sub_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B), \ + _mm512_setzero_ph (),\ + (__mmask32)-1, (C))) + +#define _mm512_mask_sub_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_sub_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C), \ + _mm512_setzero_ph (),\ + (A), (D))) + +#define _mm512_mul_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B), \ + _mm512_setzero_ph (),\ + (__mmask32)-1, (C))) + +#define _mm512_mask_mul_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_mul_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C), \ + _mm512_setzero_ph (),\ + (A), (D))) + +#define _mm512_div_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B), \ + _mm512_setzero_ph (),\ + (__mmask32)-1, (C))) + +#define _mm512_mask_div_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_div_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C), \ + _mm512_setzero_ph (),\ + (A), (D))) +#endif /* __OPTIMIZE__ */ + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h new file mode 100644 index 00000000000..75fa9eb29e7 --- /dev/null +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -0,0 +1,219 @@ +/* Copyright (C) 2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512fp16vlintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512FP16VLINTRIN_H_INCLUDED +#define __AVX512FP16VLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512FP16__) +#pragma GCC push_options +#pragma GCC target("avx512fp16,avx512vl") +#define __DISABLE_AVX512FP16VL__ +#endif /* __AVX512FP16VL__ */ + +/* Intrinsics v[add,sub,mul,div]ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A + (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A + (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vaddph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A - (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A - (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vsubph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A * (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A * (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vmulph_v16hf_mask (__C, __D, 
__A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vmulph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A / (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A / (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vdivph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +#ifdef __DISABLE_AVX512FP16VL__ +#undef __DISABLE_AVX512FP16VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512FP16VL__ */ + +#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index eb5153002ae..ee3b8c30589 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -98,6 +98,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI) # AVX vectors DEF_VECTOR_TYPE (V4DF, DOUBLE) DEF_VECTOR_TYPE (V8SF, FLOAT) +DEF_VECTOR_TYPE (V16HF, FLOAT16) DEF_VECTOR_TYPE (V4DI, DI) DEF_VECTOR_TYPE (V8SI, SI) DEF_VECTOR_TYPE (V16HI, HI) @@ -108,6 +109,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI) # AVX512F vectors DEF_VECTOR_TYPE (V32SF, FLOAT) +DEF_VECTOR_TYPE (V32HF, FLOAT16) DEF_VECTOR_TYPE (V16SF, FLOAT) DEF_VECTOR_TYPE (V8DF, DOUBLE) DEF_VECTOR_TYPE (V8DI, DI) @@ -1302,3 +1304,8 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) # FP16 builtins DEF_FUNCTION_TYPE (V8HF, V8HI) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 1cc0cc6968c..b783d266dd8 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2774,6 +2774,20 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builti BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, 
UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) +/* AVX512FP16. */ +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) + /* Builtins with rounding support. */ BDESC_END (ARGS, ROUND_ARGS) @@ -2973,6 +2987,12 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fixuns_truncv8dfv8di2_mask_round, " BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "__builtin_ia32_rangeps512_mask", IX86_BUILTIN_RANGEPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT) BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT) +/* AVX512FP16. 
*/ +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) + BDESC_END (ROUND_ARGS, MULTI_ARG) /* FMA4 and XOP. */ diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 5ce7163b241..39647eb2cf1 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -9760,6 +9760,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: + case V32HF_FTYPE_V32HF_V32HF_V32HF_USI: case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: @@ -9777,6 +9778,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: + case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI: case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: @@ -9784,6 +9786,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: + case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI: case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: @@ -10460,6 +10463,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case INT_FTYPE_V4SF_INT: nargs = 2; break; + case V32HF_FTYPE_V32HF_V32HF_INT: case V4SF_FTYPE_V4SF_UINT_INT: case V4SF_FTYPE_V4SF_UINT64_INT: case V2DF_FTYPE_V2DF_UINT64_INT: @@ -10500,6 +10504,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: + case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT: diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index 5344e22c9c8..e08efb9dff3 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -96,6 +96,8 @@ #include <avx512fp16intrin.h> +#include <avx512fp16vlintrin.h> + #include <shaintrin.h> #include <fmaintrin.h> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1009d656cbb..2c1b6fbcd86 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -295,6 +295,13 @@ (define_mode_iterator VF [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) +(define_mode_iterator VFH + [(V32HF "TARGET_AVX512FP16") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V16SF 
"TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) + ;; 128- and 256-bit float vector modes (define_mode_iterator VF_128_256 [(V8SF "TARGET_AVX") V4SF @@ -318,6 +325,13 @@ (define_mode_iterator VF1_128_256VL (define_mode_iterator VF2 [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) +;; All DFmode & HFmode vector float modes +(define_mode_iterator VF2H + [(V32HF "TARGET_AVX512FP16") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) + ;; 128- and 256-bit DF vector modes (define_mode_iterator VF2_128_256 [(V4DF "TARGET_AVX") V2DF]) @@ -824,6 +838,7 @@ (define_mode_attr avx512fmaskmode (V32HI "SI") (V16HI "HI") (V8HI "QI") (V4HI "QI") (V16SI "HI") (V8SI "QI") (V4SI "QI") (V8DI "QI") (V4DI "QI") (V2DI "QI") + (V32HF "SI") (V16HF "HI") (V8HF "QI") (V16SF "HI") (V8SF "QI") (V4SF "QI") (V8DF "QI") (V4DF "QI") (V2DF "QI")]) @@ -842,6 +857,7 @@ (define_mode_attr avx512fmaskhalfmode (V32HI "HI") (V16HI "QI") (V8HI "QI") (V4HI "QI") (V16SI "QI") (V8SI "QI") (V4SI "QI") (V8DI "QI") (V4DI "QI") (V2DI "QI") + (V32HF "HI") (V16HF "QI") (V8HF "QI") (V16SF "QI") (V8SF "QI") (V4SF "QI") (V8DF "QI") (V4DF "QI") (V2DF "QI")]) @@ -1940,18 +1956,18 @@ (define_insn_and_split "*nabs<mode>2" [(set_attr "isa" "noavx,noavx,avx,avx")]) (define_expand "<insn><mode>3<mask_name><round_name>" - [(set (match_operand:VF 0 "register_operand") - (plusminus:VF - (match_operand:VF 1 "<round_nimm_predicate>") - (match_operand:VF 2 "<round_nimm_predicate>")))] + [(set (match_operand:VFH 0 "register_operand") + (plusminus:VFH + (match_operand:VFH 1 "<round_nimm_predicate>") + (match_operand:VFH 2 "<round_nimm_predicate>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") (define_insn "*<insn><mode>3<mask_name><round_name>" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (plusminus:VF - (match_operand:VF 1 "<bcst_round_nimm_predicate>" "<comm>0,v") - (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + (plusminus:VFH + (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "<comm>0,v") + (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ @@ -2002,18 +2018,18 @@ (define_insn "<sse>_vm<insn><mode>3<mask_scalar_name><round_scalar_name>" (set_attr "mode" "<ssescalarmode>")]) (define_expand "mul<mode>3<mask_name><round_name>" - [(set (match_operand:VF 0 "register_operand") - (mult:VF - (match_operand:VF 1 "<round_nimm_predicate>") - (match_operand:VF 2 "<round_nimm_predicate>")))] + [(set (match_operand:VFH 0 "register_operand") + (mult:VFH + (match_operand:VFH 1 "<round_nimm_predicate>") + (match_operand:VFH 2 "<round_nimm_predicate>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);") (define_insn "*mul<mode>3<mask_name><round_name>" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (mult:VF - (match_operand:VF 1 "<bcst_round_nimm_predicate>" "%0,v") - (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + (mult:VFH + 
(match_operand:VFH 1 "<bcst_round_nimm_predicate>" "%0,v") + (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands) && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ @@ -2067,9 +2083,9 @@ (define_insn "<sse>_vm<multdiv_mnemonic><mode>3<mask_scalar_name><round_scalar_n (set_attr "mode" "<ssescalarmode>")]) (define_expand "div<mode>3" - [(set (match_operand:VF2 0 "register_operand") - (div:VF2 (match_operand:VF2 1 "register_operand") - (match_operand:VF2 2 "vector_operand")))] + [(set (match_operand:VF2H 0 "register_operand") + (div:VF2H (match_operand:VF2H 1 "register_operand") + (match_operand:VF2H 2 "vector_operand")))] "TARGET_SSE2") (define_expand "div<mode>3" @@ -2090,10 +2106,10 @@ (define_expand "div<mode>3" }) (define_insn "<sse>_div<mode>3<mask_name><round_name>" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (div:VF - (match_operand:VF 1 "register_operand" "0,v") - (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + (div:VFH + (match_operand:VFH 1 "register_operand" "0,v") + (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ div<ssemodesuffix>\t{%2, %0|%0, %2} diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 477a89803fa..762383bfd11 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -24,6 +24,7 @@ (define_mode_iterator SUBST_V V32HI V16HI V8HI V16SI V8SI V4SI V8DI V4DI V2DI + V32HF V16HF V8HF V16SF V8SF V4SF V8DF V4DF V2DF]) @@ -35,6 +36,7 @@ (define_mode_iterator SUBST_A V32HI V16HI V8HI V16SI V8SI V4SI V8DI V4DI V2DI + V32HF V16HF V8HF V16SF V8SF V4SF V8DF V4DF V2DF QI HI SI DI SF DF]) @@ -142,7 +144,9 @@ (define_subst_attr "round_prefix" "round" "vex" "evex") (define_subst_attr "round_mode512bit_condition" "round" "1" "(<MODE>mode == V16SFmode || <MODE>mode == V8DFmode || <MODE>mode == V8DImode - || <MODE>mode == V16SImode)") + || <MODE>mode == V16SImode + || <MODE>mode == V32HFmode)") + (define_subst_attr "round_modev8sf_condition" "round" "1" "(<MODE>mode == V8SFmode)") (define_subst_attr "round_modev4sf_condition" "round" "1" "(<MODE>mode == V4SFmode)") (define_subst_attr "round_codefor" "round" "*" "") diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c index f3676077743..1eaee861141 100644 --- a/gcc/testsuite/gcc.target/i386/avx-1.c +++ b/gcc/testsuite/gcc.target/i386/avx-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16" } */ +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16 -mavx512vl" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> @@ -685,6 +685,12 @@ #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1) #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E) __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E) +/* avx512fp16intrin.h */ +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8) +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8) +#define 
__builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8) +#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8) + /* vpclmulqdqintrin.h */ #define __builtin_ia32_vpclmulqdq_v4di(A, B, C) __builtin_ia32_vpclmulqdq_v4di(A, B, 1) #define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1) diff --git a/gcc/testsuite/gcc.target/i386/avx-2.c b/gcc/testsuite/gcc.target/i386/avx-2.c index 1751c52565c..642ae4d7bfb 100644 --- a/gcc/testsuite/gcc.target/i386/avx-2.c +++ b/gcc/testsuite/gcc.target/i386/avx-2.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16" } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16 -mavx512vl" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c new file mode 100644 index 00000000000..28492fa3f7b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c @@ -0,0 +1,36 @@ +/* { dg-do compile} */ +/* { dg-options "-O2 -mavx512fp16" } */ + +#include <immintrin.h> +__m512h +__attribute__ ((noinline, noclone)) +vadd512 (__m512h a, __m512h b) +{ + return a + b; +} + +__m512h +__attribute__ ((noinline, noclone)) +vsub512 (__m512h a, __m512h b) +{ + return a - b; +} + +__m512h +__attribute__ ((noinline, noclone)) +vmul512 (__m512h a, __m512h b) +{ + return a * b; +} + +__m512h +__attribute__ ((noinline, noclone)) +vdiv512 (__m512h a, __m512h b) +{ + return a / b; +} + +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c new file mode 100644 index 00000000000..fc105152d2f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c @@ -0,0 +1,75 @@ +/* { dg-do run { target avx512fp16 } } */ +/* { dg-options "-O2 -mavx512fp16" } */ + +#include <string.h> +#include <stdlib.h> +static void do_test (void); + +#define DO_TEST do_test +#define AVX512FP16 +#include "avx512-check.h" +#include "avx512fp16-11a.c" + +/* Get random float16 between -50.x to 50.x. 
*/ +_Float16 +get_float16_noround() +{ + return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50) + + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0)); +} + +static void +do_test (void) +{ + _Float16 x[32]; + _Float16 y[32]; + _Float16 res_add[32]; + _Float16 res_sub[32]; + _Float16 res_mul[32]; + _Float16 res_div[32]; + for (int i = 0 ; i != 32; i++) + { + x[i] = get_float16_noround (); + y[i] = get_float16_noround (); + if (y[i] == 0) + y[i] = 1.0f; + res_add[i] = x[i] + y[i]; + res_sub[i] = x[i] - y[i]; + res_mul[i] = x[i] * y[i]; + res_div[i] = x[i] / y[i]; + + } + + union512h u512 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15], + x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23], + x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31] }; + union512h u512_1 = {y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7], + y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15], + y[16], y[17], y[18], y[19], y[20], y[21], y[22], y[23], + y[24], y[25], y[26], y[27], y[28], y[29], y[30], y[31] }; + + __m512h v512; + union512h a512; + + memset (&v512, -1, sizeof (v512)); + v512 = vadd512 (u512.x, u512_1.x); + a512.x = v512; + if (check_union512h (a512, res_add)) + abort (); + memset (&v512, -1, sizeof (v512)); + v512 = vsub512 (u512.x, u512_1.x); + a512.x = v512; + if (check_union512h (a512, res_sub)) + abort (); + memset (&v512, -1, sizeof (v512)); + v512 = vmul512 (u512.x, u512_1.x); + a512.x = v512; + if (check_union512h (a512, res_mul)) + abort (); + memset (&v512, -1, sizeof (v512)); + v512 = vdiv512 (u512.x, u512_1.x); + a512.x = v512; + if (check_union512h (a512, res_div)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c new file mode 100644 index 00000000000..a8c6296f504 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c @@ -0,0 +1,68 @@ +/* { dg-do compile} */ +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ + +#include <immintrin.h> +__m128h +__attribute__ ((noinline, noclone)) +vadd128 (__m128h a, __m128h b) +{ + return a + b; +} + +__m256h +__attribute__ ((noinline, noclone)) +vadd256 (__m256h a, __m256h b) +{ + return a + b; +} + +__m128h +__attribute__ ((noinline, noclone)) +vsub128 (__m128h a, __m128h b) +{ + return a - b; +} + +__m256h +__attribute__ ((noinline, noclone)) +vsub256 (__m256h a, __m256h b) +{ + return a - b; +} + +__m128h +__attribute__ ((noinline, noclone)) +vmul128 (__m128h a, __m128h b) +{ + return a * b; +} + +__m256h +__attribute__ ((noinline, noclone)) +vmul256 (__m256h a, __m256h b) +{ + return a * b; +} + +__m128h +__attribute__ ((noinline, noclone)) +vdiv128 (__m128h a, __m128h b) +{ + return a / b; +} + +__m256h +__attribute__ ((noinline, noclone)) +vdiv256 (__m256h a, __m256h b) +{ + return a / b; +} + +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */ +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */ diff --git 
a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c new file mode 100644 index 00000000000..b8d3e8a4e96 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c @@ -0,0 +1,96 @@ +/* { dg-do run { target avx512fp16 } } */ +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ + +#include <string.h> +#include <stdlib.h> +static void do_test (void); + +#define DO_TEST do_test +#define AVX512FP16 +#include "avx512-check.h" +#include "avx512vlfp16-11a.c" + +/* Get random float16 between -50.x to 50.x. */ +_Float16 +get_float16_noround() +{ + return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50) + + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0)); +} + +static void +do_test (void) +{ + _Float16 x[16]; + _Float16 y[16]; + _Float16 res_add[16]; + _Float16 res_sub[16]; + _Float16 res_mul[16]; + _Float16 res_div[16]; + for (int i = 0 ; i != 16; i++) + { + x[i] = get_float16_noround (); + y[i] = get_float16_noround (); + if (y[i] == 0) + y[i] = 1.0f; + res_add[i] = x[i] + y[i]; + res_sub[i] = x[i] - y[i]; + res_mul[i] = x[i] * y[i]; + res_div[i] = x[i] / y[i]; + + } + + union128h u128 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7] }; + union128h u128_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7] }; + union256h u256 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] }; + union256h u256_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7], + y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15]}; + + __m128h v128; + __m256h v256; + union128h a128; + union256h a256; + + memset (&v128, -1, sizeof (v128)); + v128 = vadd128 (u128.x, u128_1.x); + a128.x = v128; + if (check_union128h (a128, res_add)) + abort (); + memset (&v128, -1, sizeof (v128)); + v128 = vsub128 (u128.x, u128_1.x); + a128.x = v128; + if (check_union128h (a128, res_sub)) + abort (); + memset (&v128, -1, sizeof (v128)); + v128 = vmul128 (u128.x, u128_1.x); + a128.x = v128; + if (check_union128h (a128, res_mul)) + abort (); + memset (&v128, -1, sizeof (v128)); + v128 = vdiv128 (u128.x, u128_1.x); + a128.x = v128; + if (check_union128h (a128, res_div)) + abort (); + + memset (&v256, -1, sizeof (v256)); + v256 = vadd256 (u256.x, u256_1.x); + a256.x = v256; + if (check_union256h (a256, res_add)) + abort (); + memset (&v256, -1, sizeof (v256)); + v256 = vsub256 (u256.x, u256_1.x); + a256.x = v256; + if (check_union256h (a256, res_sub)) + abort (); + memset (&v256, -1, sizeof (v256)); + v256 = vmul256 (u256.x, u256_1.x); + a256.x = v256; + if (check_union256h (a256, res_mul)) + abort (); + memset (&v256, -1, sizeof (v256)); + v256 = vdiv256 (u256.x, u256_1.x); + a256.x = v256; + if (check_union256h (a256, res_div)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c index f5f5c113612..50ed74cd6d6 100644 --- a/gcc/testsuite/gcc.target/i386/sse-13.c +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -702,6 +702,12 @@ #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1) #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E) __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E) +/* avx512fp16intrin.h */ +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8) +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8) +#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8) +#define 
__builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8) + /* vpclmulqdqintrin.h */ #define __builtin_ia32_vpclmulqdq_v4di(A, B, C) __builtin_ia32_vpclmulqdq_v4di(A, B, 1) #define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c index 747d504cedb..26a5e94c7ca 100644 --- a/gcc/testsuite/gcc.target/i386/sse-14.c +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -667,6 +667,20 @@ test_3 (_mm512_mask_rcp28_round_ps, __m512, __m512, __mmask16, __m512, 8) test_3 (_mm512_mask_rsqrt28_round_pd, __m512d, __m512d, __mmask8, __m512d, 8) test_3 (_mm512_mask_rsqrt28_round_ps, __m512, __m512, __mmask16, __m512, 8) +/* avx512fp16intrin.h */ +test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8) +test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8) +test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8) +test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8) +test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) +test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) +test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) +test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) +test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) +test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) +test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) +test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) + /* shaintrin.h */ test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c index 33411969901..8d25effd724 100644 --- a/gcc/testsuite/gcc.target/i386/sse-22.c +++ b/gcc/testsuite/gcc.target/i386/sse-22.c @@ -772,6 +772,20 @@ test_2 (_mm_rcp28_round_ss, __m128, __m128, __m128, 8) test_2 (_mm_rsqrt28_round_sd, __m128d, __m128d, __m128d, 8) test_2 (_mm_rsqrt28_round_ss, __m128, __m128, __m128, 8) +/* avx512fp16intrin.h */ +test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8) +test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8) +test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8) +test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8) +test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) +test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) +test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) +test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8) +test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) +test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) +test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) +test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8) + /* shaintrin.h */ test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c index 86590ca5ffb..f7dd5d7495c 100644 --- a/gcc/testsuite/gcc.target/i386/sse-23.c +++ b/gcc/testsuite/gcc.target/i386/sse-23.c @@ -703,6 +703,12 @@ #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1) #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, 
E) __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E) +/* avx512fp16intrin.h */ +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8) +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8) +#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8) +#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8) + /* vpclmulqdqintrin.h */ #define __builtin_ia32_vpclmulqdq_v4di(A, B, C) __builtin_ia32_vpclmulqdq_v4di(A, B, 1) #define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1)
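
For anyone who wants to try the new intrinsics, a minimal usage sketch (not part of the patch): it assumes the _mm512_set1_ph initializer added earlier in this series and the standard _MM_FROUND_* macros from immintrin.h. Build with -mavx512fp16, plus -mavx512vl for the 128/256-bit forms:

#include <immintrin.h>

/* 512-bit: masked add, then divide with embedded rounding.  */
__m512h
avg_masked (__m512h acc, __m512h x, __mmask32 m)
{
  /* Lanes whose mask bit is clear keep the value from acc.  */
  __m512h sum = _mm512_mask_add_ph (acc, m, acc, x);
  /* Embedded rounding is only available on the 512-bit forms;
     8 == _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, the same
     value the sse-13/sse-23 wrappers above pass.  */
  return _mm512_div_round_ph (sum, _mm512_set1_ph (2.0),
			      _MM_FROUND_TO_NEAREST_INT
			      | _MM_FROUND_NO_EXC);
}

/* 128-bit form, declared in avx512fp16vlintrin.h; additionally
   requires AVX512VL.  */
__m128h
add128 (__m128h a, __m128h b)
{
  return _mm_add_ph (a, b);
}

As in the avx512fp16-11a.c/avx512vlfp16-11a.c tests, the plain (unmasked, non-rounding) operations can equally be written with the +, -, * and / operators on __m512h/__m256h/__m128h values; the intrinsics above are only needed for masking and embedded rounding.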