Message ID: 20211013090659.43060-1-hongyu.wang@intel.com
State: New
Series: AVX512FP16: Adjust builtin for mask complex fma
On Wed, Oct 13, 2021 at 5:07 PM Hongyu Wang via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> The current mask/mask3 implementation for complex fma contains a
> duplicated parameter in the macro, which may cause an error at -O0.
> Refactor the macro implementation into builtins to avoid this
> potential error.
>
> For round intrinsics with NO_ROUND as input, ix86_erase_embedded_rounding
> erases the embedded rounding unspec but could break other emit_insn calls
> in the expanders.  Skip the expanders that emit multiple insns in this
> function and check the rounding in the expander with a subst.
>
> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}.
> OK for master?

Ok.

>
> gcc/ChangeLog:
>
>         * config/i386/avx512fp16intrin.h (_mm512_mask_fcmadd_pch):
>         Adjust builtin call.
>         (_mm512_mask3_fcmadd_pch): Likewise.
>         (_mm512_mask_fmadd_pch): Likewise.
>         (_mm512_mask3_fmadd_pch): Likewise.
>         (_mm512_mask_fcmadd_round_pch): Likewise.
>         (_mm512_mask3_fcmadd_round_pch): Likewise.
>         (_mm512_mask_fmadd_round_pch): Likewise.
>         (_mm512_mask3_fmadd_round_pch): Likewise.
>         (_mm_mask_fcmadd_sch): Likewise.
>         (_mm_mask3_fcmadd_sch): Likewise.
>         (_mm_mask_fmadd_sch): Likewise.
>         (_mm_mask3_fmadd_sch): Likewise.
>         (_mm_mask_fcmadd_round_sch): Likewise.
>         (_mm_mask3_fcmadd_round_sch): Likewise.
>         (_mm_mask_fmadd_round_sch): Likewise.
>         (_mm_mask3_fmadd_round_sch): Likewise.
>         (_mm_fcmadd_round_sch): Likewise.
>         * config/i386/avx512fp16vlintrin.h (_mm_mask_fmadd_pch):
>         Adjust builtin call.
>         (_mm_mask3_fmadd_pch): Likewise.
>         (_mm256_mask_fmadd_pch): Likewise.
>         (_mm256_mask3_fmadd_pch): Likewise.
>         (_mm_mask_fcmadd_pch): Likewise.
>         (_mm_mask3_fcmadd_pch): Likewise.
>         (_mm256_mask_fcmadd_pch): Likewise.
>         (_mm256_mask3_fcmadd_pch): Likewise.
>         * config/i386/i386-builtin.def: Add mask3 builtin for complex
>         fma, and adjust mask_builtin to corresponding expander.
>         * config/i386/i386-expand.c (ix86_expand_round_builtin):
>         Skip erasing embedded rounding for expanders that emit
>         multiple insns.
>         * config/i386/sse.md (complexmove): New mode_attr.
>         (<avx512>_fmaddc_<mode>_mask1<round_expand_name>): New expander.
>         (<avx512>_fcmaddc_<mode>_mask1<round_expand_name>): Likewise.
>         (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>): Likewise.
>         (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise.
>         (avx512fp16_fcmaddcsh_v8hf_mask3<round_expand_name>): Likewise.
>         (avx512fp16_fmaddcsh_v8hf_mask3<round_expand_name>): Likewise.
>         * config/i386/subst.md (round_embedded_complex): New subst.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx-1.c: Add new mask3 builtins.
>         * gcc.target/i386/sse-13.c: Ditto.
>         * gcc.target/i386/sse-23.c: Ditto.
>         * gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Add scanning for
>         mask/mask3 intrinsic.
>         * gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto.
>         * gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: New test for
>         -mavx512vl.
>         * gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto.
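[As a concrete illustration of the macro hazard described in the cover
letter — a minimal, hypothetical C sketch, not the actual header code: a
function-like macro that expands the same parameter twice evaluates any
side effects in that argument twice, which is what the old mask/mask3
macros did with their first operand before this refactor to builtins.]

#include <stdio.h>

static int fma_op (int a, int c, int d) { return a * c + d; }
static int blend (int m, int x, int y) { return m ? x : y; }

/* Hypothetical stand-in for the old pattern: (a) is expanded twice,
   once as the FMA input and once as the blend fallback.  */
#define MASK_FMA_MACRO(a, c, d, m)  blend ((m), fma_op ((a), (c), (d)), (a))

int
main (void)
{
  int i = 0;
  int r = MASK_FMA_MACRO (i++, 2, 3, 1);
  /* i++ is evaluated twice (i ends up as 2), and r depends on the
     unspecified evaluation order of the duplicated argument.  */
  printf ("r=%d i=%d\n", r, i);
  return 0;
}

[Moving the masking into a single builtin call, as the patch below does,
makes each argument an ordinary function argument that is evaluated
exactly once.]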
> ---
>  gcc/config/i386/avx512fp16intrin.h            | 261 ++++++------------
>  gcc/config/i386/avx512fp16vlintrin.h          |  56 ++--
>  gcc/config/i386/i386-builtin.def              |  24 +-
>  gcc/config/i386/i386-expand.c                 |  22 +-
>  gcc/config/i386/sse.md                        | 183 ++++++++++++
>  gcc/config/i386/subst.md                      |   3 +
>  gcc/testsuite/gcc.target/i386/avx-1.c         |   4 +
>  .../i386/avx512fp16-vfcmaddcsh-1a.c           |   2 +
>  .../i386/avx512fp16-vfcmaddcsh-1c.c           |  13 +
>  .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c |   2 +
>  .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c |  13 +
>  gcc/testsuite/gcc.target/i386/sse-13.c        |   4 +
>  gcc/testsuite/gcc.target/i386/sse-23.c        |   4 +
>  13 files changed, 375 insertions(+), 216 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
>
> diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
> index 29cf6792335..5e49447a020 100644
> --- a/gcc/config/i386/avx512fp16intrin.h
> +++ b/gcc/config/i386/avx512fp16intrin.h
> @@ -6258,13 +6258,11 @@ extern __inline __m512h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
>  {
> -  return (__m512h) __builtin_ia32_movaps512_mask
> -    ((__v16sf)
> -     __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
> -                                              (__v32hf) __C,
> -                                              (__v32hf) __D, __B,
> -                                              _MM_FROUND_CUR_DIRECTION),
> -     (__v16sf) __A, __B);
> +  return (__m512h)
> +    __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
> +                                             (__v32hf) __C,
> +                                             (__v32hf) __D, __B,
> +                                             _MM_FROUND_CUR_DIRECTION);
>  }
>
>  extern __inline __m512h
> @@ -6272,10 +6270,10 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask3_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D)
>  {
>    return (__m512h)
> -    __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
> -                                             (__v32hf) __B,
> -                                             (__v32hf) __C,
> -                                             __D, _MM_FROUND_CUR_DIRECTION);
> +    __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A,
> +                                              (__v32hf) __B,
> +                                              (__v32hf) __C,
> +                                              __D, _MM_FROUND_CUR_DIRECTION);
>  }
>
>  extern __inline __m512h
> @@ -6304,13 +6302,11 @@ extern __inline __m512h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_fmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
>  {
> -  return (__m512h) __builtin_ia32_movaps512_mask
> -    ((__v16sf)
> -     __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
> -                                             (__v32hf) __C,
> -                                             (__v32hf) __D, __B,
> -                                             _MM_FROUND_CUR_DIRECTION),
> -     (__v16sf) __A, __B);
> +  return (__m512h)
> +    __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
> +                                            (__v32hf) __C,
> +                                            (__v32hf) __D, __B,
> +                                            _MM_FROUND_CUR_DIRECTION);
>  }
>
>  extern __inline __m512h
> @@ -6318,10 +6314,10 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask3_fmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D)
>  {
>    return (__m512h)
> -    __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
> -                                            (__v32hf) __B,
> -                                            (__v32hf) __C,
> -                                            __D, _MM_FROUND_CUR_DIRECTION);
> +    __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A,
> +                                             (__v32hf) __B,
> +                                             (__v32hf) __C,
> +                                             __D, _MM_FROUND_CUR_DIRECTION);
>  }
>
>  extern __inline __m512h
> @@ -6352,13 +6348,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_fcmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
>                                __m512h __D, const int __E)
>  {
> -  return (__m512h) __builtin_ia32_movaps512_mask
> -    ((__v16sf)
> -     __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
> -                                              (__v32hf) __C,
> -                                              (__v32hf) __D, __B,
> -                                              __E),
> -     (__v16sf) __A, __B);
> +  return (__m512h)
> +    __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
> +                                             (__v32hf) __C,
> +                                             (__v32hf) __D, __B,
> +                                             __E);
>  }
>
>  extern __inline __m512h
> @@ -6367,10 +6361,10 @@ _mm512_mask3_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C,
>                                 __mmask16 __D, const int __E)
>  {
>    return (__m512h)
> -    __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
> -                                             (__v32hf) __B,
> -                                             (__v32hf) __C,
> -                                             __D, __E);
> +    __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A,
> +                                              (__v32hf) __B,
> +                                              (__v32hf) __C,
> +                                              __D, __E);
>  }
>
>  extern __inline __m512h
> @@ -6401,13 +6395,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_fmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
>                               __m512h __D, const int __E)
>  {
> -  return (__m512h) __builtin_ia32_movaps512_mask
> -    ((__v16sf)
> -     __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
> -                                             (__v32hf) __C,
> -                                             (__v32hf) __D, __B,
> -                                             __E),
> -     (__v16sf) __A, __B);
> +  return (__m512h)
> +    __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
> +                                            (__v32hf) __C,
> +                                            (__v32hf) __D, __B,
> +                                            __E);
>  }
>
>  extern __inline __m512h
> @@ -6416,10 +6408,10 @@ _mm512_mask3_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C,
>                                __mmask16 __D, const int __E)
>  {
>    return (__m512h)
> -    __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
> -                                            (__v32hf) __B,
> -                                            (__v32hf) __C,
> -                                            __D, __E);
> +    __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A,
> +                                             (__v32hf) __B,
> +                                             (__v32hf) __C,
> +                                             __D, __E);
>  }
>
>  extern __inline __m512h
> @@ -6439,18 +6431,16 @@ _mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C,
>    (__m512h) __builtin_ia32_vfcmaddcph512_round ((A), (B), (C), (D))
>
>  #define _mm512_mask_fcmadd_round_pch(A, B, C, D, E) \
> -  ((__m512h) __builtin_ia32_movaps512_mask ( \
> -   (__v16sf) \
> +  ((__m512h) \
>     __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) (A), \
>                                              (__v32hf) (C), \
>                                              (__v32hf) (D), \
> -                                            (B), (E)), \
> -   (__v16sf) (A), (B)));
> +                                            (B), (E)))
>
>
>  #define _mm512_mask3_fcmadd_round_pch(A, B, C, D, E) \
>    ((__m512h) \
> -   __builtin_ia32_vfcmaddcph512_mask_round ((A), (B), (C), (D), (E)))
> +   __builtin_ia32_vfcmaddcph512_mask3_round ((A), (B), (C), (D), (E)))
>
>  #define _mm512_maskz_fcmadd_round_pch(A, B, C, D, E) \
>    (__m512h) \
> @@ -6460,17 +6450,15 @@ _mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C,
>    (__m512h) __builtin_ia32_vfmaddcph512_round ((A), (B), (C), (D))
>
>  #define _mm512_mask_fmadd_round_pch(A, B, C, D, E) \
> -  ((__m512h) __builtin_ia32_movaps512_mask ( \
> -   (__v16sf) \
> +  ((__m512h) \
>     __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) (A), \
>                                             (__v32hf) (C), \
>                                             (__v32hf) (D), \
> -                                           (B), (E)), \
> -   (__v16sf) (A), (B)));
> +                                           (B), (E)))
>
>  #define _mm512_mask3_fmadd_round_pch(A, B, C, D, E) \
>    (__m512h) \
> -  __builtin_ia32_vfmaddcph512_mask_round ((A), (B), (C), (D), (E))
> +  __builtin_ia32_vfmaddcph512_mask3_round ((A), (B), (C), (D), (E))
>
>  #define _mm512_maskz_fmadd_round_pch(A, B, C, D, E) \
>    (__m512h) \
> @@ -6643,35 +6631,22 @@ extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_fcmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
>  {
> -#ifdef __AVX512VL__
> -  return (__m128h) __builtin_ia32_movaps128_mask (
> -   (__v4sf)
> -   __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
> -                                         (__v8hf) __C,
> -                                         (__v8hf) __D, __B,
> -                                         _MM_FROUND_CUR_DIRECTION),
> -   (__v4sf) __A, __B);
> -#else
> -  return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A,
> -   (__v4sf)
> +  return (__m128h)
>     __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
>                                           (__v8hf) __C,
>                                           (__v8hf) __D, __B,
> -                                         _MM_FROUND_CUR_DIRECTION),
> -   (__v4sf) _mm_set_ss ((float) ((int) __B << 31)));
> -#endif
> +                                         _MM_FROUND_CUR_DIRECTION);
>  }
>
>  extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask3_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
>  {
> -  return (__m128h) _mm_move_ss ((__m128) __C,
> -                                (__m128)
> -                                __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
> -                                                                      (__v8hf) __B,
> -                                                                      (__v8hf) __C, __D,
> -                                                                      _MM_FROUND_CUR_DIRECTION));
> +  return (__m128h)
> +    __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A,
> +                                           (__v8hf) __B,
> +                                           (__v8hf) __C, __D,
> +                                           _MM_FROUND_CUR_DIRECTION);
>  }
>
>  extern __inline __m128h
> @@ -6700,35 +6675,22 @@ extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_fmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
>  {
> -#ifdef __AVX512VL__
> -  return (__m128h) __builtin_ia32_movaps128_mask (
> -   (__v4sf)
> -   __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
> -                                        (__v8hf) __C,
> -                                        (__v8hf) __D, __B,
> -                                        _MM_FROUND_CUR_DIRECTION),
> -   (__v4sf) __A, __B);
> -#else
> -  return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A,
> -   (__v4sf)
> +  return (__m128h)
>     __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
>                                          (__v8hf) __C,
>                                          (__v8hf) __D, __B,
> -                                        _MM_FROUND_CUR_DIRECTION),
> -   (__v4sf) _mm_set_ss ((float) ((int) __B << 31)));
> -#endif
> +                                        _MM_FROUND_CUR_DIRECTION);
>  }
>
>  extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask3_fmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
>  {
> -  return (__m128h) _mm_move_ss ((__m128) __C,
> -                                (__m128)
> -                                __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
> -                                                                     (__v8hf) __B,
> -                                                                     (__v8hf) __C, __D,
> -                                                                     _MM_FROUND_CUR_DIRECTION));
> +  return (__m128h)
> +    __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A,
> +                                          (__v8hf) __B,
> +                                          (__v8hf) __C, __D,
> +                                          _MM_FROUND_CUR_DIRECTION);
>  }
>
>  extern __inline __m128h
> @@ -6759,23 +6721,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_fcmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
>                             __m128h __D, const int __E)
>  {
> -#ifdef __AVX512VL__
> -  return (__m128h) __builtin_ia32_movaps128_mask (
> -   (__v4sf)
> -   __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
> -                                         (__v8hf) __C,
> -                                         (__v8hf) __D,
> -                                         __B, __E),
> -   (__v4sf) __A, __B);
> -#else
> -  return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A,
> -   (__v4sf)
> +  return (__m128h)
>     __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
>                                           (__v8hf) __C,
>                                           (__v8hf) __D,
> -                                         __B, __E),
> -   (__v4sf) _mm_set_ss ((float) ((int) __B << 31)));
> -#endif
> +                                         __B, __E);
>  }
>
>  extern __inline __m128h
> @@ -6783,12 +6733,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask3_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C,
>                              __mmask8 __D, const int __E)
>  {
> -  return (__m128h) _mm_move_ss ((__m128) __C,
> -                                (__m128)
> -                                __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
> -                                                                      (__v8hf) __B,
> -                                                                      (__v8hf) __C,
> -                                                                      __D, __E));
> +  return (__m128h)
> +    __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A,
> +                                           (__v8hf) __B,
> +                                           (__v8hf) __C,
> +                                           __D, __E);
>  }
>
>  extern __inline __m128h
> @@ -6819,23 +6768,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_fmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
>                            __m128h __D, const int __E)
>  {
> -#ifdef __AVX512VL__
> -  return (__m128h) __builtin_ia32_movaps128_mask (
> -   (__v4sf)
> -   __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
> -                                        (__v8hf) __C,
> -                                        (__v8hf) __D,
> -                                        __B, __E),
> -   (__v4sf) __A, __B);
> -#else
> -  return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A,
> -   (__v4sf)
> +  return (__m128h)
>     __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
>                                          (__v8hf) __C,
>                                          (__v8hf) __D,
> -                                        __B, __E),
> -   (__v4sf) _mm_set_ss ((float) ((int) __B << 31)));
> -#endif
> +                                        __B, __E);
>  }
>
>  extern __inline __m128h
> @@ -6843,12 +6780,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask3_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C,
>                             __mmask8 __D, const int __E)
>  {
> -  return (__m128h) _mm_move_ss ((__m128) __C,
> -                                (__m128)
> -                                __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
> -                                                                     (__v8hf) __B,
> -                                                                     (__v8hf) __C,
> -                                                                     __D, __E));
> +  return (__m128h)
> +    __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A,
> +                                          (__v8hf) __B,
> +                                          (__v8hf) __C,
> +                                          __D, __E);
>  }
>
>  extern __inline __m128h
> @@ -6874,34 +6810,20 @@ _mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D)
>                                        __D);
>  }
>  #else
> -#ifdef __AVX512VL__
>  #define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \
> -  ((__m128h) __builtin_ia32_movaps128_mask ( \
> -   (__v4sf) \
> +  ((__m128h) \
>     __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \
>                                           (__v8hf) (C), \
>                                           (__v8hf) (D), \
> -                                         (B), (E)), \
> -   (__v4sf) (A), (B)))
> +                                         (B), (E)))
>
> -#else
> -#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \
> -  ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \
> -   (__v4sf) \
> -   __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \
> -                                         (__v8hf) (C), \
> -                                         (__v8hf) (D), \
> -                                         (B), (E)), \
> -   (__v4sf) _mm_set_ss ((float) ((int) (B) << 31))))
> -#endif
>
>  #define _mm_mask3_fcmadd_round_sch(A, B, C, D, E) \
> -  ((__m128h) _mm_move_ss ((__m128) (C), \
> -   (__m128) \
> -   __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \
> -                                         (__v8hf) (B), \
> -                                         (__v8hf) (C), \
> -                                         (D), (E))))
> +  ((__m128h) \
> +   __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) (A), \
> +                                          (__v8hf) (B), \
> +                                          (__v8hf) (C), \
> +                                          (D), (E)))
>
>  #define _mm_maskz_fcmadd_round_sch(A, B, C, D, E) \
>    __builtin_ia32_vfcmaddcsh_maskz_round ((B), (C), (D), (A), (E))
> @@ -6909,34 +6831,19 @@ _mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D)
>  #define _mm_fcmadd_round_sch(A, B, C, D) \
>    __builtin_ia32_vfcmaddcsh_round ((A), (B), (C), (D))
>
> -#ifdef __AVX512VL__
>  #define _mm_mask_fmadd_round_sch(A, B, C, D, E) \
> -  ((__m128h) __builtin_ia32_movaps128_mask ( \
> -   (__v4sf) \
> +  ((__m128h) \
>     __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \
>                                          (__v8hf) (C), \
>                                          (__v8hf) (D), \
> -                                        (B), (E)), \
> -   (__v4sf) (A), (B)))
> -
> -#else
> -#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \
> -  ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \
> -   (__v4sf) \
> -   __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \
> -                                        (__v8hf) (C), \
> -                                        (__v8hf) (D), \
> -                                        (B), (E)), \
> -   (__v4sf) _mm_set_ss ((float) ((int) (B) << 31))))
> -#endif
> +                                        (B), (E)))
>
>  #define _mm_mask3_fmadd_round_sch(A, B, C, D, E) \
> -  ((__m128h) _mm_move_ss ((__m128) (C), \
> -   (__m128) \
> -   __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \
> -                                        (__v8hf) (B), \
> -                                        (__v8hf) (C), \
> -                                        (D), (E))))
> +  ((__m128h) \
> +   __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) (A), \
> +                                         (__v8hf) (B), \
> +                                         (__v8hf) (C), \
> +                                         (D), (E)))
>
>  #define _mm_maskz_fmadd_round_sch(A, B, C, D, E) \
>    __builtin_ia32_vfmaddcsh_maskz_round ((B), (C), (D), (A), (E))
> diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h
> index 3d3de964224..0b1f1cbe6ad 100644
> --- a/gcc/config/i386/avx512fp16vlintrin.h
> +++ b/gcc/config/i386/avx512fp16vlintrin.h
> @@ -2898,21 +2898,20 @@ extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_fmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
>  {
> -  return (__m128h) __builtin_ia32_movaps128_mask
> -    ((__v4sf)
> -     __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A,
> -                                       (__v8hf) __C,
> -                                       (__v8hf) __D, __B),
> -     (__v4sf) __A, __B);
> +  return (__m128h)
> +    __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A,
> +                                      (__v8hf) __C,
> +                                      (__v8hf) __D, __B);
>  }
>
>  extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask3_fmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
>  {
> -  return (__m128h) __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A,
> -                                                     (__v8hf) __B,
> -                                                     (__v8hf) __C, __D);
> +  return (__m128h)
> +    __builtin_ia32_vfmaddcph128_mask3 ((__v8hf) __A,
> +                                       (__v8hf) __B,
> +                                       (__v8hf) __C, __D);
>  }
>
>  extern __inline __m128h
> @@ -2937,21 +2936,20 @@ extern __inline __m256h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_mask_fmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
>  {
> -  return (__m256h) __builtin_ia32_movaps256_mask
> -    ((__v8sf)
> +  return (__m256h)
>      __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A,
>                                        (__v16hf) __C,
> -                                      (__v16hf) __D, __B),
> -     (__v8sf) __A, __B);
> +                                      (__v16hf) __D, __B);
>  }
>
>  extern __inline __m256h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_mask3_fmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D)
>  {
> -  return (__m256h) __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A,
> -                                                     (__v16hf) __B,
> -                                                     (__v16hf) __C, __D);
> +  return (__m256h)
> +    __builtin_ia32_vfmaddcph256_mask3 ((__v16hf) __A,
> +                                       (__v16hf) __B,
> +                                       (__v16hf) __C, __D);
>  }
>
>  extern __inline __m256h
> @@ -2976,21 +2974,20 @@ extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_fcmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
>  {
> -  return (__m128h) __builtin_ia32_movaps128_mask
> -    ((__v4sf)
> +  return (__m128h)
>      __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A,
>                                         (__v8hf) __C,
> -                                       (__v8hf) __D, __B),
> -     (__v4sf) __A, __B);
> +                                       (__v8hf) __D, __B);
>  }
>
>  extern __inline __m128h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask3_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
>  {
> -  return (__m128h) __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A,
> -                                                      (__v8hf) __B,
> -                                                      (__v8hf) __C, __D);
> +  return (__m128h)
> +    __builtin_ia32_vfcmaddcph128_mask3 ((__v8hf) __A,
> +                                        (__v8hf) __B,
> +                                        (__v8hf) __C, __D);
>  }
>
>  extern __inline __m128h
> @@ -3015,21 +3012,20 @@ extern __inline __m256h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_mask_fcmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
>  {
> -  return (__m256h) __builtin_ia32_movaps256_mask
> -    ((__v8sf)
> +  return (__m256h)
>      __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A,
>                                         (__v16hf) __C,
> -                                       (__v16hf) __D, __B),
> -     (__v8sf) __A, __B);
> +                                       (__v16hf) __D, __B);
>  }
>
>  extern __inline __m256h
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_mask3_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D)
>  {
> -  return (__m256h) __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A,
> -                                                      (__v16hf) __B,
> -                                                      (__v16hf) __C, __D);
> +  return (__m256h)
> +    __builtin_ia32_vfcmaddcph256_mask3 ((__v16hf) __A,
> +                                        (__v16hf) __B,
> +                                        (__v16hf) __C, __D);
>  }
>
>  extern __inline __m256h
> diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> index 302e1bc6502..99217d08d37 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -2912,16 +2912,20 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp1
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_mask3, "__builtin_ia32_vfnmsubph128_mask3", IX86_BUILTIN_VFNMSUBPH128_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_maskz, "__builtin_ia32_vfnmsubph128_maskz", IX86_BUILTIN_VFNMSUBPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v8hf, "__builtin_ia32_vfmaddcph128", IX86_BUILTIN_VFMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF)
> -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask, "__builtin_ia32_vfmaddcph128_mask", IX86_BUILTIN_VFMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask1, "__builtin_ia32_vfmaddcph128_mask", IX86_BUILTIN_VFMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask, "__builtin_ia32_vfmaddcph128_mask3", IX86_BUILTIN_VFMADDCPH128_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_maskz, "__builtin_ia32_vfmaddcph128_maskz", IX86_BUILTIN_VFMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v16hf, "__builtin_ia32_vfmaddcph256", IX86_BUILTIN_VFMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF)
> -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask, "__builtin_ia32_vfmaddcph256_mask", IX86_BUILTIN_VFMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask1, "__builtin_ia32_vfmaddcph256_mask", IX86_BUILTIN_VFMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask, "__builtin_ia32_vfmaddcph256_mask3", IX86_BUILTIN_VFMADDCPH256_MASK3, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_maskz, "__builtin_ia32_vfmaddcph256_maskz", IX86_BUILTIN_VFMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v8hf, "__builtin_ia32_vfcmaddcph128", IX86_BUILTIN_VFCMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF)
> -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask, "__builtin_ia32_vfcmaddcph128_mask", IX86_BUILTIN_VFCMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask1, "__builtin_ia32_vfcmaddcph128_mask", IX86_BUILTIN_VFCMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask, "__builtin_ia32_vfcmaddcph128_mask3", IX86_BUILTIN_VFCMADDCPH128_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_maskz, "__builtin_ia32_vfcmaddcph128_maskz", IX86_BUILTIN_VFCMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v16hf, "__builtin_ia32_vfcmaddcph256", IX86_BUILTIN_VFCMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF)
> -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask, "__builtin_ia32_vfcmaddcph256_mask", IX86_BUILTIN_VFCMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask1, "__builtin_ia32_vfcmaddcph256_mask", IX86_BUILTIN_VFCMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask, "__builtin_ia32_vfcmaddcph256_mask3", IX86_BUILTIN_VFCMADDCPH256_MASK3, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_maskz, "__builtin_ia32_vfcmaddcph256_maskz", IX86_BUILTIN_VFCMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf, "__builtin_ia32_vfcmulcph128", IX86_BUILTIN_VFCMULCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf_mask, "__builtin_ia32_vfcmulcph128_mask", IX86_BUILTIN_VFCMULCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> @@ -3222,20 +3226,24 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_mask3_roun
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_maskz_round, "__builtin_ia32_vfnmaddsh3_maskz", IX86_BUILTIN_VFNMADDSH3_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfmsub_v8hf_mask3_round, "__builtin_ia32_vfmsubsh3_mask3", IX86_BUILTIN_VFMSUBSH3_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v32hf_round, "__builtin_ia32_vfmaddcph512_round", IX86_BUILTIN_VFMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT)
> -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask_round, "__builtin_ia32_vfmaddcph512_mask_round", IX86_BUILTIN_VFMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round, "__builtin_ia32_vfmaddcph512_mask_round", IX86_BUILTIN_VFMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask_round, "__builtin_ia32_vfmaddcph512_mask3_round", IX86_BUILTIN_VFMADDCPH512_MASK3_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_maskz_round, "__builtin_ia32_vfmaddcph512_maskz_round", IX86_BUILTIN_VFMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v32hf_round, "__builtin_ia32_vfcmaddcph512_round", IX86_BUILTIN_VFCMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT)
> -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask_round, "__builtin_ia32_vfcmaddcph512_mask_round", IX86_BUILTIN_VFCMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round, "__builtin_ia32_vfcmaddcph512_mask_round", IX86_BUILTIN_VFCMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask_round, "__builtin_ia32_vfcmaddcph512_mask3_round", IX86_BUILTIN_VFCMADDCPH512_MASK3_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_maskz_round, "__builtin_ia32_vfcmaddcph512_maskz_round", IX86_BUILTIN_VFCMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_round, "__builtin_ia32_vfcmulcph512_round", IX86_BUILTIN_VFCMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_mask_round, "__builtin_ia32_vfcmulcph512_mask_round", IX86_BUILTIN_VFCMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_round, "__builtin_ia32_vfmulcph512_round", IX86_BUILTIN_VFMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_mask_round, "__builtin_ia32_vfmulcph512_mask_round", IX86_BUILTIN_VFMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fcmaddcsh_v8hf_round, "__builtin_ia32_vfcmaddcsh_round", IX86_BUILTIN_VFCMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT)
> -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask_round, "__builtin_ia32_vfcmaddcsh_mask_round", IX86_BUILTIN_VFCMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round, "__builtin_ia32_vfcmaddcsh_mask_round", IX86_BUILTIN_VFCMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round, "__builtin_ia32_vfcmaddcsh_mask3_round", IX86_BUILTIN_VFCMADDCSH_MASK3_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfcmaddcsh_maskz_round", IX86_BUILTIN_VFCMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fmaddcsh_v8hf_round, "__builtin_ia32_vfmaddcsh_round", IX86_BUILTIN_VFMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT)
> -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask_round, "__builtin_ia32_vfmaddcsh_mask_round", IX86_BUILTIN_VFMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round, "__builtin_ia32_vfmaddcsh_mask_round", IX86_BUILTIN_VFMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round, "__builtin_ia32_vfmaddcsh_mask3_round", IX86_BUILTIN_VFMADDCSH_MASK3_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfmaddcsh_maskz_round", IX86_BUILTIN_VFMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_round, "__builtin_ia32_vfcmulcsh_round", IX86_BUILTIN_VFCMULCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT)
>  BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_mask_round, "__builtin_ia32_vfcmulcsh_mask_round", IX86_BUILTIN_VFCMULCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index c0924a59efb..de4fe9ce147 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -10877,7 +10877,27 @@ ix86_expand_round_builtin (const struct builtin_description *d,
>
>        /* If there is no rounding use normal version of the pattern.  */
>        if (INTVAL (op) == NO_ROUND)
> -       redundant_embed_rnd = 1;
> +       {
> +         /* Skip erasing embedded rounding for the expanders below,
> +            which generate multiple insns.  In ix86_erase_embedded_rounding
> +            the pattern will be transformed to a single set, and emit_insn
> +            appends the set instead of inserting it into the chain, so the
> +            insns emitted inside the define_expand would be ignored.  */
> +         switch (icode)
> +           {
> +           case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
> +           case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
> +           case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
> +           case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
> +           case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
> +           case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
> +             redundant_embed_rnd = 0;
> +             break;
> +           default:
> +             redundant_embed_rnd = 1;
> +             break;
> +           }
> +       }
>      }
>    else
>      {
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index a3c4a3f1e62..bdd34f379dd 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -5864,6 +5864,34 @@
>     (UNSPEC_COMPLEX_FMUL "fmulc")
>     (UNSPEC_COMPLEX_FCMUL "fcmulc")])
>
> +(define_mode_attr complexmove
> +  [(V32HF "avx512f_loadv16sf")
> +   (V16HF "avx512vl_loadv8sf")
> +   (V8HF "avx512vl_loadv4sf")])
> +
> +(define_expand "<avx512>_fmaddc_<mode>_mask1<round_expand_name>"
> +  [(match_operand:VF_AVX512FP16VL 0 "register_operand")
> +   (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
> +   (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>")
> +   (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>")
> +   (match_operand:<avx512fmaskcmode> 4 "register_operand")]
> +  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> +{
> +  rtx op0, op1;
> +  if (<round_embedded_complex>)
> +    emit_insn (gen_<avx512>_fmaddc_<mode>_mask<round_expand_name> (
> +               operands[0], operands[1], operands[2], operands[3],
> +               operands[4]<round_expand_operand>));
> +  else
> +    emit_insn (gen_<avx512>_fmaddc_<mode>_mask (operands[0],
> +               operands[1], operands[2], operands[3], operands[4]));
> +
> +  op0 = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
> +  op1 = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
> +  emit_insn (gen_<complexmove>_mask (op0, op0, op1, operands[4]));
> +  DONE;
> +})
> +
>  (define_expand "<avx512>_fmaddc_<mode>_maskz<round_expand_name>"
>    [(match_operand:VF_AVX512FP16VL 0 "register_operand")
>     (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
> @@ -5878,6 +5906,31 @@
>    DONE;
>  })
>
> +(define_expand "<avx512>_fcmaddc_<mode>_mask1<round_expand_name>"
> +  [(match_operand:VF_AVX512FP16VL 0 "register_operand")
> +   (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
> +   (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>")
> +   (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>")
> +   (match_operand:<avx512fmaskcmode> 4 "register_operand")]
> +  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> +{
> +  rtx op0, op1;
> +  if (<round_embedded_complex>)
> +    emit_insn (gen_<avx512>_fcmaddc_<mode>_mask<round_expand_name> (
> +               operands[0], operands[1], operands[2], operands[3],
> +               operands[4]<round_expand_operand>));
> +  else
> +    {
> +      emit_insn (gen_<avx512>_fcmaddc_<mode>_mask (operands[0],
> +                 operands[1], operands[2], operands[3], operands[4]));
> +    }
> +
> +  op0 = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
> +  op1 = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
> +  emit_insn (gen_<complexmove>_mask (op0, op0, op1, operands[4]));
> +  DONE;
> +})
> +
>  (define_expand "<avx512>_fcmaddc_<mode>_maskz<round_expand_name>"
>    [(match_operand:VF_AVX512FP16VL 0 "register_operand")
>     (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
> @@ -5946,6 +5999,47 @@
>    DONE;
>  })
>
> +(define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
> +  [(match_operand:V8HF 0 "register_operand")
> +   (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
> +   (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
> +   (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
> +   (match_operand:QI 4 "register_operand")]
> +  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> +{
> +  rtx op0, op1;
> +
> +  if (<round_embedded_complex>)
> +    emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
> +               operands[0], operands[1], operands[2], operands[3],
> +               operands[4]<round_expand_operand>));
> +  else
> +    emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
> +               operands[1], operands[2], operands[3], operands[4]));
> +
> +  if (TARGET_AVX512VL)
> +    {
> +      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> +      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> +      emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> +    }
> +  else
> +    {
> +      rtx mask, tmp, vec_mask;
> +      mask = lowpart_subreg (SImode, operands[4], QImode),
> +      tmp = gen_reg_rtx (SImode);
> +      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> +      vec_mask = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
> +      emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
> +      vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
> +      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> +      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> +      emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
> +    }
> +  DONE;
> +})
> +
>  (define_expand "avx512fp16_fcmaddcsh_v8hf_maskz<round_expand_name>"
>    [(match_operand:V8HF 0 "register_operand")
>     (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
> @@ -5960,6 +6054,95 @@
>    DONE;
>  })
>
> +(define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
> +  [(match_operand:V8HF 0 "register_operand")
> +   (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
> +   (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
> +   (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
> +   (match_operand:QI 4 "register_operand")]
> +  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> +{
> +  rtx op0, op1;
> +
> +  if (<round_embedded_complex>)
> +    emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
> +               operands[0], operands[1], operands[2], operands[3],
> +               operands[4]<round_expand_operand>));
> +  else
> +    emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
> +               operands[1], operands[2], operands[3], operands[4]));
> +
> +  if (TARGET_AVX512VL)
> +    {
> +      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> +      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> +      emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> +    }
> +  else
> +    {
> +      rtx mask, tmp, vec_mask;
> +      mask = lowpart_subreg (SImode, operands[4], QImode),
> +      tmp = gen_reg_rtx (SImode);
> +      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> +      vec_mask = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
> +      emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
> +      vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
> +      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> +      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> +      emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
> +    }
> +  DONE;
> +})
> +
> +(define_expand "avx512fp16_fcmaddcsh_v8hf_mask3<round_expand_name>"
> +  [(match_operand:V8HF 0 "register_operand")
> +   (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
> +   (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
> +   (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
> +   (match_operand:QI 4 "register_operand")]
> +  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> +{
> +  rtx op0, op1;
> +
> +  if (<round_embedded_complex>)
> +    emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
> +               operands[0], operands[1], operands[2], operands[3],
> +               operands[4]<round_expand_operand>));
> +  else
> +    emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
> +               operands[1], operands[2], operands[3], operands[4]));
> +
> +  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> +  op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
> +  emit_insn (gen_sse_movss (op0, op1, op0));
> +  DONE;
> +})
> +
> +(define_expand "avx512fp16_fmaddcsh_v8hf_mask3<round_expand_name>"
> +  [(match_operand:V8HF 0 "register_operand")
> +   (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
> +   (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
> +   (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
> +   (match_operand:QI 4 "register_operand")]
> +  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> +{
> +  rtx op0, op1;
> +
> +  if (<round_embedded_complex>)
> +    emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
> +               operands[0], operands[1], operands[2], operands[3],
> +               operands[4]<round_expand_operand>));
> +  else
> +    emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
> +               operands[1], operands[2], operands[3], operands[4]));
> +
> +  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> +  op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
> +  emit_insn (gen_sse_movss (op0, op1, op0));
> +  DONE;
> +})
> +
>  (define_insn "avx512fp16_fma_<complexopname>sh_v8hf<mask_scalarcz_name><round_scalarcz_name>"
>    [(set (match_operand:V8HF 0 "register_operand" "=&v")
>          (vec_merge:V8HF
> diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
> index 11e62c67fcc..510e82c84f9 100644
> --- a/gcc/config/i386/subst.md
> +++ b/gcc/config/i386/subst.md
> @@ -276,6 +276,9 @@
>  (define_subst_attr "round_expand_name" "round_expand" "" "_round")
>  (define_subst_attr "round_expand_nimm_predicate" "round_expand" "nonimmediate_operand" "register_operand")
>  (define_subst_attr "round_expand_operand" "round_expand" "" ", operands[5]")
> +(define_subst_attr "round_embedded_complex" "round_expand" "0" "!(CONST_INT_P (operands[5])
> +                                                                  && (INTVAL (operands[5])
> +                                                                      == NO_ROUND))")
>
>  (define_subst "round_expand"
>    [(match_operand:SUBST_V 0)
> diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c
> index 8744aa7df55..154e7b3b107 100644
> --- a/gcc/testsuite/gcc.target/i386/avx-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx-1.c
> @@ -789,9 +789,11 @@
>  #define __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, E) __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, 8)
>  #define __builtin_ia32_vfcmaddcph512_round(A, B, C, D) __builtin_ia32_vfcmaddcph512_round(A, B, C, 8)
>  #define __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmaddcph512_round(A, B, C, D) __builtin_ia32_vfmaddcph512_round(A, B, C, 8)
>  #define __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmulcph512_round(A, B, C) __builtin_ia32_vfmulcph512_round(A, B, 8)
>  #define __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, 8)
> @@ -799,9 +801,11 @@
>  #define __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcsh_round(A, B, C, D) __builtin_ia32_vfmaddcsh_round(A, B, C, 8)
>  #define __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfcmaddcsh_round(A, B, C, D) __builtin_ia32_vfcmaddcsh_round(A, B, C, 8)
>  #define __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmulcsh_round(A, B, C) __builtin_ia32_vfmulcsh_round(A, B, 8)
>  #define __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, 8)
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> index 8bd8eebd8df..8ff2092c325 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> @@ -6,6 +6,8 @@
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
> new file mode 100644
> index 00000000000..79a295f722c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
> +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +
> +#include "avx512fp16-vfcmaddcsh-1a.c"
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> index 1e376b4a2bb..2ebe1f8ddd7 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> @@ -6,6 +6,8 @@
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
> new file mode 100644
> index 00000000000..7863f8f9af9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
> +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +
> +#include "avx512fp16-vfmaddcsh-1a.c"
> +
> diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
> index f6d54e3a815..e285c307d00 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-13.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-13.c
> @@ -806,9 +806,11 @@
>  #define __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, E) __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, 8)
>  #define __builtin_ia32_vfcmaddcph512_round(A, B, C, D) __builtin_ia32_vfcmaddcph512_round(A, B, C, 8)
>  #define __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmaddcph512_round(A, B, C, D) __builtin_ia32_vfmaddcph512_round(A, B, C, 8)
>  #define __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmulcph512_round(A, B, C) __builtin_ia32_vfmulcph512_round(A, B, 8)
>  #define __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, 8)
> @@ -816,9 +818,11 @@
>  #define __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcsh_round(A, B, C, D) __builtin_ia32_vfmaddcsh_round(A, B, C, 8)
>  #define __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfcmaddcsh_round(A, B, C, D) __builtin_ia32_vfcmaddcsh_round(A, B, C, 8)
>  #define __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmulcsh_round(A, B, C) __builtin_ia32_vfmulcsh_round(A, B, 8)
>  #define __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, 8)
> diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
> index 98251269a64..f71a7b29157 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-23.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-23.c
> @@ -807,9 +807,11 @@
>  #define __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, E) __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, 8)
>  #define __builtin_ia32_vfcmaddcph512_round(A, B, C, D) __builtin_ia32_vfcmaddcph512_round(A, B, C, 8)
>  #define __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmaddcph512_round(A, B, C, D) __builtin_ia32_vfmaddcph512_round(A, B, C, 8)
>  #define __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmulcph512_round(A, B, C) __builtin_ia32_vfmulcph512_round(A, B, 8)
>  #define __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, 8)
> @@ -817,9 +819,11 @@
>  #define __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcsh_round(A, B, C, D) __builtin_ia32_vfmaddcsh_round(A, B, C, 8)
>  #define __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfcmaddcsh_round(A, B, C, D) __builtin_ia32_vfcmaddcsh_round(A, B, C, 8)
>  #define __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, 8)
> +#define __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, 8)
>  #define __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, 8)
>  #define __builtin_ia32_vfmulcsh_round(A, B, C) __builtin_ia32_vfmulcsh_round(A, B, 8)
>  #define __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, 8)
> --
> 2.18.1
>
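[For reference, a compile-only usage sketch — my illustration, not part of
the patch; it assumes a GCC built with this series and -mavx512fp16 — of
the intrinsics that now map to the new mask/mask3 builtins.  The
_MM_FROUND_CUR_DIRECTION call exercises the NO_ROUND path that
ix86_expand_round_builtin now leaves to the expander instead of erasing
the embedded rounding:]

/* gcc -O0 -mavx512fp16 -S use.c */
#include <immintrin.h>

__m128h
use_mask (__m128h a, __mmask8 m, __m128h c, __m128h d)
{
  /* The old macro form duplicated arguments; this is now a single
     builtin call.  */
  return _mm_mask_fcmadd_sch (a, m, c, d);
}

__m128h
use_mask3_round (__m128h a, __m128h b, __m128h c, __mmask8 m)
{
  /* NO_ROUND input: the mask3 expander emits the trailing masked move
     itself rather than having the embedded rounding erased.  */
  return _mm_mask3_fmadd_round_sch (a, b, c, m, _MM_FROUND_CUR_DIRECTION);
}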
diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 29cf6792335..5e49447a020 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -6258,13 +6258,11 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) { - return (__m512h) __builtin_ia32_movaps512_mask - ((__v16sf) - __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, - (__v32hf) __C, - (__v32hf) __D, __B, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __A, __B); + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + _MM_FROUND_CUR_DIRECTION); } extern __inline __m512h @@ -6272,10 +6270,10 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask3_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D) { return (__m512h) - __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, - (__v32hf) __B, - (__v32hf) __C, - __D, _MM_FROUND_CUR_DIRECTION); + __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, _MM_FROUND_CUR_DIRECTION); } extern __inline __m512h @@ -6304,13 +6302,11 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_fmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) { - return (__m512h) __builtin_ia32_movaps512_mask - ((__v16sf) - __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, - (__v32hf) __C, - (__v32hf) __D, __B, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __A, __B); + return (__m512h) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + _MM_FROUND_CUR_DIRECTION); } extern __inline __m512h @@ -6318,10 +6314,10 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask3_fmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D) { return (__m512h) - __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, - (__v32hf) __B, - (__v32hf) __C, - __D, _MM_FROUND_CUR_DIRECTION); + __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, _MM_FROUND_CUR_DIRECTION); } extern __inline __m512h @@ -6352,13 +6348,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_fcmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D, const int __E) { - return (__m512h) __builtin_ia32_movaps512_mask - ((__v16sf) - __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, - (__v32hf) __C, - (__v32hf) __D, __B, - __E), - (__v16sf) __A, __B); + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + __E); } extern __inline __m512h @@ -6367,10 +6361,10 @@ _mm512_mask3_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D, const int __E) { return (__m512h) - __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, - (__v32hf) __B, - (__v32hf) __C, - __D, __E); + __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, __E); } extern __inline __m512h @@ -6401,13 +6395,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_fmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D, const int __E) { - return (__m512h) __builtin_ia32_movaps512_mask - ((__v16sf) - __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, - (__v32hf) __C, - (__v32hf) __D, 
__B, - __E), - (__v16sf) __A, __B); + return (__m512h) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + __E); } extern __inline __m512h @@ -6416,10 +6408,10 @@ _mm512_mask3_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D, const int __E) { return (__m512h) - __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, - (__v32hf) __B, - (__v32hf) __C, - __D, __E); + __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, __E); } extern __inline __m512h @@ -6439,18 +6431,16 @@ _mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C, (__m512h) __builtin_ia32_vfcmaddcph512_round ((A), (B), (C), (D)) #define _mm512_mask_fcmadd_round_pch(A, B, C, D, E) \ - ((__m512h) __builtin_ia32_movaps512_mask ( \ - (__v16sf) \ + ((__m512h) \ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) (A), \ (__v32hf) (C), \ (__v32hf) (D), \ - (B), (E)), \ - (__v16sf) (A), (B))); + (B), (E))) #define _mm512_mask3_fcmadd_round_pch(A, B, C, D, E) \ ((__m512h) \ - __builtin_ia32_vfcmaddcph512_mask_round ((A), (B), (C), (D), (E))) + __builtin_ia32_vfcmaddcph512_mask3_round ((A), (B), (C), (D), (E))) #define _mm512_maskz_fcmadd_round_pch(A, B, C, D, E) \ (__m512h) \ @@ -6460,17 +6450,15 @@ _mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C, (__m512h) __builtin_ia32_vfmaddcph512_round ((A), (B), (C), (D)) #define _mm512_mask_fmadd_round_pch(A, B, C, D, E) \ - ((__m512h) __builtin_ia32_movaps512_mask ( \ - (__v16sf) \ + ((__m512h) \ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) (A), \ (__v32hf) (C), \ (__v32hf) (D), \ - (B), (E)), \ - (__v16sf) (A), (B))); + (B), (E))) #define _mm512_mask3_fmadd_round_pch(A, B, C, D, E) \ (__m512h) \ - __builtin_ia32_vfmaddcph512_mask_round ((A), (B), (C), (D), (E)) + __builtin_ia32_vfmaddcph512_mask3_round ((A), (B), (C), (D), (E)) #define _mm512_maskz_fmadd_round_pch(A, B, C, D, E) \ (__m512h) \ @@ -6643,35 +6631,22 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_fcmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { -#ifdef __AVX512VL__ - return (__m128h) __builtin_ia32_movaps128_mask ( - (__v4sf) - __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, - (__v8hf) __C, - (__v8hf) __D, __B, - _MM_FROUND_CUR_DIRECTION), - (__v4sf) __A, __B); -#else - return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, - (__v4sf) + return (__m128h) __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, (__v8hf) __C, (__v8hf) __D, __B, - _MM_FROUND_CUR_DIRECTION), - (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); -#endif + _MM_FROUND_CUR_DIRECTION); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask3_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) { - return (__m128h) _mm_move_ss ((__m128) __C, - (__m128) - __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, - (__v8hf) __B, - (__v8hf) __C, __D, - _MM_FROUND_CUR_DIRECTION)); + return (__m128h) + __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D, + _MM_FROUND_CUR_DIRECTION); } extern __inline __m128h @@ -6700,35 +6675,22 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_fmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { -#ifdef __AVX512VL__ - return (__m128h) __builtin_ia32_movaps128_mask ( - (__v4sf) - __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, - (__v8hf) __C, - (__v8hf) 
__D, __B, - _MM_FROUND_CUR_DIRECTION), - (__v4sf) __A, __B); -#else - return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, - (__v4sf) + return (__m128h) __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, (__v8hf) __C, (__v8hf) __D, __B, - _MM_FROUND_CUR_DIRECTION), - (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); -#endif + _MM_FROUND_CUR_DIRECTION); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask3_fmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) { - return (__m128h) _mm_move_ss ((__m128) __C, - (__m128) - __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, - (__v8hf) __B, - (__v8hf) __C, __D, - _MM_FROUND_CUR_DIRECTION)); + return (__m128h) + __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D, + _MM_FROUND_CUR_DIRECTION); } extern __inline __m128h @@ -6759,23 +6721,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_fcmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { -#ifdef __AVX512VL__ - return (__m128h) __builtin_ia32_movaps128_mask ( - (__v4sf) - __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, - (__v8hf) __C, - (__v8hf) __D, - __B, __E), - (__v4sf) __A, __B); -#else - return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, - (__v4sf) + return (__m128h) __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, (__v8hf) __C, (__v8hf) __D, - __B, __E), - (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); -#endif + __B, __E); } extern __inline __m128h @@ -6783,12 +6733,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask3_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D, const int __E) { - return (__m128h) _mm_move_ss ((__m128) __C, - (__m128) - __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, - (__v8hf) __B, - (__v8hf) __C, - __D, __E)); + return (__m128h) + __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D, __E); } extern __inline __m128h @@ -6819,23 +6768,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_fmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { -#ifdef __AVX512VL__ - return (__m128h) __builtin_ia32_movaps128_mask ( - (__v4sf) - __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, - (__v8hf) __C, - (__v8hf) __D, - __B, __E), - (__v4sf) __A, __B); -#else - return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, - (__v4sf) + return (__m128h) __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, (__v8hf) __C, (__v8hf) __D, - __B, __E), - (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); -#endif + __B, __E); } extern __inline __m128h @@ -6843,12 +6780,11 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask3_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D, const int __E) { - return (__m128h) _mm_move_ss ((__m128) __C, - (__m128) - __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, - (__v8hf) __B, - (__v8hf) __C, - __D, __E)); + return (__m128h) + __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D, __E); } extern __inline __m128h @@ -6874,34 +6810,20 @@ _mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D) __D); } #else -#ifdef __AVX512VL__ #define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \ - ((__m128h) __builtin_ia32_movaps128_mask ( \ - (__v4sf) \ + ((__m128h) \ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ (__v8hf) (C), \ (__v8hf) (D), \ 
- (B), (E)), \ - (__v4sf) (A), (B))) + (B), (E))) -#else -#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \ - ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \ - (__v4sf) \ - __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ - (__v8hf) (C), \ - (__v8hf) (D), \ - (B), (E)), \ - (__v4sf) _mm_set_ss ((float) ((int) (B) << 31)))) -#endif #define _mm_mask3_fcmadd_round_sch(A, B, C, D, E) \ - ((__m128h) _mm_move_ss ((__m128) (C), \ - (__m128) \ - __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ - (__v8hf) (B), \ - (__v8hf) (C), \ - (D), (E)))) + ((__m128h) \ + __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) (A), \ + (__v8hf) (B), \ + (__v8hf) (C), \ + (D), (E))) #define _mm_maskz_fcmadd_round_sch(A, B, C, D, E) \ __builtin_ia32_vfcmaddcsh_maskz_round ((B), (C), (D), (A), (E)) @@ -6909,34 +6831,19 @@ _mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D) #define _mm_fcmadd_round_sch(A, B, C, D) \ __builtin_ia32_vfcmaddcsh_round ((A), (B), (C), (D)) -#ifdef __AVX512VL__ #define _mm_mask_fmadd_round_sch(A, B, C, D, E) \ - ((__m128h) __builtin_ia32_movaps128_mask ( \ - (__v4sf) \ + ((__m128h) \ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ (__v8hf) (C), \ (__v8hf) (D), \ - (B), (E)), \ - (__v4sf) (A), (B))) - -#else -#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \ - ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \ - (__v4sf) \ - __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ - (__v8hf) (C), \ - (__v8hf) (D), \ - (B), (E)), \ - (__v4sf) _mm_set_ss ((float) ((int) (B) << 31)))) -#endif + (B), (E))) #define _mm_mask3_fmadd_round_sch(A, B, C, D, E) \ - ((__m128h) _mm_move_ss ((__m128) (C), \ - (__m128) \ - __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ - (__v8hf) (B), \ - (__v8hf) (C), \ - (D), (E)))) + ((__m128h) \ + __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) (A), \ + (__v8hf) (B), \ + (__v8hf) (C), \ + (D), (E))) #define _mm_maskz_fmadd_round_sch(A, B, C, D, E) \ __builtin_ia32_vfmaddcsh_maskz_round ((B), (C), (D), (A), (E)) diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h index 3d3de964224..0b1f1cbe6ad 100644 --- a/gcc/config/i386/avx512fp16vlintrin.h +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -2898,21 +2898,20 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_fmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return (__m128h) __builtin_ia32_movaps128_mask - ((__v4sf) - __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A, - (__v8hf) __C, - (__v8hf) __D, __B), - (__v4sf) __A, __B); + return (__m128h) + __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask3_fmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) { - return (__m128h) __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A, - (__v8hf) __B, - (__v8hf) __C, __D); + return (__m128h) + __builtin_ia32_vfmaddcph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D); } extern __inline __m128h @@ -2937,21 +2936,20 @@ extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_fmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) { - return (__m256h) __builtin_ia32_movaps256_mask - ((__v8sf) + return (__m256h) __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A, (__v16hf) __C, - (__v16hf) __D, __B), - (__v8sf) __A, __B); + (__v16hf) __D, __B); } extern __inline __m256h __attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask3_fmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D) { - return (__m256h) __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A, - (__v16hf) __B, - (__v16hf) __C, __D); + return (__m256h) + __builtin_ia32_vfmaddcph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, __D); } extern __inline __m256h @@ -2976,21 +2974,20 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_fcmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return (__m128h) __builtin_ia32_movaps128_mask - ((__v4sf) + return (__m128h) __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A, (__v8hf) __C, - (__v8hf) __D, __B), - (__v4sf) __A, __B); + (__v8hf) __D, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask3_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) { - return (__m128h) __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A, - (__v8hf) __B, - (__v8hf) __C, __D); + return (__m128h) + __builtin_ia32_vfcmaddcph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D); } extern __inline __m128h @@ -3015,21 +3012,20 @@ extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_fcmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) { - return (__m256h) __builtin_ia32_movaps256_mask - ((__v8sf) + return (__m256h) __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A, (__v16hf) __C, - (__v16hf) __D, __B), - (__v8sf) __A, __B); + (__v16hf) __D, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask3_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D) { - return (__m256h) __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A, - (__v16hf) __B, - (__v16hf) __C, __D); + return (__m256h) + __builtin_ia32_vfcmaddcph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, __D); } extern __inline __m256h diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 302e1bc6502..99217d08d37 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2912,16 +2912,20 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp1 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_mask3, "__builtin_ia32_vfnmsubph128_mask3", IX86_BUILTIN_VFNMSUBPH128_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_maskz, "__builtin_ia32_vfnmsubph128_maskz", IX86_BUILTIN_VFNMSUBPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v8hf, "__builtin_ia32_vfmaddcph128", IX86_BUILTIN_VFMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask, "__builtin_ia32_vfmaddcph128_mask", IX86_BUILTIN_VFMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask1, "__builtin_ia32_vfmaddcph128_mask", IX86_BUILTIN_VFMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask, "__builtin_ia32_vfmaddcph128_mask3", IX86_BUILTIN_VFMADDCPH128_MASK3, 
UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_maskz, "__builtin_ia32_vfmaddcph128_maskz", IX86_BUILTIN_VFMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v16hf, "__builtin_ia32_vfmaddcph256", IX86_BUILTIN_VFMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask, "__builtin_ia32_vfmaddcph256_mask", IX86_BUILTIN_VFMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask1, "__builtin_ia32_vfmaddcph256_mask", IX86_BUILTIN_VFMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask, "__builtin_ia32_vfmaddcph256_mask3", IX86_BUILTIN_VFMADDCPH256_MASK3, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_maskz, "__builtin_ia32_vfmaddcph256_maskz", IX86_BUILTIN_VFMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v8hf, "__builtin_ia32_vfcmaddcph128", IX86_BUILTIN_VFCMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask, "__builtin_ia32_vfcmaddcph128_mask", IX86_BUILTIN_VFCMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask1, "__builtin_ia32_vfcmaddcph128_mask", IX86_BUILTIN_VFCMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask, "__builtin_ia32_vfcmaddcph128_mask3", IX86_BUILTIN_VFCMADDCPH128_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_maskz, "__builtin_ia32_vfcmaddcph128_maskz", IX86_BUILTIN_VFCMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v16hf, "__builtin_ia32_vfcmaddcph256", IX86_BUILTIN_VFCMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask, "__builtin_ia32_vfcmaddcph256_mask", IX86_BUILTIN_VFCMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask1, "__builtin_ia32_vfcmaddcph256_mask", IX86_BUILTIN_VFCMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask, "__builtin_ia32_vfcmaddcph256_mask3", IX86_BUILTIN_VFCMADDCPH256_MASK3, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_maskz, "__builtin_ia32_vfcmaddcph256_maskz", IX86_BUILTIN_VFCMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 
OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf, "__builtin_ia32_vfcmulcph128", IX86_BUILTIN_VFCMULCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf_mask, "__builtin_ia32_vfcmulcph128_mask", IX86_BUILTIN_VFCMULCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) @@ -3222,20 +3226,24 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_mask3_roun BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_maskz_round, "__builtin_ia32_vfnmaddsh3_maskz", IX86_BUILTIN_VFNMADDSH3_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfmsub_v8hf_mask3_round, "__builtin_ia32_vfmsubsh3_mask3", IX86_BUILTIN_VFMSUBSH3_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v32hf_round, "__builtin_ia32_vfmaddcph512_round", IX86_BUILTIN_VFMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask_round, "__builtin_ia32_vfmaddcph512_mask_round", IX86_BUILTIN_VFMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round, "__builtin_ia32_vfmaddcph512_mask_round", IX86_BUILTIN_VFMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask_round, "__builtin_ia32_vfmaddcph512_mask3_round", IX86_BUILTIN_VFMADDCPH512_MASK3_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_maskz_round, "__builtin_ia32_vfmaddcph512_maskz_round", IX86_BUILTIN_VFMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v32hf_round, "__builtin_ia32_vfcmaddcph512_round", IX86_BUILTIN_VFCMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask_round, "__builtin_ia32_vfcmaddcph512_mask_round", IX86_BUILTIN_VFCMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round, "__builtin_ia32_vfcmaddcph512_mask_round", IX86_BUILTIN_VFCMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask_round, "__builtin_ia32_vfcmaddcph512_mask3_round", IX86_BUILTIN_VFCMADDCPH512_MASK3_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_maskz_round, "__builtin_ia32_vfcmaddcph512_maskz_round", IX86_BUILTIN_VFCMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_round, "__builtin_ia32_vfcmulcph512_round", IX86_BUILTIN_VFCMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_mask_round, "__builtin_ia32_vfcmulcph512_mask_round", IX86_BUILTIN_VFCMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_round, "__builtin_ia32_vfmulcph512_round", 
IX86_BUILTIN_VFMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT)
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_mask_round, "__builtin_ia32_vfmulcph512_mask_round", IX86_BUILTIN_VFMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fcmaddcsh_v8hf_round, "__builtin_ia32_vfcmaddcsh_round", IX86_BUILTIN_VFCMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask_round, "__builtin_ia32_vfcmaddcsh_mask_round", IX86_BUILTIN_VFCMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round, "__builtin_ia32_vfcmaddcsh_mask_round", IX86_BUILTIN_VFCMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round, "__builtin_ia32_vfcmaddcsh_mask3_round", IX86_BUILTIN_VFCMADDCSH_MASK3_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfcmaddcsh_maskz_round", IX86_BUILTIN_VFCMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fmaddcsh_v8hf_round, "__builtin_ia32_vfmaddcsh_round", IX86_BUILTIN_VFMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask_round, "__builtin_ia32_vfmaddcsh_mask_round", IX86_BUILTIN_VFMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round, "__builtin_ia32_vfmaddcsh_mask_round", IX86_BUILTIN_VFMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round, "__builtin_ia32_vfmaddcsh_mask3_round", IX86_BUILTIN_VFMADDCSH_MASK3_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfmaddcsh_maskz_round", IX86_BUILTIN_VFMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_round, "__builtin_ia32_vfcmulcsh_round", IX86_BUILTIN_VFCMULCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT)
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_mask_round, "__builtin_ia32_vfcmulcsh_mask_round", IX86_BUILTIN_VFCMULCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index c0924a59efb..de4fe9ce147 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -10877,7 +10877,27 @@ ix86_expand_round_builtin (const struct builtin_description *d,
 	  /* If there is no rounding use normal version of the pattern.  */
 	  if (INTVAL (op) == NO_ROUND)
-	    redundant_embed_rnd = 1;
+	    {
+	      /* Skip erasing the embedded rounding for the expanders below,
+		 which generate multiple insns.  In
+		 ix86_erase_embedded_rounding the pattern is transformed
+		 into a single set, and emit_insn appends that set instead
+		 of inserting it into the chain, so the insns emitted
+		 inside the define_expand would be ignored.  */
+	      switch (icode)
+		{
+		case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
+		case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
+		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
+		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
+		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
+		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
+		  redundant_embed_rnd = 0;
+		  break;
+		default:
+		  redundant_embed_rnd = 1;
+		  break;
+		}
+	    }
 	}
       else
 	{
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a3c4a3f1e62..bdd34f379dd 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5864,6 +5864,34 @@
    (UNSPEC_COMPLEX_FMUL "fmulc")
    (UNSPEC_COMPLEX_FCMUL "fcmulc")])
 
+(define_mode_attr complexmove
+  [(V32HF "avx512f_loadv16sf")
+   (V16HF "avx512vl_loadv8sf")
+   (V8HF "avx512vl_loadv4sf")])
+
+(define_expand "<avx512>_fmaddc_<mode>_mask1<round_expand_name>"
+  [(match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
+   (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>")
+   (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>")
+   (match_operand:<avx512fmaskcmode> 4 "register_operand")]
+  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+  rtx op0, op1;
+  if (<round_embedded_complex>)
+    emit_insn (gen_<avx512>_fmaddc_<mode>_mask<round_expand_name> (
+      operands[0], operands[1], operands[2], operands[3],
+      operands[4]<round_expand_operand>));
+  else
+    emit_insn (gen_<avx512>_fmaddc_<mode>_mask (operands[0],
+      operands[1], operands[2], operands[3], operands[4]));
+
+  op0 = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
+  op1 = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
+  emit_insn (gen_<complexmove>_mask (op0, op0, op1, operands[4]));
+  DONE;
+})
+
 (define_expand "<avx512>_fmaddc_<mode>_maskz<round_expand_name>"
   [(match_operand:VF_AVX512FP16VL 0 "register_operand")
    (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
@@ -5878,6 +5906,31 @@
   DONE;
 })
 
+(define_expand "<avx512>_fcmaddc_<mode>_mask1<round_expand_name>"
+  [(match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
+   (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>")
+   (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>")
+   (match_operand:<avx512fmaskcmode> 4 "register_operand")]
+  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+  rtx op0, op1;
+  if (<round_embedded_complex>)
+    emit_insn (gen_<avx512>_fcmaddc_<mode>_mask<round_expand_name> (
+      operands[0], operands[1], operands[2], operands[3],
+      operands[4]<round_expand_operand>));
+  else
+    {
+      emit_insn (gen_<avx512>_fcmaddc_<mode>_mask (operands[0],
+	operands[1], operands[2], operands[3], operands[4]));
+    }
+
+  op0 = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
+  op1 = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
+  emit_insn (gen_<complexmove>_mask (op0, op0, op1, operands[4]));
+  DONE;
+})
+
 (define_expand "<avx512>_fcmaddc_<mode>_maskz<round_expand_name>"
   [(match_operand:VF_AVX512FP16VL 0 "register_operand")
    (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
    (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>")
@@ -5946,6 +5999,47 @@
   DONE;
 })
 
+(define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
+  [(match_operand:V8HF 0 "register_operand")
+   (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
+   (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
+   (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
+   (match_operand:QI 4 "register_operand")]
+  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+  rtx op0, op1;
+
+  if (<round_embedded_complex>)
+    emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
+      operands[0], operands[1], operands[2], operands[3],
+      operands[4]<round_expand_operand>));
+  else
+    emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
+      operands[1], operands[2], operands[3], operands[4]));
+
+  if (TARGET_AVX512VL)
+    {
+      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+      emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
+    }
+  else
+    {
+      rtx mask, tmp, vec_mask;
+      mask = lowpart_subreg (SImode, operands[4], QImode),
+      tmp = gen_reg_rtx (SImode);
+      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
+      vec_mask = gen_reg_rtx (V4SImode);
+      emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
+      emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
+      vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
+      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+      emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
+    }
+  DONE;
+})
+
 (define_expand "avx512fp16_fcmaddcsh_v8hf_maskz<round_expand_name>"
   [(match_operand:V8HF 0 "register_operand")
    (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
@@ -5960,6 +6054,95 @@
   DONE;
 })
 
+(define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
+  [(match_operand:V8HF 0 "register_operand")
+   (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
+   (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
+   (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
+   (match_operand:QI 4 "register_operand")]
+  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+  rtx op0, op1;
+
+  if (<round_embedded_complex>)
+    emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
+      operands[0], operands[1], operands[2], operands[3],
+      operands[4]<round_expand_operand>));
+  else
+    emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
+      operands[1], operands[2], operands[3], operands[4]));
+
+  if (TARGET_AVX512VL)
+    {
+      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+      emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
+    }
+  else
+    {
+      rtx mask, tmp, vec_mask;
+      mask = lowpart_subreg (SImode, operands[4], QImode),
+      tmp = gen_reg_rtx (SImode);
+      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
+      vec_mask = gen_reg_rtx (V4SImode);
+      emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
+      emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
+      vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
+      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+      emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
+    }
+  DONE;
+})
+
+(define_expand "avx512fp16_fcmaddcsh_v8hf_mask3<round_expand_name>"
+  [(match_operand:V8HF 0 "register_operand")
+   (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
+   (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
+   (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
+   (match_operand:QI 4 "register_operand")]
+  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+  rtx op0, op1;
+
+  if (<round_embedded_complex>)
+    emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
+      operands[0], operands[1], operands[2], operands[3],
+      operands[4]<round_expand_operand>));
+  else
+    emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
+      operands[1], operands[2], operands[3], operands[4]));
+
+  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+  op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
+  emit_insn (gen_sse_movss (op0, op1, op0));
+  DONE;
+})
+
+(define_expand "avx512fp16_fmaddcsh_v8hf_mask3<round_expand_name>"
+  [(match_operand:V8HF 0 "register_operand")
+   (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
+   (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
+   (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
+   (match_operand:QI 4 "register_operand")]
+  "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+  rtx op0, op1;
+
+  if (<round_embedded_complex>)
+    emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
+      operands[0], operands[1], operands[2], operands[3],
+      operands[4]<round_expand_operand>));
+  else
+    emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
+      operands[1], operands[2], operands[3], operands[4]));
+
+  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+  op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
+  emit_insn (gen_sse_movss (op0, op1, op0));
+  DONE;
+})
+
 (define_insn "avx512fp16_fma_<complexopname>sh_v8hf<mask_scalarcz_name><round_scalarcz_name>"
   [(set (match_operand:V8HF 0 "register_operand" "=&v")
 	(vec_merge:V8HF
diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
index 11e62c67fcc..510e82c84f9 100644
--- a/gcc/config/i386/subst.md
+++ b/gcc/config/i386/subst.md
@@ -276,6 +276,9 @@
 (define_subst_attr "round_expand_name" "round_expand" "" "_round")
 (define_subst_attr "round_expand_nimm_predicate" "round_expand" "nonimmediate_operand" "register_operand")
 (define_subst_attr "round_expand_operand" "round_expand" "" ", operands[5]")
+(define_subst_attr "round_embedded_complex" "round_expand" "0" "!(CONST_INT_P (operands[5])
+							 && (INTVAL (operands[5])
+							     == NO_ROUND))")
 
 (define_subst "round_expand"
   [(match_operand:SUBST_V 0)
diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c
index 8744aa7df55..154e7b3b107 100644
--- a/gcc/testsuite/gcc.target/i386/avx-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx-1.c
@@ -789,9 +789,11 @@
 #define __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, E) __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, 8)
 #define __builtin_ia32_vfcmaddcph512_round(A, B, C, D) __builtin_ia32_vfcmaddcph512_round(A, B, C, 8)
 #define __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, 8)
+#define __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, 8)
 #define __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, 8)
 #define __builtin_ia32_vfmaddcph512_round(A, B, C, D) __builtin_ia32_vfmaddcph512_round(A, B, C, 8)
 #define __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, 8)
+#define __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, 8)
 #define __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, 8)
 #define __builtin_ia32_vfmulcph512_round(A, B, C) __builtin_ia32_vfmulcph512_round(A, B, 8)
 #define __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, 8)
@@ -799,9 +801,11 @@
 #define __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, E) 
__builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, 8) #define __builtin_ia32_vfmaddcsh_round(A, B, C, D) __builtin_ia32_vfmaddcsh_round(A, B, C, 8) #define __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfcmaddcsh_round(A, B, C, D) __builtin_ia32_vfcmaddcsh_round(A, B, C, 8) #define __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfmulcsh_round(A, B, C) __builtin_ia32_vfmulcsh_round(A, B, 8) #define __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, 8) diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c index 8bd8eebd8df..8ff2092c325 100644 --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c @@ -6,6 +6,8 @@ /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c new file mode 100644 index 00000000000..79a295f722c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ + +#include "avx512fp16-vfcmaddcsh-1a.c" + diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c index 1e376b4a2bb..2ebe1f8ddd7 100644 --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c @@ -6,6 +6,8 @@ /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c new file mode 100644 index 00000000000..7863f8f9af9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ + +#include "avx512fp16-vfmaddcsh-1a.c" + diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c index f6d54e3a815..e285c307d00 100644 --- 
a/gcc/testsuite/gcc.target/i386/sse-13.c +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -806,9 +806,11 @@ #define __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, E) __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, 8) #define __builtin_ia32_vfcmaddcph512_round(A, B, C, D) __builtin_ia32_vfcmaddcph512_round(A, B, C, 8) #define __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfmaddcph512_round(A, B, C, D) __builtin_ia32_vfmaddcph512_round(A, B, C, 8) #define __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfmulcph512_round(A, B, C) __builtin_ia32_vfmulcph512_round(A, B, 8) #define __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, 8) @@ -816,9 +818,11 @@ #define __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, 8) #define __builtin_ia32_vfmaddcsh_round(A, B, C, D) __builtin_ia32_vfmaddcsh_round(A, B, C, 8) #define __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfcmaddcsh_round(A, B, C, D) __builtin_ia32_vfcmaddcsh_round(A, B, C, 8) #define __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfmulcsh_round(A, B, C) __builtin_ia32_vfmulcsh_round(A, B, 8) #define __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, 8) diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c index 98251269a64..f71a7b29157 100644 --- a/gcc/testsuite/gcc.target/i386/sse-23.c +++ b/gcc/testsuite/gcc.target/i386/sse-23.c @@ -807,9 +807,11 @@ #define __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, E) __builtin_ia32_vfnmsubsh3_maskz(A, B, C, D, 8) #define __builtin_ia32_vfcmaddcph512_round(A, B, C, D) __builtin_ia32_vfcmaddcph512_round(A, B, C, 8) #define __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcph512_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcph512_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfmaddcph512_round(A, B, C, D) __builtin_ia32_vfmaddcph512_round(A, B, C, 8) #define __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask_round(A, C, D, B, 8) 
+#define __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcph512_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcph512_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfmulcph512_round(A, B, C) __builtin_ia32_vfmulcph512_round(A, B, 8) #define __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcph512_mask_round(A, C, D, B, 8) @@ -817,9 +819,11 @@ #define __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, E) __builtin_ia32_vfcmulcph512_mask_round(A, C, D, B, 8) #define __builtin_ia32_vfmaddcsh_round(A, B, C, D) __builtin_ia32_vfmaddcsh_round(A, B, C, 8) #define __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfmaddcsh_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfmaddcsh_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfcmaddcsh_round(A, B, C, D) __builtin_ia32_vfcmaddcsh_round(A, B, C, 8) #define __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask_round(A, C, D, B, 8) +#define __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, E) __builtin_ia32_vfcmaddcsh_mask3_round(A, C, D, B, 8) #define __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, E) __builtin_ia32_vfcmaddcsh_maskz_round(B, C, D, A, 8) #define __builtin_ia32_vfmulcsh_round(A, B, C) __builtin_ia32_vfmulcsh_round(A, B, 8) #define __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, E) __builtin_ia32_vfmulcsh_mask_round(A, C, D, B, 8)
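As a usage sketch of what the new -1c tests exercise (the function names below are illustrative, not from the testsuite): compiled with -O2 -mavx512fp16 -mavx512vl, each masked scalar intrinsic should expand into a masked vfcmaddcsh plus one separate merge instruction, vblendmps for the mask form and vmovss for the mask3 form, which is what the scan-assembler-times counts above check; without -mavx512vl the mask form instead falls back to the vblendvps sequence added to the -1a scans.

#include <immintrin.h>

/* mask form: the low element and the upper bits merge with A where
   the mask bit is clear; expect vfcmaddcsh ... {%k1} + vblendmps.  */
__m128h
do_mask (__m128h A, __mmask8 U, __m128h B, __m128h C)
{
  return _mm_mask_fcmadd_sch (A, U, B, C);
}

/* mask3 form: the merge source is C instead; expect
   vfcmaddcsh ... {%k1} + vmovss.  */
__m128h
do_mask3 (__m128h A, __m128h B, __m128h C, __mmask8 U)
{
  return _mm_mask3_fcmadd_sch (A, B, C, U);
}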