@@ -45,6 +45,178 @@ typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__));
typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__));
typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__));
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5,
+ _Float16 __A4, _Float16 __A3, _Float16 __A2,
+ _Float16 __A1, _Float16 __A0)
+{
+ return __extension__ (__m128h)(__v8hf){ __A0, __A1, __A2, __A3,
+ __A4, __A5, __A6, __A7 };
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_ph (_Float16 __A15, _Float16 __A14, _Float16 __A13,
+ _Float16 __A12, _Float16 __A11, _Float16 __A10,
+ _Float16 __A9, _Float16 __A8, _Float16 __A7,
+ _Float16 __A6, _Float16 __A5, _Float16 __A4,
+ _Float16 __A3, _Float16 __A2, _Float16 __A1,
+ _Float16 __A0)
+{
+ return __extension__ (__m256h)(__v16hf){ __A0, __A1, __A2, __A3,
+ __A4, __A5, __A6, __A7,
+ __A8, __A9, __A10, __A11,
+ __A12, __A13, __A14, __A15 };
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_ph (_Float16 __A31, _Float16 __A30, _Float16 __A29,
+ _Float16 __A28, _Float16 __A27, _Float16 __A26,
+ _Float16 __A25, _Float16 __A24, _Float16 __A23,
+ _Float16 __A22, _Float16 __A21, _Float16 __A20,
+ _Float16 __A19, _Float16 __A18, _Float16 __A17,
+ _Float16 __A16, _Float16 __A15, _Float16 __A14,
+ _Float16 __A13, _Float16 __A12, _Float16 __A11,
+ _Float16 __A10, _Float16 __A9, _Float16 __A8,
+ _Float16 __A7, _Float16 __A6, _Float16 __A5,
+ _Float16 __A4, _Float16 __A3, _Float16 __A2,
+ _Float16 __A1, _Float16 __A0)
+{
+ return __extension__ (__m512h)(__v32hf){ __A0, __A1, __A2, __A3,
+ __A4, __A5, __A6, __A7,
+ __A8, __A9, __A10, __A11,
+ __A12, __A13, __A14, __A15,
+ __A16, __A17, __A18, __A19,
+ __A20, __A21, __A22, __A23,
+ __A24, __A25, __A26, __A27,
+ __A28, __A29, __A30, __A31 };
+}
+
+/* Create vectors with elements in the reverse order of the _mm_set_ph,
+ _mm256_set_ph and _mm512_set_ph functions. */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+ _Float16 __A3, _Float16 __A4, _Float16 __A5,
+ _Float16 __A6, _Float16 __A7)
+{
+ return _mm_set_ph (__A7, __A6, __A5, __A4, __A3, __A2, __A1, __A0);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+ _Float16 __A3, _Float16 __A4, _Float16 __A5,
+ _Float16 __A6, _Float16 __A7, _Float16 __A8,
+ _Float16 __A9, _Float16 __A10, _Float16 __A11,
+ _Float16 __A12, _Float16 __A13, _Float16 __A14,
+ _Float16 __A15)
+{
+ return _mm256_set_ph (__A15, __A14, __A13, __A12, __A11, __A10, __A9,
+ __A8, __A7, __A6, __A5, __A4, __A3, __A2, __A1,
+ __A0);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+ _Float16 __A3, _Float16 __A4, _Float16 __A5,
+ _Float16 __A6, _Float16 __A7, _Float16 __A8,
+ _Float16 __A9, _Float16 __A10, _Float16 __A11,
+ _Float16 __A12, _Float16 __A13, _Float16 __A14,
+ _Float16 __A15, _Float16 __A16, _Float16 __A17,
+ _Float16 __A18, _Float16 __A19, _Float16 __A20,
+ _Float16 __A21, _Float16 __A22, _Float16 __A23,
+ _Float16 __A24, _Float16 __A25, _Float16 __A26,
+ _Float16 __A27, _Float16 __A28, _Float16 __A29,
+ _Float16 __A30, _Float16 __A31)
+
+{
+ return _mm512_set_ph (__A31, __A30, __A29, __A28, __A27, __A26, __A25,
+ __A24, __A23, __A22, __A21, __A20, __A19, __A18,
+ __A17, __A16, __A15, __A14, __A13, __A12, __A11,
+ __A10, __A9, __A8, __A7, __A6, __A5, __A4, __A3,
+ __A2, __A1, __A0);
+}
+
+/* Broadcast _Float16 to vector. */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ph (_Float16 __A)
+{
+ return _mm_set_ph (__A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_ph (_Float16 __A)
+{
+ return _mm256_set_ph (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_ph (_Float16 __A)
+{
+ return _mm512_set_ph (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+/* Create a vector with all zeros. */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ph (void)
+{
+ return _mm_set1_ph (0.0f);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_ph (void)
+{
+ return _mm256_set1_ph (0.0f);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_ph (void)
+{
+ return _mm512_set1_ph (0.0f);
+}
+
+/* Create a vector with element 0 as __F and the rest zero. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sh (_Float16 __F)
+{
+ return _mm_set_ph (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, __F);
+}
+
+/* Create a vector with element 0 as the _Float16 at __P, the rest zero. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sh (void const *__P)
+{
+ return _mm_set_ph (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ *(_Float16 const *) __P);
+}
+
+/* Store the lowest _Float16 element (element 0) of __A to *__P. */
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sh (void *__P, __m128h __A)
+{
+ *(_Float16 *) __P = ((__v8hf)__A)[0];
+}
+
#ifdef __DISABLE_AVX512FP16__
#undef __DISABLE_AVX512FP16__
#pragma GCC pop_options
@@ -13914,6 +13914,11 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
}
return true;
+ case E_V8HFmode:
+ case E_V16HFmode:
+ case E_V32HFmode:
+ return ix86_vector_duplicate_value (mode, target, val);
+
default:
return false;
}
@@ -13998,6 +14003,18 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
gen_vec_set_0 = gen_vec_setv8di_0;
break;
+ case E_V8HFmode:
+ use_vector_set = TARGET_AVX512FP16 && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv8hf_0;
+ break;
+ case E_V16HFmode:
+ use_vector_set = TARGET_AVX512FP16 && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv16hf_0;
+ break;
+ case E_V32HFmode:
+ use_vector_set = TARGET_AVX512FP16 && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv32hf_0;
+ break;
default:
break;
}
@@ -14147,6 +14164,7 @@ ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
if (!TARGET_64BIT)
return false;
/* FALLTHRU */
+ case E_V8HFmode:
case E_V4DFmode:
case E_V8SFmode:
case E_V8SImode:
@@ -14381,13 +14399,22 @@ ix86_expand_vector_init_interleave (machine_mode mode,
{
machine_mode first_imode, second_imode, third_imode, inner_mode;
int i, j;
- rtx op0, op1;
+ rtx op, op0, op1;
rtx (*gen_load_even) (rtx, rtx, rtx);
rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
switch (mode)
{
+ case E_V8HFmode:
+ gen_load_even = gen_vec_setv8hf;
+ gen_interleave_first_low = gen_vec_interleave_lowv4si;
+ gen_interleave_second_low = gen_vec_interleave_lowv2di;
+ inner_mode = HFmode;
+ first_imode = V4SImode;
+ second_imode = V2DImode;
+ third_imode = VOIDmode;
+ break;
case E_V8HImode:
gen_load_even = gen_vec_setv8hi;
gen_interleave_first_low = gen_vec_interleave_lowv4si;
@@ -14412,9 +14439,19 @@ ix86_expand_vector_init_interleave (machine_mode mode,
for (i = 0; i < n; i++)
{
+ op = ops [i + i];
+ if (inner_mode == HFmode)
+ {
+ /* Reinterpret the HFmode element as HImode through a SUBREG and
+ copy it into a fresh HImode pseudo for the SImode lowpart below. */
+ op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0);
+ op = gen_reg_rtx (HImode);
+ emit_move_insn (op, op1);
+ }
+
/* Extend the odd elment to SImode using a paradoxical SUBREG. */
op0 = gen_reg_rtx (SImode);
- emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
+ emit_move_insn (op0, gen_lowpart (SImode, op));
/* Insert the SImode value as low element of V4SImode vector. */
op1 = gen_reg_rtx (V4SImode);
@@ -14551,6 +14588,10 @@ ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
half_mode = V8HImode;
goto half;
+ case E_V16HFmode:
+ half_mode = V8HFmode;
+ goto half;
+
half:
n = GET_MODE_NUNITS (mode);
for (i = 0; i < n; i++)
@@ -14574,6 +14615,11 @@ half:
half_mode = V16HImode;
goto quarter;
+ case E_V32HFmode:
+ quarter_mode = V8HFmode;
+ half_mode = V16HFmode;
+ goto quarter;
+
quarter:
n = GET_MODE_NUNITS (mode);
for (i = 0; i < n; i++)
@@ -14610,6 +14656,9 @@ quarter:
move from GPR to SSE register directly. */
if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
break;
+ /* FALLTHRU */
+
+ case E_V8HFmode:
n = GET_MODE_NUNITS (mode);
for (i = 0; i < n; i++)
@@ -15076,6 +15125,10 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
}
return;
+ case E_V8HFmode:
+ use_vec_merge = true;
+ break;
+
case E_V8HImode:
case E_V2HImode:
use_vec_merge = TARGET_SSE2;
@@ -15550,6 +15603,28 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
ix86_expand_vector_extract (false, target, tmp, elt & 3);
return;
+ case E_V32HFmode:
+ tmp = gen_reg_rtx (V16HFmode);
+ if (elt < 16)
+ emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 15);
+ return;
+
+ case E_V16HFmode:
+ tmp = gen_reg_rtx (V8HFmode);
+ if (elt < 8)
+ emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 7);
+ return;
+
+ case E_V8HFmode:
+ use_vec_extr = true;
+ break;
+
case E_V8QImode:
use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
/* ??? Could extract the appropriate HImode element and shift. */
@@ -84,12 +84,12 @@ VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */
VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI */
VECTOR_MODES (INT, 64); /* V64QI V32HI V16SI V8DI */
VECTOR_MODES (INT, 128); /* V128QI V64HI V32SI V16DI */
-VECTOR_MODES (FLOAT, 8); /* V2SF */
-VECTOR_MODES (FLOAT, 16); /* V4SF V2DF */
-VECTOR_MODES (FLOAT, 32); /* V8SF V4DF V2TF */
-VECTOR_MODES (FLOAT, 64); /* V16SF V8DF V4TF */
-VECTOR_MODES (FLOAT, 128); /* V32SF V16DF V8TF */
-VECTOR_MODES (FLOAT, 256); /* V64SF V32DF V16TF */
+VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */
+VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
+VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */
+VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */
+VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */
+VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */
VECTOR_MODE (INT, TI, 1); /* V1TI */
VECTOR_MODE (INT, DI, 1); /* V1DI */
VECTOR_MODE (INT, SI, 1); /* V1SI */
@@ -2404,6 +2404,7 @@ classify_argument (machine_mode mode, const_tree type,
case E_V8SFmode:
case E_V8SImode:
case E_V32QImode:
+ case E_V16HFmode:
case E_V16HImode:
case E_V4DFmode:
case E_V4DImode:
@@ -2414,6 +2415,7 @@ classify_argument (machine_mode mode, const_tree type,
return 4;
case E_V8DFmode:
case E_V16SFmode:
+ case E_V32HFmode:
case E_V8DImode:
case E_V16SImode:
case E_V32HImode:
@@ -2431,6 +2433,7 @@ classify_argument (machine_mode mode, const_tree type,
case E_V4SImode:
case E_V16QImode:
case E_V8HImode:
+ case E_V8HFmode:
case E_V2DFmode:
case E_V2DImode:
classes[0] = X86_64_SSE_CLASS;
@@ -19102,9 +19105,11 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
if (!TARGET_SSE2)
return true;
- /* Between SSE and general, we have moves no larger than word size. */
+ /* Between SSE and general, we have moves no larger than word size
+ except with AVX512FP16, where VMOVW enables 16-bit moves. */
if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2))
- || GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode)
+ || GET_MODE_SIZE (mode) < GET_MODE_SIZE (TARGET_AVX512FP16
+ ? HImode : SImode)
|| GET_MODE_SIZE (mode) > UNITS_PER_WORD)
return true;
@@ -19552,6 +19557,14 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
|| VALID_AVX512F_SCALAR_MODE (mode)))
return true;
+ /* Allow HF vector modes for AVX512FP16. NB: Since HF vector
+ moves are implemented as integer vector moves, we allow
+ V8HFmode and V16HFmode without AVX512VL in xmm0-xmm15. */
+ if (TARGET_AVX512FP16 && VALID_AVX512FP16_REG_MODE (mode))
+ return (mode == V32HFmode
+ || TARGET_AVX512VL
+ || !EXT_REX_SSE_REGNO_P (regno));
+
/* For AVX-5124FMAPS or AVX-5124VNNIW
allow V64SF and V64SI modes for special regnos. */
if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
@@ -21663,6 +21676,8 @@ ix86_vector_mode_supported_p (machine_mode mode)
if ((TARGET_MMX || TARGET_MMX_WITH_SSE)
&& VALID_MMX_REG_MODE (mode))
return true;
+ if (TARGET_AVX512FP16 && VALID_AVX512FP16_REG_MODE (mode))
+ return true;
if ((TARGET_3DNOW || TARGET_MMX_WITH_SSE)
&& VALID_MMX_REG_MODE_3DNOW (mode))
return true;
@@ -496,8 +496,8 @@ (define_attr "type"
;; Main data type used by the insn
(define_attr "mode"
- "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V16SF,V8SF,V4DF,V4SF,
- V2DF,V2SF,V1DF,V8DF"
+ "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
+ V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
(const_string "unknown"))
;; The CPU unit operations uses.
@@ -1098,7 +1098,8 @@ (define_mode_attr MODE_SIZE [(QI "1") (HI "2") (SI "4") (DI "8")
(V2DI "16") (V4DI "32") (V8DI "64")
(V1TI "16") (V2TI "32") (V4TI "64")
(V2DF "16") (V4DF "32") (V8DF "64")
- (V4SF "16") (V8SF "32") (V16SF "64")])
+ (V4SF "16") (V8SF "32") (V16SF "64")
+ (V8HF "16") (V16HF "32") (V32HF "64")])
;; Double word integer modes as mode attribute.
(define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
@@ -1239,9 +1240,9 @@ (define_mode_attr ssevecmodef [(SF "V4SF") (DF "V2DF") (TF "TF")])
;; SSE instruction suffix for various modes
(define_mode_attr ssemodesuffix
[(HF "sh") (SF "ss") (DF "sd")
- (V16SF "ps") (V8DF "pd")
- (V8SF "ps") (V4DF "pd")
- (V4SF "ps") (V2DF "pd")
+ (V32HF "ph") (V16SF "ps") (V8DF "pd")
+ (V16HF "ph") (V8SF "ps") (V4DF "pd")
+ (V8HF "ph") (V4SF "ps") (V2DF "pd")
(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")
(V32QI "b") (V16HI "w") (V8SI "d") (V4DI "q")
(V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q")])
@@ -225,6 +225,8 @@ (define_mode_iterator VMOVE
(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI
(V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX") V1TI
+ (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16")
+ (V8HF "TARGET_AVX512FP16")
(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
@@ -240,6 +242,13 @@ (define_mode_iterator VI12_AVX512VL
[V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+(define_mode_iterator VI12HF_AVX512VL
+ [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
+ V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")
+ (V32HF "TARGET_AVX512FP16")
+ (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")])
+
;; Same iterator, but without supposed TARGET_AVX512BW
(define_mode_iterator VI12_AVX512VLBW
[(V64QI "TARGET_AVX512BW") (V16QI "TARGET_AVX512VL")
@@ -255,6 +264,7 @@ (define_mode_iterator V
(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI
+ (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") V8HF
(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
@@ -277,7 +287,8 @@ (define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF])
(define_mode_iterator V_256_512
[V32QI V16HI V8SI V4DI V8SF V4DF
(V64QI "TARGET_AVX512F") (V32HI "TARGET_AVX512F") (V16SI "TARGET_AVX512F")
- (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+ (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+ (V16HF "TARGET_AVX512FP16") (V32HF "TARGET_AVX512FP16")])
;; All vector float modes
(define_mode_iterator VF
@@ -352,6 +363,9 @@ (define_mode_iterator VF2_AVX512VL
(define_mode_iterator VF1_AVX512VL
[V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
+(define_mode_iterator VF_AVX512FP16
+ [V32HF V16HF V8HF])
+
;; All vector integer modes
(define_mode_iterator VI
[(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
@@ -360,6 +374,16 @@ (define_mode_iterator VI
(V8SI "TARGET_AVX") V4SI
(V4DI "TARGET_AVX") V2DI])
+;; All vector integer and HF modes
+(define_mode_iterator VIHF
+ [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+ (V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX") V16QI
+ (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX") V8HI
+ (V8SI "TARGET_AVX") V4SI
+ (V4DI "TARGET_AVX") V2DI
+ (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16")
+ (V8HF "TARGET_AVX512FP16")])
+
(define_mode_iterator VI_AVX2
[(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI
(V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI
@@ -562,6 +586,7 @@ (define_mode_attr avx512
(V8HI "avx512vl") (V16HI "avx512vl") (V32HI "avx512bw")
(V4SI "avx512vl") (V8SI "avx512vl") (V16SI "avx512f")
(V2DI "avx512vl") (V4DI "avx512vl") (V8DI "avx512f")
+ (V8HF "avx512fp16") (V16HF "avx512vl") (V32HF "avx512bw")
(V4SF "avx512vl") (V8SF "avx512vl") (V16SF "avx512f")
(V2DF "avx512vl") (V4DF "avx512vl") (V8DF "avx512f")])
@@ -622,12 +647,13 @@ (define_mode_attr avx2_avx512
(V8HI "avx512vl") (V16HI "avx512vl") (V32HI "avx512bw")])
(define_mode_attr shuffletype
- [(V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i")
- (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i")
- (V4SF "f") (V4SI "i") (V2DF "f") (V2DI "i")
- (V32HI "i") (V16HI "i") (V8HI "i")
- (V64QI "i") (V32QI "i") (V16QI "i")
- (V4TI "i") (V2TI "i") (V1TI "i")])
+ [(V32HF "f") (V16HF "f") (V8HF "f")
+ (V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i")
+ (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i")
+ (V4SF "f") (V4SI "i") (V2DF "f") (V2DI "i")
+ (V32HI "i") (V16HI "i") (V8HI "i")
+ (V64QI "i") (V32QI "i") (V16QI "i")
+ (V4TI "i") (V2TI "i") (V1TI "i")])
(define_mode_attr ssequartermode
[(V16SF "V4SF") (V8DF "V2DF") (V16SI "V4SI") (V8DI "V2DI")])
@@ -664,6 +690,8 @@ (define_mode_iterator VI_256 [V32QI V16HI V8SI V4DI])
;; All 128 and 256bit vector integer modes
(define_mode_iterator VI_128_256 [V16QI V8HI V4SI V2DI V32QI V16HI V8SI V4DI])
+;; All 256bit vector integer and HF modes
+(define_mode_iterator VIHF_256 [V32QI V16HI V8SI V4DI V16HF])
;; Various 128bit vector integer mode combinations
(define_mode_iterator VI12_128 [V16QI V8HI])
@@ -685,6 +713,9 @@ (define_mode_iterator VI48_512 [V16SI V8DI])
(define_mode_iterator VI4_256_8_512 [V8SI V8DI])
(define_mode_iterator VI_AVX512BW
[V16SI V8DI (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")])
+(define_mode_iterator VIHF_AVX512BW
+ [V16SI V8DI (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")
+ (V32HF "TARGET_AVX512FP16")])
;; Int-float size matches
(define_mode_iterator VI4F_128 [V4SI V4SF])
@@ -725,6 +756,9 @@ (define_mode_iterator VF_AVX512
(V8SF "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")
V16SF V8DF])
+(define_mode_iterator V16_256 [V16HI V16HF])
+(define_mode_iterator V32_512 [V32HI V32HF])
+
(define_mode_attr avx512bcst
[(V4SI "%{1to4%}") (V2DI "%{1to2%}")
(V8SI "%{1to8%}") (V4DI "%{1to4%}")
@@ -774,8 +808,16 @@ (define_mode_attr sseinsnmode
(V16SF "V16SF") (V8DF "V8DF")
(V8SF "V8SF") (V4DF "V4DF")
(V4SF "V4SF") (V2DF "V2DF")
+ (V8HF "TI") (V16HF "OI") (V32HF "XI")
(TI "TI")])
+;; SSE integer instruction suffix for various modes
+(define_mode_attr sseintmodesuffix
+ [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")
+ (V32QI "b") (V16HI "w") (V8SI "d") (V4DI "q")
+ (V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q")
+ (V8HF "w") (V16HF "w") (V32HF "w")])
+
;; Mapping of vector modes to corresponding mask size
(define_mode_attr avx512fmaskmode
[(V64QI "DI") (V32QI "SI") (V16QI "HI")
@@ -835,7 +877,8 @@ (define_mode_attr ssedoublevecmode
(V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI")
(V16SF "V32SF") (V8DF "V16DF")
(V8SF "V16SF") (V4DF "V8DF")
- (V4SF "V8SF") (V2DF "V4DF")])
+ (V4SF "V8SF") (V2DF "V4DF")
+ (V32HF "V64HF") (V16HF "V32HF") (V8HF "V16HF")])
;; Mapping of vector modes to a vector mode of half size
;; instead of V1DI/V1DF, DI/DF are used for V2DI/V2DF although they are scalar.
@@ -845,7 +888,8 @@ (define_mode_attr ssehalfvecmode
(V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI")
(V16SF "V8SF") (V8DF "V4DF")
(V8SF "V4SF") (V4DF "V2DF")
- (V4SF "V2SF") (V2DF "DF")])
+ (V4SF "V2SF") (V2DF "DF")
+ (V32HF "V16HF") (V16HF "V8HF") (V8HF "V4HF")])
(define_mode_attr ssehalfvecmodelower
[(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti")
@@ -853,9 +897,10 @@ (define_mode_attr ssehalfvecmodelower
(V16QI "v8qi") (V8HI "v4hi") (V4SI "v2si")
(V16SF "v8sf") (V8DF "v4df")
(V8SF "v4sf") (V4DF "v2df")
- (V4SF "v2sf")])
+ (V4SF "v2sf")
+ (V32HF "v16hf") (V16HF "v8hf") (V8HF "v4hf")])
-;; Mapping of vector modes ti packed single mode of the same size
+;; Mapping of vector modes to packed single mode of the same size
(define_mode_attr ssePSmode
[(V16SI "V16SF") (V8DF "V16SF")
(V16SF "V16SF") (V8DI "V16SF")
@@ -865,7 +910,8 @@ (define_mode_attr ssePSmode
(V4DI "V8SF") (V2DI "V4SF")
(V4TI "V16SF") (V2TI "V8SF") (V1TI "V4SF")
(V8SF "V8SF") (V4SF "V4SF")
- (V4DF "V8SF") (V2DF "V4SF")])
+ (V4DF "V8SF") (V2DF "V4SF")
+ (V32HF "V16SF") (V16HF "V8SF") (V8HF "V4SF")])
(define_mode_attr ssePSmode2
[(V8DI "V8SF") (V4DI "V4SF")])
@@ -887,6 +933,7 @@ (define_mode_attr ssescalarmodelower
(V32HI "hi") (V16HI "hi") (V8HI "hi")
(V16SI "si") (V8SI "si") (V4SI "si")
(V8DI "di") (V4DI "di") (V2DI "di")
+ (V32HF "hf") (V16HF "hf") (V8HF "hf")
(V16SF "sf") (V8SF "sf") (V4SF "sf")
(V8DF "df") (V4DF "df") (V2DF "df")
(V4TI "ti") (V2TI "ti")])
@@ -897,6 +944,7 @@ (define_mode_attr ssexmmmode
(V32HI "V8HI") (V16HI "V8HI") (V8HI "V8HI")
(V16SI "V4SI") (V8SI "V4SI") (V4SI "V4SI")
(V8DI "V2DI") (V4DI "V2DI") (V2DI "V2DI")
+ (V32HF "V8HF") (V16HF "V8HF") (V8HF "V8HF")
(V16SF "V4SF") (V8SF "V4SF") (V4SF "V4SF")
(V8DF "V2DF") (V4DF "V2DF") (V2DF "V2DF")])
@@ -939,10 +987,11 @@ (define_mode_attr ssescalarsize
(V64QI "8") (V32QI "8") (V16QI "8")
(V32HI "16") (V16HI "16") (V8HI "16")
(V16SI "32") (V8SI "32") (V4SI "32")
+ (V32HF "16") (V16HF "16") (V8HF "16")
(V16SF "32") (V8SF "32") (V4SF "32")
(V8DF "64") (V4DF "64") (V2DF "64")])
-;; SSE prefix for integer vector modes
+;; SSE prefix for integer and HF vector modes
(define_mode_attr sseintprefix
[(V2DI "p") (V2DF "")
(V4DI "p") (V4DF "")
@@ -950,9 +999,9 @@ (define_mode_attr sseintprefix
(V4SI "p") (V4SF "")
(V8SI "p") (V8SF "")
(V16SI "p") (V16SF "")
- (V16QI "p") (V8HI "p")
- (V32QI "p") (V16HI "p")
- (V64QI "p") (V32HI "p")])
+ (V16QI "p") (V8HI "p") (V8HF "p")
+ (V32QI "p") (V16HI "p") (V16HF "p")
+ (V64QI "p") (V32HI "p") (V32HF "p")])
;; SSE scalar suffix for vector modes
(define_mode_attr ssescalarmodesuffix
@@ -987,7 +1036,8 @@ (define_mode_attr castmode
;; i128 for integer vectors and TARGET_AVX2, f128 otherwise.
;; i64x4 or f64x4 for 512bit modes.
(define_mode_attr i128
- [(V16SF "f64x4") (V8SF "f128") (V8DF "f64x4") (V4DF "f128")
+ [(V16HF "%~128") (V32HF "i64x4") (V16SF "f64x4") (V8SF "f128")
+ (V8DF "f64x4") (V4DF "f128")
(V64QI "i64x4") (V32QI "%~128") (V32HI "i64x4") (V16HI "%~128")
(V16SI "i64x4") (V8SI "%~128") (V8DI "i64x4") (V4DI "%~128")])
@@ -1011,14 +1061,18 @@ (define_mode_attr bcstscalarsuff
(V32HI "w") (V16HI "w") (V8HI "w")
(V16SI "d") (V8SI "d") (V4SI "d")
(V8DI "q") (V4DI "q") (V2DI "q")
+ (V32HF "w") (V16HF "w") (V8HF "w")
(V16SF "ss") (V8SF "ss") (V4SF "ss")
(V8DF "sd") (V4DF "sd") (V2DF "sd")])
;; Tie mode of assembler operand to mode iterator
(define_mode_attr xtg_mode
- [(V16QI "x") (V8HI "x") (V4SI "x") (V2DI "x") (V4SF "x") (V2DF "x")
- (V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") (V8SF "t") (V4DF "t")
- (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") (V16SF "g") (V8DF "g")])
+ [(V16QI "x") (V8HI "x") (V4SI "x") (V2DI "x")
+ (V8HF "x") (V4SF "x") (V2DF "x")
+ (V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t")
+ (V16HF "t") (V8SF "t") (V4DF "t")
+ (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g")
+ (V32HF "g") (V16SF "g") (V8DF "g")])
;; Half mask mode for unpacks
(define_mode_attr HALFMASKMODE
@@ -8353,6 +8407,45 @@ (define_insn "vec_set<mode>_0"
]
(symbol_ref "true")))])
+;; vmovw also clears the higher bits
+(define_insn "vec_set<mode>_0"
+ [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v")
+ (vec_merge:VF_AVX512FP16
+ (vec_duplicate:VF_AVX512FP16
+ (match_operand:HF 2 "nonimmediate_operand" "rm"))
+ (match_operand:VF_AVX512FP16 1 "const0_operand" "C")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vmovw\t{%2, %x0|%x0, %2}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")])
+
+(define_insn "*avx512fp16_movsh"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (vec_duplicate:V8HF
+ (match_operand:HF 2 "register_operand" "v"))
+ (match_operand:V8HF 1 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vmovsh\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")])
+
+(define_insn "avx512fp16_movsh"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (match_operand:V8HF 2 "register_operand" "v")
+ (match_operand:V8HF 1 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vmovsh\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")])
+
;; A subset is vec_setv4sf.
(define_insn "*vec_setv4sf_sse4_1"
[(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
@@ -9189,10 +9282,10 @@ (define_insn "vec_extract_hi_<mode>"
(set_attr "length_immediate" "1")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn_and_split "vec_extract_lo_v32hi"
- [(set (match_operand:V16HI 0 "nonimmediate_operand" "=v,v,m")
- (vec_select:V16HI
- (match_operand:V32HI 1 "nonimmediate_operand" "v,m,v")
+(define_insn_and_split "vec_extract_lo_<mode>"
+ [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=v,v,m")
+ (vec_select:<ssehalfvecmode>
+ (match_operand:V32_512 1 "nonimmediate_operand" "v,m,v")
(parallel [(const_int 0) (const_int 1)
(const_int 2) (const_int 3)
(const_int 4) (const_int 5)
@@ -9219,9 +9312,10 @@ (define_insn_and_split "vec_extract_lo_v32hi"
if (!TARGET_AVX512VL
&& REG_P (operands[0])
&& EXT_REX_SSE_REG_P (operands[1]))
- operands[0] = lowpart_subreg (V32HImode, operands[0], V16HImode);
+ operands[0] = lowpart_subreg (<MODE>mode, operands[0],
+ <ssehalfvecmode>mode);
else
- operands[1] = gen_lowpart (V16HImode, operands[1]);
+ operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
}
[(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
@@ -9230,10 +9324,10 @@ (define_insn_and_split "vec_extract_lo_v32hi"
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
-(define_insn "vec_extract_hi_v32hi"
- [(set (match_operand:V16HI 0 "nonimmediate_operand" "=vm")
- (vec_select:V16HI
- (match_operand:V32HI 1 "register_operand" "v")
+(define_insn "vec_extract_hi_<mode>"
+ [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=vm")
+ (vec_select:<ssehalfvecmode>
+ (match_operand:V32_512 1 "register_operand" "v")
(parallel [(const_int 16) (const_int 17)
(const_int 18) (const_int 19)
(const_int 20) (const_int 21)
@@ -9250,10 +9344,10 @@ (define_insn "vec_extract_hi_v32hi"
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
-(define_insn_and_split "vec_extract_lo_v16hi"
- [(set (match_operand:V8HI 0 "nonimmediate_operand" "=v,m")
- (vec_select:V8HI
- (match_operand:V16HI 1 "nonimmediate_operand" "vm,v")
+(define_insn_and_split "vec_extract_lo_<mode>"
+ [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=v,m")
+ (vec_select:<ssehalfvecmode>
+ (match_operand:V16_256 1 "nonimmediate_operand" "vm,v")
(parallel [(const_int 0) (const_int 1)
(const_int 2) (const_int 3)
(const_int 4) (const_int 5)
@@ -9262,12 +9356,12 @@ (define_insn_and_split "vec_extract_lo_v16hi"
"#"
"&& reload_completed"
[(set (match_dup 0) (match_dup 1))]
- "operands[1] = gen_lowpart (V8HImode, operands[1]);")
+ "operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);")
-(define_insn "vec_extract_hi_v16hi"
- [(set (match_operand:V8HI 0 "nonimmediate_operand" "=xm,vm,vm")
- (vec_select:V8HI
- (match_operand:V16HI 1 "register_operand" "x,v,v")
+(define_insn "vec_extract_hi_<mode>"
+ [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=xm,vm,vm")
+ (vec_select:<ssehalfvecmode>
+ (match_operand:V16_256 1 "register_operand" "x,v,v")
(parallel [(const_int 8) (const_int 9)
(const_int 10) (const_int 11)
(const_int 12) (const_int 13)
@@ -9403,12 +9497,41 @@ (define_insn "vec_extract_hi_v32qi"
(set_attr "prefix" "vex,evex,evex")
(set_attr "mode" "OI")])
+;; Element 0 of an HF vector is just its low part: split into a plain move
+;; after reload.  NB: must precede *vec_extracthf or it will be ignored.
+(define_insn_and_split "*vec_extract<mode>_0"
+ [(set (match_operand:HF 0 "nonimmediate_operand" "=v,m,r")
+ (vec_select:HF
+ (match_operand:VF_AVX512FP16 1 "nonimmediate_operand" "vm,v,m")
+ (parallel [(const_int 0)])))]
+ "TARGET_AVX512FP16 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (match_dup 1))]
+ "operands[1] = gen_lowpart (HFmode, operands[1]);")
+
+(define_insn "*vec_extracthf"
+ [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=r,m")
+ (vec_select:HF
+ (match_operand:V8HF 1 "register_operand" "v,v")
+ (parallel
+ [(match_operand:SI 2 "const_0_to_7_operand")])))]
+ "TARGET_AVX512FP16"
+ "@
+ vpextrw\t{%2, %1, %k0|%k0, %1, %2}
+ vpextrw\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "type" "sselog1")
+ (set_attr "prefix" "maybe_evex")
+ (set_attr "mode" "TI")])
+
;; Modes handled by vec_extract patterns.
(define_mode_iterator VEC_EXTRACT_MODE
[(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX") V16QI
(V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX") V8HI
(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI
+ (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16")
+ (V8HF "TARGET_AVX512FP16")
(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF
(V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")])
@@ -14639,16 +14762,16 @@ (define_expand "vec_interleave_low<mode>"
;; Modes handled by pinsr patterns.
(define_mode_iterator PINSR_MODE
- [(V16QI "TARGET_SSE4_1") V8HI
+ [(V16QI "TARGET_SSE4_1") V8HI (V8HF "TARGET_AVX512FP16")
(V4SI "TARGET_SSE4_1")
(V2DI "TARGET_SSE4_1 && TARGET_64BIT")])
(define_mode_attr sse2p4_1
- [(V16QI "sse4_1") (V8HI "sse2")
+ [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse4_1")
(V4SI "sse4_1") (V2DI "sse4_1")])
(define_mode_attr pinsr_evex_isa
- [(V16QI "avx512bw") (V8HI "avx512bw")
+ [(V16QI "avx512bw") (V8HI "avx512bw") (V8HF "avx512bw")
(V4SI "avx512dq") (V2DI "avx512dq")])
;; sse4_1_pinsrd must come before sse2_loadld since it is preferred.
@@ -14676,11 +14799,19 @@ (define_insn "<sse2p4_1>_pinsr<ssemodesuffix>"
case 2:
case 4:
if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
- return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
+ {
+ if (<MODE>mode == V8HFmode)
+ return "vpinsrw\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
+ else
+ return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
+ }
/* FALLTHRU */
case 3:
case 5:
- return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ if (<MODE>mode == V8HFmode)
+ return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ else
+ return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
default:
gcc_unreachable ();
}
@@ -21095,16 +21226,17 @@ (define_mode_attr pbroadcast_evex_isa
[(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw")
(V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw")
(V16SI "avx512f") (V8SI "avx512f") (V4SI "avx512f")
- (V8DI "avx512f") (V4DI "avx512f") (V2DI "avx512f")])
+ (V8DI "avx512f") (V4DI "avx512f") (V2DI "avx512f")
+ (V32HF "avx512bw") (V16HF "avx512bw") (V8HF "avx512bw")])
(define_insn "avx2_pbroadcast<mode>"
- [(set (match_operand:VI 0 "register_operand" "=x,v")
- (vec_duplicate:VI
+ [(set (match_operand:VIHF 0 "register_operand" "=x,v")
+ (vec_duplicate:VIHF
(vec_select:<ssescalarmode>
(match_operand:<ssexmmmode> 1 "nonimmediate_operand" "xm,vm")
(parallel [(const_int 0)]))))]
"TARGET_AVX2"
- "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}"
+ "vpbroadcast<sseintmodesuffix>\t{%1, %0|%0, %<iptr>1}"
[(set_attr "isa" "*,<pbroadcast_evex_isa>")
(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
@@ -21112,17 +21244,17 @@ (define_insn "avx2_pbroadcast<mode>"
(set_attr "mode" "<sseinsnmode>")])
(define_insn "avx2_pbroadcast<mode>_1"
- [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")
- (vec_duplicate:VI_256
+ [(set (match_operand:VIHF_256 0 "register_operand" "=x,x,v,v")
+ (vec_duplicate:VIHF_256
(vec_select:<ssescalarmode>
- (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")
+ (match_operand:VIHF_256 1 "nonimmediate_operand" "m,x,m,v")
(parallel [(const_int 0)]))))]
"TARGET_AVX2"
"@
- vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
- vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}
- vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
- vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"
+ vpbroadcast<sseintmodesuffix>\t{%1, %0|%0, %<iptr>1}
+ vpbroadcast<sseintmodesuffix>\t{%x1, %0|%0, %x1}
+ vpbroadcast<sseintmodesuffix>\t{%1, %0|%0, %<iptr>1}
+ vpbroadcast<sseintmodesuffix>\t{%x1, %0|%0, %x1}"
[(set_attr "isa" "*,*,<pbroadcast_evex_isa>,<pbroadcast_evex_isa>")
(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
@@ -21476,15 +21608,15 @@ (define_insn "avx2_vec_dupv4df"
(set_attr "mode" "V4DF")])
(define_insn "<avx512>_vec_dup<mode>_1"
- [(set (match_operand:VI_AVX512BW 0 "register_operand" "=v,v")
- (vec_duplicate:VI_AVX512BW
+ [(set (match_operand:VIHF_AVX512BW 0 "register_operand" "=v,v")
+ (vec_duplicate:VIHF_AVX512BW
(vec_select:<ssescalarmode>
- (match_operand:VI_AVX512BW 1 "nonimmediate_operand" "v,m")
+ (match_operand:VIHF_AVX512BW 1 "nonimmediate_operand" "v,m")
(parallel [(const_int 0)]))))]
"TARGET_AVX512F"
"@
- vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}
- vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %<iptr>1}"
+ vpbroadcast<sseintmodesuffix>\t{%x1, %0|%0, %x1}
+ vpbroadcast<sseintmodesuffix>\t{%x1, %0|%0, %<iptr>1}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
@@ -21509,8 +21641,8 @@ (define_insn "<avx512>_vec_dup<mode><mask_name>"
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<avx512>_vec_dup<mode><mask_name>"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
- (vec_duplicate:VI12_AVX512VL
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
+ (vec_duplicate:VI12HF_AVX512VL
(vec_select:<ssescalarmode>
(match_operand:<ssexmmmode> 1 "nonimmediate_operand" "vm")
(parallel [(const_int 0)]))))]
@@ -21545,8 +21677,8 @@ (define_insn "<mask_codefor>avx512f_broadcast<mode><mask_name>"
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<mask_codefor><avx512>_vec_dup_gpr<mode><mask_name>"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v")
- (vec_duplicate:VI12_AVX512VL
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v,v")
+ (vec_duplicate:VI12HF_AVX512VL
(match_operand:<ssescalarmode> 1 "nonimmediate_operand" "vm,r")))]
"TARGET_AVX512BW"
"@
@@ -21641,7 +21773,7 @@ (define_mode_attr vecdupssescalarmodesuffix
[(V8SF "ss") (V4DF "sd") (V8SI "ss") (V4DI "sd")])
;; Modes handled by AVX2 vec_dup patterns.
(define_mode_iterator AVX2_VEC_DUP_MODE
- [V32QI V16QI V16HI V8HI V8SI V4SI])
+ [V32QI V16QI V16HI V8HI V8SI V4SI V16HF V8HF])
(define_insn "*vec_dup<mode>"
[(set (match_operand:AVX2_VEC_DUP_MODE 0 "register_operand" "=x,x,v")
@@ -22403,6 +22535,8 @@ (define_mode_iterator VEC_INIT_MODE
(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI
+ (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16")
+ (V8HF "TARGET_AVX512FP16")
(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")
(V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")])
@@ -22414,6 +22548,8 @@ (define_mode_iterator VEC_INIT_HALF_MODE
(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX")
+ (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16")
+ (V8HF "TARGET_AVX512FP16")
(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX")
(V4TI "TARGET_AVX512F")])
From: "H.J. Lu" <hjl.tools@gmail.com>

gcc/ChangeLog:

	* config/i386/avx512fp16intrin.h (_mm_set_ph): New intrinsic.
	(_mm256_set_ph): Likewise.
	(_mm512_set_ph): Likewise.
	(_mm_setr_ph): Likewise.
	(_mm256_setr_ph): Likewise.
	(_mm512_setr_ph): Likewise.
	(_mm_set1_ph): Likewise.
	(_mm256_set1_ph): Likewise.
	(_mm512_set1_ph): Likewise.
	(_mm_setzero_ph): Likewise.
	(_mm256_setzero_ph): Likewise.
	(_mm512_setzero_ph): Likewise.
	(_mm_set_sh): Likewise.
	(_mm_load_sh): Likewise.
	(_mm_store_sh): Likewise.
	* config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): Support vector HFmodes.
	(ix86_expand_vector_init_one_nonzero): Likewise.
	(ix86_expand_vector_init_one_var): Likewise.
	(ix86_expand_vector_init_interleave): Likewise.
	(ix86_expand_vector_init_general): Likewise.
	(ix86_expand_vector_set): Likewise.
	(ix86_expand_vector_extract): Likewise.
	* config/i386/i386-modes.def: Add HF vector modes in comment.
	* config/i386/i386.c (classify_argument): Add HF vector modes.
	(inline_secondary_memory_needed): Enable 16bit move.
	(ix86_hard_regno_mode_ok): Allow HF vector modes for AVX512FP16.
	(ix86_vector_mode_supported_p): Likewise.
	* config/i386/i386.md (mode): Add HF vector modes.
	(MODE_SIZE): Likewise.
	(ssemodesuffix): Add ph suffix for HF vector modes.
	* config/i386/sse.md (VMOVE): Adjust for HF vector modes.
	(V): Likewise.
	(V_256_512): Likewise.
	(avx512): Likewise.
	(shuffletype): Likewise.
	(sseinsnmode): Likewise.
	(ssedoublevecmode): Likewise.
	(ssehalfvecmode): Likewise.
	(ssehalfvecmodelower): Likewise.
	(ssePScmode): Likewise.
	(ssescalarmode): Likewise.
	(ssescalarmodelower): Likewise.
	(sseintprefix): Likewise.
	(i128): Likewise.
	(bcstscalarsuff): Likewise.
	(xtg_mode): Likewise.
	(VI12HF_AVX512VL): New mode_iterator.
	(VF_AVX512FP16): Likewise.
	(VIHF): Likewise.
	(VIHF_256): Likewise.
	(VIHF_AVX512BW): Likewise.
	(V16_256): Likewise.
	(V32_512): Likewise.
	(sseintmodesuffix): New mode_attr.
	(vec_set<mode>_0): New define_insn for HF vector set.
	(*avx512fp16_movsh): Likewise.
	(avx512fp16_movsh): Likewise.
	(vec_extract_lo_v32hi): Rename to ...
	(vec_extract_lo_<mode>): ... this, and adjust to allow HF vector modes.
	(vec_extract_hi_v32hi): Likewise.
	(vec_extract_hi_<mode>): Likewise.
	(vec_extract_lo_v16hi): Likewise.
	(vec_extract_lo_<mode>): Likewise.
	(vec_extract_hi_v16hi): Likewise.
	(vec_extract_hi_<mode>): Likewise.
	(*vec_extract<mode>_0): New define_insn_and_split for HF vector extract.
	(*vec_extracthf): New define_insn.
	(VEC_EXTRACT_MODE): Add HF vector modes.
	(PINSR_MODE): Add V8HF.
	(sse2p4_1): Likewise.
	(pinsr_evex_isa): Likewise.
	(<sse2p4_1>_pinsr<ssemodesuffix>): Adjust to support insert for V8HFmode.
	(pbroadcast_evex_isa): Add HF vector modes.
	(AVX2_VEC_DUP_MODE): Likewise.
	(VEC_INIT_MODE): Likewise.
	(VEC_INIT_HALF_MODE): Likewise.
	(avx2_pbroadcast<mode>): Adjust to support HF vector mode broadcast.
	(avx2_pbroadcast<mode>_1): Likewise.
	(<avx512>_vec_dup<mode>_1): Likewise.
	(<avx512>_vec_dup<mode><mask_name>): Likewise.
	(<mask_codefor><avx512>_vec_dup_gpr<mode><mask_name>): Likewise.
---
 gcc/config/i386/avx512fp16intrin.h | 172 +++++++++++++++++++
 gcc/config/i386/i386-expand.c | 79 ++++++++-
 gcc/config/i386/i386-modes.def | 12 +-
 gcc/config/i386/i386.c | 19 ++-
 gcc/config/i386/i386.md | 13 +-
 gcc/config/i386/sse.md | 266 ++++++++++++++++++++++-------
 6 files changed, 480 insertions(+), 81 deletions(-)