@@ -13723,6 +13723,19 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
}
goto widen;
+ case E_V2HImode:
+ if (TARGET_SSE2)
+ {
+ rtx x;
+
+ val = gen_lowpart (SImode, val);
+ x = gen_rtx_TRUNCATE (HImode, val);
+ x = gen_rtx_VEC_DUPLICATE (mode, x);
+ emit_insn (gen_rtx_SET (target, x));
+ return true;
+ }
+ return false;
+
case E_V8QImode:
if (!mmx_ok)
return false;
@@ -14524,6 +14537,8 @@ quarter:
case E_V4HImode:
case E_V8QImode:
+
+ case E_V2HImode:
break;
default:
@@ -14532,12 +14547,14 @@ quarter:
{
int i, j, n_elts, n_words, n_elt_per_word;
- machine_mode inner_mode;
+ machine_mode tmp_mode, inner_mode;
rtx words[4], shift;
+ tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
+
inner_mode = GET_MODE_INNER (mode);
n_elts = GET_MODE_NUNITS (mode);
- n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+ n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
n_elt_per_word = n_elts / n_words;
shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
@@ -14548,15 +14565,15 @@ quarter:
for (j = 0; j < n_elt_per_word; ++j)
{
rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
- elt = convert_modes (word_mode, inner_mode, elt, true);
+ elt = convert_modes (tmp_mode, inner_mode, elt, true);
if (j == 0)
word = elt;
else
{
- word = expand_simple_binop (word_mode, ASHIFT, word, shift,
+ word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
word, 1, OPTAB_LIB_WIDEN);
- word = expand_simple_binop (word_mode, IOR, word, elt,
+ word = expand_simple_binop (tmp_mode, IOR, word, elt,
word, 1, OPTAB_LIB_WIDEN);
}
}
@@ -14570,14 +14587,14 @@ quarter:
{
rtx tmp = gen_reg_rtx (mode);
emit_clobber (tmp);
- emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
- emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
+ emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
+ emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
emit_move_insn (target, tmp);
}
else if (n_words == 4)
{
rtx tmp = gen_reg_rtx (V4SImode);
- gcc_assert (word_mode == SImode);
+ gcc_assert (tmp_mode == SImode);
vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
emit_move_insn (target, gen_lowpart (mode, tmp));
@@ -19544,6 +19561,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
case E_V2DImode:
case E_V2SImode:
case E_V4SImode:
+ case E_V2HImode:
/* These are always directly implementable by expand_vec_perm_1. */
gcc_unreachable ();
@@ -19754,6 +19772,8 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
case E_V2DImode:
case E_V2SImode:
case E_V4SImode:
+ case E_V2HImode:
+ case E_V4HImode:
/* These are always implementable using standard shuffle patterns. */
gcc_unreachable ();
@@ -20263,6 +20283,10 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
if (!TARGET_MMX_WITH_SSE)
return false;
break;
+ case E_V2HImode:
+ if (!TARGET_SSE2)
+ return false;
+ break;
case E_V2DImode:
case E_V2DFmode:
if (!TARGET_SSE)
@@ -20294,10 +20318,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
/* Check whether the mask can be applied to the vector type. */
d.one_operand_p = (which != 3);
- /* Implementable with shufps or pshufd. */
+ /* Implementable with shufps, pshufd or pshuflw. */
if (d.one_operand_p
&& (d.vmode == V4SFmode || d.vmode == V2SFmode
- || d.vmode == V4SImode || d.vmode == V2SImode))
+ || d.vmode == V4SImode || d.vmode == V2SImode
+ || d.vmode == V4HImode || d.vmode == V2HImode))
return true;
/* Otherwise we have to go through the motions and see if we can
@@ -3292,6 +3292,88 @@ (define_expand "vec_extractv4qiqi"
DONE;
})
+;; Two-operand V2HI permutation for TARGET_SSE2.  Implemented after reload
+;; by interleaving the low words of the two inputs with punpcklwd (low V4HI
+;; lanes become op1[0] op2[0] op1[1] op2[1]) and, when the requested pair is
+;; not already in lanes 0/1, a follow-up V4HI shuffle (pshuflw) selected by
+;; the rewritten operands 3/4.
(define_insn_and_split "*punpckwd"
 [(set (match_operand:V2HI 0 "register_operand" "=x,Yw")
 (vec_select:V2HI
 (vec_concat:V4HI
 (match_operand:V2HI 1 "register_operand" "0,Yw")
 (match_operand:V2HI 2 "register_operand" "x,Yw"))
 (parallel [(match_operand 3 "const_0_to_3_operand")
 (match_operand 4 "const_0_to_3_operand")])))]
 "TARGET_SSE2"
 "#"
 "&& reload_completed"
 [(set (match_dup 5)
 (vec_select:V4HI
 (match_dup 5)
 (parallel [(match_dup 3) (match_dup 4)
 (const_int 0) (const_int 0)])))]
{
 /* Work on V8HI views of the 32-bit vector registers so the full-width
    punpcklwd pattern can be used.  */
 rtx dest = lowpart_subreg (V8HImode, operands[0], V2HImode);
 rtx op1 = lowpart_subreg (V8HImode, operands[1], V2HImode);
 rtx op2 = lowpart_subreg (V8HImode, operands[2], V2HImode);

 emit_insn (gen_vec_interleave_lowv8hi (dest, op1, op2));

 /* Map a vec_concat index (0,1 = elements of op1; 2,3 = elements of op2)
    to the lane it occupies after the interleave above.  */
 static const int map[4] = { 0, 2, 1, 3 };

 int sel0 = map[INTVAL (operands[3])];
 int sel1 = map[INTVAL (operands[4])];

 /* If the interleave already left the selected elements in lanes 0 and 1,
    no shuffle is needed.  */
 if (sel0 == 0 && sel1 == 1)
 DONE;

 /* Otherwise emit the V4HI vec_select above with the remapped lane
    selectors; operand 5 is the V4HI view of the destination.  */
 operands[3] = GEN_INT (sel0);
 operands[4] = GEN_INT (sel1);

 operands[5] = lowpart_subreg (V4HImode, dest, V8HImode);
}
 [(set_attr "isa" "noavx,avx")
 (set_attr "type" "sselog")
 (set_attr "mode" "TI")])
+
+;; One-operand V2HI shuffle: select each of the two result words from
+;; either word of operand 1, emitted as pshuflw on the full register.
(define_insn "*pshufw_1"
 [(set (match_operand:V2HI 0 "register_operand" "=Yw")
 (vec_select:V2HI
 (match_operand:V2HI 1 "register_operand" "Yw")
 (parallel [(match_operand 2 "const_0_to_1_operand")
 (match_operand 3 "const_0_to_1_operand")])))]
 "TARGET_SSE2"
{
 /* Build the pshuflw 8-bit immediate: 2-bit selectors per lane.  Lanes 0
    and 1 come from operands 2 and 3; lanes 2 and 3 are don't-care for
    V2HI, so keep them as identity (2 and 3).  */
 int mask = 0;
 mask |= INTVAL (operands[2]) << 0;
 mask |= INTVAL (operands[3]) << 2;
 mask |= 2 << 4;
 mask |= 3 << 6;
 operands[2] = GEN_INT (mask);

 return "%vpshuflw\t{%2, %1, %0|%0, %1, %2}";
}
 [(set_attr "type" "sselog1")
 (set_attr "length_immediate" "1")
 (set_attr "mode" "TI")])
+
+;; Broadcast the low 16 bits of an SImode register to both V2HI lanes,
+;; using pshuflw with a zero immediate (all lane selectors = 0).
(define_insn "*vec_dupv2hi"
 [(set (match_operand:V2HI 0 "register_operand" "=Yw")
 (vec_duplicate:V2HI
 (truncate:HI
 (match_operand:SI 1 "register_operand" "Yw"))))]
 "TARGET_SSE2"
 "%vpshuflw\t{$0, %1, %0|%0, %1, 0}"
 [(set_attr "type" "sselog1")
 (set_attr "length_immediate" "1")
 (set_attr "mode" "TI")])
+
+;; Standard vec_init expander for V2HI; defers all the work (duplicate,
+;; constant and general cases) to ix86_expand_vector_init.
(define_expand "vec_initv2hihi"
 [(match_operand:V2HI 0 "register_operand")
 (match_operand 1)]
 "TARGET_SSE2"
{
 ix86_expand_vector_init (false, operands[0],
 operands[1]);
 DONE;
})
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Miscellaneous
@@ -57,13 +57,13 @@ int main (int argc, const char* argv[])
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 2 "vect" { target { ! { vect_perm_short || vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_perm_short || vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 2 "vect" { target { ! { { vect_perm_short || vect32 } || vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_perm_short || vect32 } || vect_load_lanes } } } } */
/* We don't try permutes with a group size of 3 for variable-length
vectors. */
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target { vect_perm_short && { { ! vect_perm3_short } && { ! vect_partial_vectors_usage_1 } } } xfail vect_variable_length } } } */
/* Try to vectorize the epilogue using partial vectors. */
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 2 "vect" { target { vect_perm_short && { { ! vect_perm3_short } && vect_partial_vectors_usage_1 } } xfail vect_variable_length } } } */
/* { dg-final { scan-tree-dump-not "permutation requires at least three vectors" "vect" { target vect_perm3_short } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! vect_perm3_short } || vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_short && { ! vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! { vect_perm3_short || vect32 } } || vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_perm3_short || vect32 } && { ! vect_load_lanes } } } } } */