@@ -192,6 +192,8 @@ extern void ix86_expand_rounddf_32 (rtx, rtx);
extern void ix86_expand_trunc (rtx, rtx);
extern void ix86_expand_truncdf_32 (rtx, rtx);
+extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
+
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
#endif /* TREE_CODE */
@@ -38438,6 +38438,91 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
expand_vec_perm_even_odd_1 (&d, odd);
}
+/* Expand a vector operation CODE for a V*QImode in terms of the
+ same operation on V*HImode. */
+
+void
+ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ enum machine_mode qimode = GET_MODE (dest);
+ enum machine_mode himode;
+ rtx (*gen_il) (rtx, rtx, rtx);
+ rtx (*gen_ih) (rtx, rtx, rtx);
+ rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+ struct expand_vec_perm_d d;
+ bool ok;
+ int i;
+
+ if (qimode == V16QImode)
+ {
+ himode = V8HImode;
+ gen_il = gen_vec_interleave_lowv16qi;
+ gen_ih = gen_vec_interleave_highv16qi;
+ }
+ else if (qimode == V32QImode)
+ {
+ himode = V16HImode;
+ gen_il = gen_avx2_interleave_lowv32qi;
+ gen_ih = gen_avx2_interleave_highv32qi;
+ }
+ else
+ gcc_unreachable ();
+
+ /* Unpack data such that we've got a source byte in each low byte of
+ each word. We don't care what goes into the high byte of each word.
+ Rather than trying to get zero in there, most convenient is to let
+ it be a copy of the low byte. */
+ op1_l = gen_reg_rtx (qimode);
+ op1_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op1_l, op1, op1));
+ emit_insn (gen_ih (op1_h, op1, op1));
+
+ op2_l = gen_reg_rtx (qimode);
+ op2_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op2_l, op2, op2));
+ emit_insn (gen_ih (op2_h, op2, op2));
+
+ /* Perform the operation. */
+ res_l = expand_simple_binop (himode, code, gen_lowpart (himode, op1_l),
+ gen_lowpart (himode, op2_l), NULL_RTX,
+ 1, OPTAB_DIRECT);
+ res_h = expand_simple_binop (himode, code, gen_lowpart (himode, op1_h),
+ gen_lowpart (himode, op2_h), NULL_RTX,
+ 1, OPTAB_DIRECT);
+ gcc_assert (res_l && res_h);
+
+ /* Merge the data back into the right place. */
+ d.target = dest;
+ d.op0 = gen_lowpart (qimode, res_l);
+ d.op1 = gen_lowpart (qimode, res_h);
+ d.vmode = qimode;
+ d.nelt = GET_MODE_NUNITS (qimode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ if (qimode == V16QImode)
+ {
+      /* For SSE2, we used a full interleave, so the desired
+ results are in the even elements. */
+ for (i = 0; i < 16; ++i)
+ d.perm[i] = i * 2;
+ }
+ else
+ {
+      /* For AVX2, the interleaves used above are not cross-lane, so the
+	 extraction is of the even elements, but with the second and third
+	 quarters swapped: result bytes 8-15 sit in res_h's low lane and
+	 bytes 16-23 in res_l's high lane.  Happily, that is even one insn
+	 shorter than a plain even extraction.  */
+ for (i = 0; i < 32; ++i)
+ d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+ }
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_fmt_ee (code, qimode, op1, op2));
+}
+
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
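
A note on why "we don't care what goes into the high byte of each word"
holds for the multiply case: 257 == 1 (mod 256), so interleaving each
source byte with itself cannot change the low byte of the 16-bit word
product.  The following standalone scalar sketch (an illustration only,
not part of the patch) checks that property exhaustively:

/* Model one word of the unpacked multiply: each word holds a source
   byte in its low half and, as with the interleave-with-self above, a
   copy of that byte in its high half.  */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (unsigned a = 0; a < 256; a++)
    for (unsigned b = 0; b < 256; b++)
      {
	uint16_t wa = (uint16_t) (a << 8 | a);
	uint16_t wb = (uint16_t) (b << 8 | b);
	/* The low byte of the word product is the byte product mod 256,
	   independent of the high halves.  */
	assert ((uint8_t) ((uint32_t) wa * wb) == (uint8_t) (a * b));
      }
  return 0;
}
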
@@ -5213,70 +5213,13 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "TI")])
-(define_insn_and_split "mul<mode>3"
+(define_expand "mul<mode>3"
[(set (match_operand:VI1_AVX2 0 "register_operand")
(mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
(match_operand:VI1_AVX2 2 "register_operand")))]
- "TARGET_SSE2
- && can_create_pseudo_p ()"
- "#"
- "&& 1"
- [(const_int 0)]
+ "TARGET_SSE2"
{
- rtx t[6];
- int i;
- enum machine_mode mulmode = <sseunpackmode>mode;
-
- for (i = 0; i < 6; ++i)
- t[i] = gen_reg_rtx (<MODE>mode);
-
- /* Unpack data such that we've got a source byte in each low byte of
- each word. We don't care what goes into the high byte of each word.
- Rather than trying to get zero in there, most convenient is to let
- it be a copy of the low byte. */
- emit_insn (gen_<vec_avx2>_interleave_high<mode> (t[0], operands[1],
- operands[1]));
- emit_insn (gen_<vec_avx2>_interleave_high<mode> (t[1], operands[2],
- operands[2]));
- emit_insn (gen_<vec_avx2>_interleave_low<mode> (t[2], operands[1],
- operands[1]));
- emit_insn (gen_<vec_avx2>_interleave_low<mode> (t[3], operands[2],
- operands[2]));
-
- /* Multiply words. The end-of-line annotations here give a picture of what
- the output of that instruction looks like. Dot means don't care; the
- letters are the bytes of the result with A being the most significant. */
- emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (mulmode, t[4]),
- gen_rtx_MULT (mulmode, /* .A.B.C.D.E.F.G.H */
- gen_lowpart (mulmode, t[0]),
- gen_lowpart (mulmode, t[1]))));
- emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (mulmode, t[5]),
- gen_rtx_MULT (mulmode, /* .I.J.K.L.M.N.O.P */
- gen_lowpart (mulmode, t[2]),
- gen_lowpart (mulmode, t[3]))));
-
- /* Extract the even bytes and merge them back together. */
- if (<MODE>mode == V16QImode)
- ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0);
- else
- {
- /* Since avx2_interleave_{low,high}v32qi used above aren't cross-lane,
- this can't be normal even extraction, but one where additionally
- the second and third quarter are swapped. That is even one insn
- shorter than even extraction. */
- rtvec v = rtvec_alloc (32);
- for (i = 0; i < 32; ++i)
- RTVEC_ELT (v, i)
- = GEN_INT (i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0));
- t[0] = operands[0];
- t[1] = t[5];
- t[2] = t[4];
- t[3] = gen_rtx_CONST_VECTOR (<MODE>mode, v);
- ix86_expand_vec_perm_const (t);
- }
-
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_MULT (<MODE>mode, operands[1], operands[2]));
+ ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
DONE;
})
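
For reference, the perm index arithmetic used for the V32QImode case can
be sanity-checked in isolation.  This standalone program (an illustration
only, not part of the patch; indices 0-31 select bytes of res_l and 32-63
bytes of res_h, as in ix86_expand_vecop_qihi above) prints the mapping:

#include <stdio.h>

int
main (void)
{
  for (int i = 0; i < 32; ++i)
    {
      /* Same expression as in ix86_expand_vecop_qihi: take the even
	 bytes, but move quarter 1 up into res_h and quarter 2 down
	 into res_l.  */
      int e = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
      printf ("result byte %2d <- %s byte %2d\n",
	      i, e < 32 ? "res_l" : "res_h", e & 31);
    }
  return 0;
}

The output shows result bytes 0-7 coming from res_l's low lane, 8-15 from
res_h's low lane, 16-23 from res_l's high lane, and 24-31 from res_h's
high lane, matching the per-lane behaviour of the AVX2 interleaves.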