@@ -23106,68 +23106,6 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
gcc_assert (ok);
}
-/* This function is similar as ix86_expand_vecop_qihi,
- but optimized under AVX512BW by using vpmovwb.
- For example, optimize vector MUL generation like
-
- vpmovzxbw ymm2, xmm0
- vpmovzxbw ymm3, xmm1
- vpmullw ymm4, ymm2, ymm3
- vpmovwb xmm0, ymm4
-
- it would take less instructions than ix86_expand_vecop_qihi.
- Return true if success. */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
- machine_mode himode, qimode = GET_MODE (dest);
- rtx hop1, hop2, hdest;
- rtx (*gen_truncate)(rtx, rtx);
- bool uns_p = (code == ASHIFTRT) ? false : true;
-
- /* There are no V64HImode instructions. */
- if (qimode == V64QImode)
- return false;
-
- /* vpmovwb only available under AVX512BW. */
- if (!TARGET_AVX512BW)
- return false;
-
- if (qimode == V16QImode && !TARGET_AVX512VL)
- return false;
-
- /* Do not generate ymm/zmm instructions when
- target prefers 128/256 bit vector width. */
- if ((qimode == V16QImode && TARGET_PREFER_AVX128)
- || (qimode == V32QImode && TARGET_PREFER_AVX256))
- return false;
-
- switch (qimode)
- {
- case E_V16QImode:
- himode = V16HImode;
- gen_truncate = gen_truncv16hiv16qi2;
- break;
- case E_V32QImode:
- himode = V32HImode;
- gen_truncate = gen_truncv32hiv32qi2;
- break;
- default:
- gcc_unreachable ();
- }
-
- hop1 = gen_reg_rtx (himode);
- hop2 = gen_reg_rtx (himode);
- hdest = gen_reg_rtx (himode);
- emit_insn (gen_extend_insn (hop1, op1, himode, qimode, uns_p));
- emit_insn (gen_extend_insn (hop2, op2, himode, qimode, uns_p));
- emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
- hop1, hop2)));
- emit_insn (gen_truncate (dest, hdest));
- return true;
-}
-
/* Expand a vector operation shift by constant for a V*QImode in terms of the
same operation on V*HImode. Return true if success. */
static bool
@@ -23272,9 +23210,9 @@ void
ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
machine_mode qimode = GET_MODE (dest);
- rtx qop1, qop2, hop1, hop2, qdest, hres;
+ rtx qop1, qop2, hop1, hop2, qdest, hdest;
bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
- bool uns_p = true;
+ bool uns_p = code != ASHIFTRT;
switch (qimode)
{
@@ -23306,24 +23244,25 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
case MULT:
gcc_assert (op2vec);
- /* Unpack data such that we've got a source byte in each low byte of
- each word. We don't care what goes into the high byte of each word.
- Rather than trying to get zero in there, most convenient is to let
- it be a copy of the low byte. */
- hop1 = copy_to_reg (qop1);
- hop2 = copy_to_reg (qop2);
- emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
- emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
- break;
-
- case ASHIFTRT:
- uns_p = false;
+ if (!TARGET_SSE4_1)
+ {
+ /* Unpack data such that we've got a source byte in each low byte
+ of each word. We don't care what goes into the high byte of
+ each word. Rather than trying to get zero in there, most
+ convenient is to let it be a copy of the low byte. */
+ hop1 = copy_to_reg (qop1);
+ hop2 = copy_to_reg (qop2);
+ emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+ emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+ break;
+ }
/* FALLTHRU */
case ASHIFT:
+ case ASHIFTRT:
case LSHIFTRT:
hop1 = gen_reg_rtx (V8HImode);
ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
- /* vashr/vlshr/vashl */
+ /* mult/vashr/vlshr/vashl */
if (op2vec)
{
hop2 = gen_reg_rtx (V8HImode);
@@ -23340,14 +23279,14 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
if (code != MULT && op2vec)
{
/* Expand vashr/vlshr/vashl. */
- hres = gen_reg_rtx (V8HImode);
- emit_insn (gen_rtx_SET (hres,
+ hdest = gen_reg_rtx (V8HImode);
+ emit_insn (gen_rtx_SET (hdest,
simplify_gen_binary (code, V8HImode,
hop1, hop2)));
}
else
/* Expand mult/ashr/lshr/ashl. */
- hres = expand_simple_binop (V8HImode, code, hop1, hop2,
+ hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
NULL_RTX, 1, OPTAB_DIRECT);
if (TARGET_AVX512BW && TARGET_AVX512VL)
@@ -23357,19 +23296,18 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
else
qdest = gen_reg_rtx (V8QImode);
- emit_insn (gen_truncv8hiv8qi2 (qdest, hres));
+ emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
}
else
{
struct expand_vec_perm_d d;
- rtx qres = gen_lowpart (V16QImode, hres);
+ rtx qres = gen_lowpart (V16QImode, hdest);
bool ok;
int i;
/* Merge the data back into the right place. */
d.target = qdest;
- d.op0 = qres;
- d.op1 = qres;
+ d.op0 = d.op1 = qres;
d.vmode = V16QImode;
d.nelt = 16;
d.one_operand_p = false;
@@ -23386,6 +23324,116 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
emit_move_insn (dest, gen_lowpart (qimode, qdest));
}
+/* Emit the operation in a 2x wider mode. For example, optimize
+ vector MUL generation like
+
+ vpmovzxbw ymm2, xmm0
+ vpmovzxbw ymm3, xmm1
+ vpmullw ymm4, ymm2, ymm3
+ vpmovwb xmm0, ymm4
+
+ it takes fewer instructions than ix86_expand_vecop_qihi.
+ Return true on success. */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode himode, qimode = GET_MODE (dest);
+ machine_mode wqimode;
+ rtx qop1, qop2, hop1, hop2, hdest;
+ rtx (*gen_truncate)(rtx, rtx) = NULL;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+ bool uns_p = code != ASHIFTRT;
+
+ if ((qimode == V16QImode && !TARGET_AVX2)
+ || (qimode == V32QImode && !TARGET_AVX512BW)
+ /* There are no V64HImode instructions. */
+ || qimode == V64QImode)
+ return false;
+
+ /* Do not generate ymm/zmm instructions when
+ target prefers 128/256 bit vector width. */
+ if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+ || (qimode == V32QImode && TARGET_PREFER_AVX256))
+ return false;
+
+ switch (qimode)
+ {
+ case E_V16QImode:
+ himode = V16HImode;
+ if (TARGET_AVX512VL)
+ gen_truncate = gen_truncv16hiv16qi2;
+ break;
+ case E_V32QImode:
+ himode = V32HImode;
+ gen_truncate = gen_truncv32hiv32qi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+ qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+ if (op2vec)
+ qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+ else
+ qop2 = op2;
+
+ hop1 = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+ if (op2vec)
+ {
+ hop2 = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+ }
+ else
+ hop2 = qop2;
+
+ if (code != MULT && op2vec)
+ {
+ /* Expand vashr/vlshr/vashl. */
+ hdest = gen_reg_rtx (himode);
+ emit_insn (gen_rtx_SET (hdest,
+ simplify_gen_binary (code, himode,
+ hop1, hop2)));
+ }
+ else
+ /* Expand mult/ashr/lshr/ashl. */
+ hdest = expand_simple_binop (himode, code, hop1, hop2,
+ NULL_RTX, 1, OPTAB_DIRECT);
+
+ if (gen_truncate)
+ emit_insn (gen_truncate (dest, hdest));
+ else
+ {
+ struct expand_vec_perm_d d;
+ rtx wqdest = gen_reg_rtx (wqimode);
+ rtx wqres = gen_lowpart (wqimode, hdest);
+ bool ok;
+ int i;
+
+ /* Merge the data back into the right place. */
+ d.target = wqdest;
+ d.op0 = d.op1 = wqres;
+ d.vmode = wqimode;
+ d.nelt = GET_MODE_NUNITS (wqimode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = i * 2;
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+
+ emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+ }
+
+ return true;
+}
+
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
@@ -23400,7 +23448,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
struct expand_vec_perm_d d;
bool full_interleave = true;
- bool uns_p = true;
+ bool uns_p = code != ASHIFTRT;
bool ok;
int i;
@@ -23409,9 +23457,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
&& ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
return;
- if (TARGET_AVX512BW
- && VECTOR_MODE_P (GET_MODE (op2))
- && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+ if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
return;
switch (qimode)
@@ -23468,10 +23514,8 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
emit_insn (gen_ih (op1_h, op1, op1));
break;
- case ASHIFTRT:
- uns_p = false;
- /* FALLTHRU */
case ASHIFT:
+ case ASHIFTRT:
case LSHIFTRT:
op1_l = gen_reg_rtx (himode);
op1_h = gen_reg_rtx (himode);
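
(The hunks above cover the expanders; the remaining compiler hunks adjust the cost model in ix86_multiplication_cost and ix86_shift_rotate_cost.)  As a quick illustration of what the rewritten expanders handle, here is a minimal sketch that is not part of the patch, written with GCC's generic vector extensions; the typedef and function names are made up:

typedef char v8qi  __attribute__ ((vector_size (8)));
typedef char v16qi __attribute__ ((vector_size (16)));

/* With -msse4.1 the partial-mode path now zero-extends both operands
   (via pmovzxbw) instead of interleaving them with punpcklbw.  */
v8qi
mul_v8qi (v8qi a, v8qi b)
{
  return a * b;
}

/* With -mavx2 this now reaches ix86_expand_vecop_qihi2: the multiply is done
   in V16HImode and narrowed back with a constant byte permute, or with
   vpmovwb when AVX512VL is also available.  */
v16qi
mul_v16qi (v16qi a, v16qi b)
{
  return a * b;
}
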
@@ -20497,75 +20497,109 @@ ix86_multiplication_cost (const struct processor_costs *cost,
return ix86_vec_cost (mode,
inner_mode == DFmode ? cost->mulsd : cost->mulss);
else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- switch (mode)
- {
- case V4QImode:
- case V8QImode:
- /* Partial V*QImode is emulated with 4-6 insns. */
- if (TARGET_AVX512BW && TARGET_AVX512VL)
- return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
- else if (TARGET_AVX2)
- return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 5);
- else if (TARGET_XOP)
- return (ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3)
- + cost->sse_load[2]);
- else
- return (ix86_vec_cost (mode, cost->mulss + cost->sse_op * 4)
- + cost->sse_load[2]);
-
- case V16QImode:
- /* V*QImode is emulated with 4-11 insns. */
- if (TARGET_AVX512BW && TARGET_AVX512VL)
- return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
- else if (TARGET_AVX2)
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 8);
- else if (TARGET_XOP)
- return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5)
- + cost->sse_load[2]);
- else
- return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 7)
- + cost->sse_load[2]);
+ {
+ int nmults, nops;
+ /* Cost of reading the memory. */
+ int extra;
- case V32QImode:
- if (TARGET_AVX512BW)
- return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
- else
- return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 7)
- + cost->sse_load[3] * 2);
-
- case V64QImode:
- return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 9)
- + cost->sse_load[3] * 2
- + cost->sse_load[4] * 2);
-
- case V4SImode:
- /* pmulld is used in this case. No emulation is needed. */
- if (TARGET_SSE4_1)
- goto do_native;
- /* V4SImode is emulated with 7 insns. */
- else
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
-
- case V2DImode:
- case V4DImode:
- /* vpmullq is used in this case. No emulation is needed. */
- if (TARGET_AVX512DQ && TARGET_AVX512VL)
- goto do_native;
- /* V*DImode is emulated with 6-8 insns. */
- else if (TARGET_XOP && mode == V2DImode)
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 4);
- /* FALLTHRU */
- case V8DImode:
- /* vpmullq is used in this case. No emulation is needed. */
- if (TARGET_AVX512DQ && mode == V8DImode)
- goto do_native;
- else
- return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
+ switch (mode)
+ {
+ case V4QImode:
+ case V8QImode:
+ /* Partial V*QImode is emulated with 4-6 insns. */
+ nmults = 1;
+ nops = 3;
+ extra = 0;
- default:
- do_native:
- return ix86_vec_cost (mode, cost->mulss);
- }
+ if (TARGET_AVX512BW && TARGET_AVX512VL)
+ ;
+ else if (TARGET_AVX2)
+ nops += 2;
+ else if (TARGET_XOP)
+ extra += cost->sse_load[2];
+ else
+ {
+ nops += 1;
+ extra += cost->sse_load[2];
+ }
+ goto do_qimode;
+
+ case V16QImode:
+ /* V*QImode is emulated with 4-11 insns. */
+ nmults = 1;
+ nops = 3;
+ extra = 0;
+
+ if (TARGET_AVX2 && !TARGET_PREFER_AVX128)
+ {
+ if (!(TARGET_AVX512BW && TARGET_AVX512VL))
+ nops += 3;
+ }
+ else if (TARGET_XOP)
+ {
+ nmults += 1;
+ nops += 2;
+ extra += cost->sse_load[2];
+ }
+ else
+ {
+ nmults += 1;
+ nops += 4;
+ extra += cost->sse_load[2];
+ }
+ goto do_qimode;
+
+ case V32QImode:
+ nmults = 1;
+ nops = 3;
+ extra = 0;
+
+ if (!TARGET_AVX512BW || TARGET_PREFER_AVX256)
+ {
+ nmults += 1;
+ nops += 4;
+ extra += cost->sse_load[3] * 2;
+ }
+ goto do_qimode;
+
+ case V64QImode:
+ nmults = 2;
+ nops = 9;
+ extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2;
+
+ do_qimode:
+ return ix86_vec_cost (mode, cost->mulss * nmults
+ + cost->sse_op * nops) + extra;
+
+ case V4SImode:
+ /* pmulld is used in this case. No emulation is needed. */
+ if (TARGET_SSE4_1)
+ goto do_native;
+ /* V4SImode is emulated with 7 insns. */
+ else
+ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
+
+ case V2DImode:
+ case V4DImode:
+ /* vpmullq is used in this case. No emulation is needed. */
+ if (TARGET_AVX512DQ && TARGET_AVX512VL)
+ goto do_native;
+ /* V*DImode is emulated with 6-8 insns. */
+ else if (TARGET_XOP && mode == V2DImode)
+ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 4);
+ /* FALLTHRU */
+ case V8DImode:
+ /* vpmullq is used in this case. No emulation is needed. */
+ if (TARGET_AVX512DQ && mode == V8DImode)
+ goto do_native;
+ else
+ return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
+
+ default:
+ do_native:
+ return ix86_vec_cost (mode, cost->mulss);
+ }
+ }
else
return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
}
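
To make the refactored formula concrete, here is a hypothetical walk-through of the V16QImode multiply estimate; the per-insn costs below are invented for the example, not taken from any processor table, and each result is the value handed to ix86_vec_cost before any width scaling:

#include <stdio.h>

int
main (void)
{
  int mulss = 11, sse_op = 1;	/* assumed costs, for illustration only */

  /* Old AVX2 estimate: cost->mulss * 2 + cost->sse_op * 8.  */
  printf ("old avx2:    %d\n", mulss * 2 + sse_op * 8);

  /* New AVX2 estimate: nmults = 1, nops = 3 + 3.  */
  printf ("new avx2:    %d\n", mulss * 1 + sse_op * 6);

  /* AVX512BW+VL estimate (unchanged): nmults = 1, nops = 3.  */
  printf ("avx512bw+vl: %d\n", mulss * 1 + sse_op * 3);

  return 0;
}

The drop for plain AVX2 reflects the cheaper ix86_expand_vecop_qihi2 sequence: one widening multiply plus a constant permute instead of two interleaved multiplies.
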
@@ -20637,16 +20671,13 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
count = 2;
}
else if (TARGET_AVX512BW && TARGET_AVX512VL)
- {
- count = 3;
- return ix86_vec_cost (mode, cost->sse_op * count);
- }
+ return ix86_vec_cost (mode, cost->sse_op * 4);
else if (TARGET_SSE4_1)
- count = 4;
- else if (code == ASHIFTRT)
count = 5;
+ else if (code == ASHIFTRT)
+ count = 6;
else
- count = 4;
+ count = 5;
return ix86_vec_cost (mode, cost->sse_op * count) + extra;
case V16QImode:
@@ -20663,7 +20694,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
}
else
{
- count = (code == ASHIFT) ? 2 : 3;
+ count = (code == ASHIFT) ? 3 : 4;
return ix86_vec_cost (mode, cost->sse_op * count);
}
}
@@ -20685,12 +20716,20 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
else
count = 2;
}
+ else if (TARGET_AVX512BW
+ && ((mode == V32QImode && !TARGET_PREFER_AVX256)
+ || (mode == V16QImode && TARGET_AVX512VL
+ && !TARGET_PREFER_AVX128)))
+ return ix86_vec_cost (mode, cost->sse_op * 4);
+ else if (TARGET_AVX2
+ && mode == V16QImode && !TARGET_PREFER_AVX128)
+ count = 6;
else if (TARGET_SSE4_1)
- count = 8;
- else if (code == ASHIFTRT)
count = 9;
+ else if (code == ASHIFTRT)
+ count = 10;
else
- count = 8;
+ count = 9;
return ix86_vec_cost (mode, cost->sse_op * count) + extra;
case V2DImode:
@@ -20704,6 +20743,8 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
count = TARGET_SSE4_2 ? 1 : 2;
else if (TARGET_XOP)
count = 2;
+ else if (TARGET_SSE4_1)
+ count = 3;
else
count = 4;
}
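
For a feel of what the variable-shift costing above is pricing, here is an illustrative snippet, again not part of the patch; the typedef name is made up, and GCC's vector extensions accept a vector shift count:

typedef unsigned char v16uqi __attribute__ ((vector_size (16)));

v16uqi
shl_v16uqi (v16uqi x, v16uqi n)
{
  /* With -mavx512bw -mavx512vl this is presumably expanded as two zero
     extends, a vpsllvw and a vpmovwb, matching the new sse_op * 4 branch;
     with plain -mavx2 the narrowing is done by a permute, hence count = 6.  */
  return x << n;
}

The final hunk updates the dg-final scan counts of the pr95488 testcase to match the new expansion.
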
@@ -1,8 +1,7 @@
/* PR target/pr95488 */
/* { dg-do compile } */
/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
-/* { dg-final { scan-assembler-times "vpmovzxbw" 4 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vpunpcklbw" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vpmovwb" 4 { target { ! ia32 } } } } */
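
A rough accounting of the updated counts, assuming the test compiles two 64-bit and two 128-bit char-vector multiplies (the test source itself is not shown here):

  vpmovzxbw: 4 functions * 2 zero-extended operands = 8  (was 4, plus 4 vpunpcklbw)
  vpmovwb:   4 functions * 1 narrowing insn = 4           (unchanged)
  vpmullw:   2 in ymm (128-bit vectors widened) + 2 in xmm (64-bit vectors widened)

With SSE4.1 available the partial-mode path no longer interleaves with vpunpcklbw, so that scan is dropped entirely.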