
[COMMITTED] i386: Use 2x-wider modes when emulating QImode vector instructions

Message ID CAFULd4bYim7cFJYrQrPZ3+n_HmqEyYsKejmy=fjaaRHcdD+JXg@mail.gmail.com
State New

Commit Message

Uros Bizjak May 25, 2023, 5:43 p.m. UTC
Rewrite ix86_expand_vecop_qihi2 to expand to 2x-wider (e.g. V16QI -> V16HImode)
instructions when available.  Currently, the compiler generates the following
assembly for V16QImode multiplication (-mavx2):

    vpunpcklbw      %xmm0, %xmm0, %xmm3
    vpunpcklbw      %xmm1, %xmm1, %xmm2
    vpunpckhbw      %xmm0, %xmm0, %xmm0
    movl    $255, %eax
    vpunpckhbw      %xmm1, %xmm1, %xmm1
    vpmullw %xmm3, %xmm2, %xmm2
    vmovd   %eax, %xmm3
    vpmullw %xmm0, %xmm1, %xmm1
    vpbroadcastw    %xmm3, %xmm3
    vpand   %xmm2, %xmm3, %xmm0
    vpand   %xmm1, %xmm3, %xmm3
    vpackuswb       %xmm3, %xmm0, %xmm0

and only with -mavx512bw -mavx512vl does it generate:

    vpmovzxbw       %xmm1, %ymm1
    vpmovzxbw       %xmm0, %ymm0
    vpmullw %ymm1, %ymm0, %ymm0
    vpmovwb %ymm0, %xmm0

The patched compiler generates more efficient code that performs the
multiplication in a 2x-wider mode even when the missing truncate instruction
has to be emulated with a permutation (-mavx2):

    vpmovzxbw       %xmm0, %ymm0
    vpmovzxbw       %xmm1, %ymm1
    movl    $255, %eax
    vpmullw %ymm1, %ymm0, %ymm1
    vmovd   %eax, %xmm0
    vpbroadcastw    %xmm0, %ymm0
    vpand   %ymm1, %ymm0, %ymm0
    vpackuswb       %ymm0, %ymm0, %ymm0
    vpermq  $216, %ymm0, %ymm0
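
For reference, a minimal testcase of the kind that exercises this path (my
own illustration, not taken from the patch or the testsuite) is the byte
loop below; compiled with e.g. -O3 -mavx2 it should be vectorized into a
V16QImode multiplication and expanded through the 2x-wider V16HImode path
shown above:

    /* Hypothetical example, not part of the patch.  */
    void
    mul16 (unsigned char *restrict r, const unsigned char *restrict a,
           const unsigned char *restrict b)
    {
      for (int i = 0; i < 16; i++)
        r[i] = a[i] * b[i];
    }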

The patch also adjusts cost calculation of V*QImode emulations to account
for generation of 2x-wider mode instructions.
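
As a concrete reading of the new ix86_multiplication_cost code (my own
arithmetic, not part of the commit message): V16QImode multiplication with
-mavx2 but without AVX512BW/VL is now costed as

    cost->mulss * 1 + cost->sse_op * 6

instead of the previous cost->mulss * 2 + cost->sse_op * 8, reflecting that
a single 2x-wider vpmullw replaces the two word multiplies of the old
unpack-low/high emulation.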

gcc/ChangeLog:

    * config/i386/i386-expand.cc (ix86_expand_vecop_qihi2):
    Rewrite to expand to 2x-wider (e.g. V16QI -> V16HImode)
    instructions when available.  Emulate truncation via
    ix86_expand_vec_perm_const_1 when native truncate insn
    is not available.
    (ix86_expand_vecop_qihi_partial) <case MULT>: Use pmovzx
    when available.  Trivially rename some variables.
    (ix86_expand_vecop_qihi): Unconditionally call ix86_expand_vecop_qihi2.
    * config/i386/i386.cc (ix86_multiplication_cost): Rewrite cost
    calculation of V*QImode emulations to account for generation of
    2x-wider mode instructions.
    (ix86_shift_rotate_cost): Update cost calculation of V*QImode
    emulations to account for generation of 2x-wider mode instructions.

gcc/testsuite/ChangeLog:

    * gcc.target/i386/avx512vl-pr95488-1.c: Revert 2023-05-18 change.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.

Comments

Li, Pan2 via Gcc-patches May 26, 2023, 2:12 a.m. UTC | #1
> gcc/ChangeLog:
> 
>     * config/i386/i386-expand.cc (ix86_expand_vecop_qihi2):
>     Rewrite to expand to 2x-wider (e.g. V16QI -> V16HImode)
>     instructions when available.  Emulate truncation via
>     ix86_expand_vec_perm_const_1 when native truncate insn
>     is not available.
>     (ix86_expand_vecop_qihi_partial) <case MULT>: Use pmovzx
>     when available.  Trivially rename some variables.
>     (ix86_expand_vecop_qihi): Unconditionally call ix86_expand_vecop_qihi2.

Hi Uros,

I suppose you pushed the wrong patch to trunk.

On trunk, we see this:

@@ -23409,9 +23457,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
     return;

-  if (TARGET_AVX512BW
-      && VECTOR_MODE_P (GET_MODE (op2))
-      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+  if (0 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
     return;

   switch (qimode)

It should not be if (0 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))

The patch in this thread is correct, which is:

@@ -23409,9 +23457,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
     return;
 
-  if (TARGET_AVX512BW
-      && VECTOR_MODE_P (GET_MODE (op2))
-      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
     return;
 
   switch (qimode)

Thx,
Haochen

>     * config/i386/i386.cc (ix86_multiplication_cost): Rewrite cost
>     calculation of V*QImode emulations to account for generation of
>     2x-wider mode instructions.
>     (ix86_shift_rotate_cost): Update cost calculation of V*QImode
>     emulations to account for generation of 2x-wider mode instructions.
> 
> gcc/testsuite/ChangeLog:
> 
>     * gcc.target/i386/avx512vl-pr95488-1.c: Revert 2023-05-18 change.
> 
> Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
> 
> Uros.
Uros Bizjak May 26, 2023, 6:03 a.m. UTC | #2
On Fri, May 26, 2023 at 4:12 AM Jiang, Haochen <haochen.jiang@intel.com> wrote:
>
> > gcc/ChangeLog:
> >
> >     * config/i386/i386-expand.cc (ix86_expand_vecop_qihi2):
> >     Rewrite to expand to 2x-wider (e.g. V16QI -> V16HImode)
> >     instructions when available.  Emulate truncation via
> >     ix86_expand_vec_perm_const_1 when native truncate insn
> >     is not available.
> >     (ix86_expand_vecop_qihi_partial) <case MULT>: Use pmovzx
> >     when available.  Trivially rename some variables.
> >     (ix86_expand_vecop_qihi): Unconditionally call ix86_expand_vecop_qihi2.
>
> Hi Uros,
>
> I suppose you pushed the wrong patch to trunk.

Ouch...

I was preparing before/after asm sequences for the ChangeLog entry and
took a shortcut by disabling the call in my main devel worktree. I
forgot to remove it before committing...

Anyway, the regression test was done with the correct version (there
are several tests already present in the testsuite that would fail
otherwise). I fixed it with an obvious patch.

gcc/ChangeLog:

    * config/i386/i386-expand.cc (ix86_expand_vecop_qihi):
    Do not disable call to ix86_expand_vecop_qihi2.
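
For completeness, the follow-up fix presumably amounts to the hunk below;
this is my reconstruction from the hunks quoted earlier in the thread, not
a copy of the actual commit:

--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ ... @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-  if (0 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
     return;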

Thanks for spotting this oversight!

Uros.

> On trunk, we see this:
>
> @@ -23409,9 +23457,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
>        && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
>      return;
>
> -  if (TARGET_AVX512BW
> -      && VECTOR_MODE_P (GET_MODE (op2))
> -      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
> +  if (0 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
>      return;
>
>    switch (qimode)
>
> It should not be if (0 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
>
> The patch in this thread is correct, where is:
>
> @@ -23409,9 +23457,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
>        && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
>      return;
>
> -  if (TARGET_AVX512BW
> -      && VECTOR_MODE_P (GET_MODE (op2))
> -      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
> +  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
>      return;
>
>    switch (qimode)
>
> Thx,
> Haochen
>
> >     * config/i386/i386.cc (ix86_multiplication_cost): Rewrite cost
> >     calculation of V*QImode emulations to account for generation of
> >     2x-wider mode instructions.
> >     (ix86_shift_rotate_cost): Update cost calculation of V*QImode
> >     emulations to account for generation of 2x-wider mode instructions.
> >
> > gcc/testsuite/ChangeLog:
> >
> >     * gcc.target/i386/avx512vl-pr95488-1.c: Revert 2023-05-18 change.
> >
> > Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
> >
> > Uros.

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5a57be82e98..0d8953b8c75 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23106,68 +23106,6 @@  ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
   gcc_assert (ok);
 }
 
-/* This function is similar as ix86_expand_vecop_qihi,
-   but optimized under AVX512BW by using vpmovwb.
-   For example, optimize vector MUL generation like
-
-   vpmovzxbw ymm2, xmm0
-   vpmovzxbw ymm3, xmm1
-   vpmullw   ymm4, ymm2, ymm3
-   vpmovwb   xmm0, ymm4
-
-   it would take less instructions than ix86_expand_vecop_qihi.
-   Return true if success.  */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
-  machine_mode himode, qimode = GET_MODE (dest);
-  rtx hop1, hop2, hdest;
-  rtx (*gen_truncate)(rtx, rtx);
-  bool uns_p = (code == ASHIFTRT) ? false : true;
-
-  /* There are no V64HImode instructions.  */
-  if (qimode == V64QImode)
-    return false;
-
-  /* vpmovwb only available under AVX512BW.  */
-  if (!TARGET_AVX512BW)
-    return false;
-
-  if (qimode == V16QImode && !TARGET_AVX512VL)
-    return false;
-
-  /* Do not generate ymm/zmm instructions when
-     target prefers 128/256 bit vector width.  */
-  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
-      || (qimode == V32QImode && TARGET_PREFER_AVX256))
-    return false;
-
-  switch (qimode)
-    {
-    case E_V16QImode:
-      himode = V16HImode;
-      gen_truncate = gen_truncv16hiv16qi2;
-      break;
-    case E_V32QImode:
-      himode = V32HImode;
-      gen_truncate = gen_truncv32hiv32qi2;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-
-  hop1 = gen_reg_rtx (himode);
-  hop2 = gen_reg_rtx (himode);
-  hdest = gen_reg_rtx (himode);
-  emit_insn (gen_extend_insn (hop1, op1, himode, qimode, uns_p));
-  emit_insn (gen_extend_insn (hop2, op2, himode, qimode, uns_p));
-  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
-						      hop1, hop2)));
-  emit_insn (gen_truncate (dest, hdest));
-  return true;
-}
-
 /* Expand a vector operation shift by constant for a V*QImode in terms of the
    same operation on V*HImode. Return true if success. */
 static bool
@@ -23272,9 +23210,9 @@  void
 ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
 {
   machine_mode qimode = GET_MODE (dest);
-  rtx qop1, qop2, hop1, hop2, qdest, hres;
+  rtx qop1, qop2, hop1, hop2, qdest, hdest;
   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
-  bool uns_p = true;
+  bool uns_p = code != ASHIFTRT;
 
   switch (qimode)
     {
@@ -23306,24 +23244,25 @@  ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
     {
     case MULT:
       gcc_assert (op2vec);
-      /* Unpack data such that we've got a source byte in each low byte of
-	 each word.  We don't care what goes into the high byte of each word.
-	 Rather than trying to get zero in there, most convenient is to let
-	 it be a copy of the low byte.  */
-      hop1 = copy_to_reg (qop1);
-      hop2 = copy_to_reg (qop2);
-      emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
-      emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
-      break;
-
-    case ASHIFTRT:
-      uns_p = false;
+      if (!TARGET_SSE4_1)
+	{
+	  /* Unpack data such that we've got a source byte in each low byte
+	     of each word.  We don't care what goes into the high byte of
+	     each word.  Rather than trying to get zero in there, most
+	     convenient is to let it be a copy of the low byte.  */
+	  hop1 = copy_to_reg (qop1);
+	  hop2 = copy_to_reg (qop2);
+	  emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+	  emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+	  break;
+	}
       /* FALLTHRU */
     case ASHIFT:
+    case ASHIFTRT:
     case LSHIFTRT:
       hop1 = gen_reg_rtx (V8HImode);
       ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
-      /* vashr/vlshr/vashl  */
+      /* mult/vashr/vlshr/vashl  */
       if (op2vec)
 	{
 	  hop2 = gen_reg_rtx (V8HImode);
@@ -23340,14 +23279,14 @@  ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   if (code != MULT && op2vec)
     {
       /* Expand vashr/vlshr/vashl.  */
-      hres = gen_reg_rtx (V8HImode);
-      emit_insn (gen_rtx_SET (hres,
+      hdest = gen_reg_rtx (V8HImode);
+      emit_insn (gen_rtx_SET (hdest,
 			      simplify_gen_binary (code, V8HImode,
 						   hop1, hop2)));
     }
   else
     /* Expand mult/ashr/lshr/ashl.  */
-    hres = expand_simple_binop (V8HImode, code, hop1, hop2,
+    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
 				NULL_RTX, 1, OPTAB_DIRECT);
 
   if (TARGET_AVX512BW && TARGET_AVX512VL)
@@ -23357,19 +23296,18 @@  ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       else
 	qdest = gen_reg_rtx (V8QImode);
 
-      emit_insn (gen_truncv8hiv8qi2 (qdest, hres));
+      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
     }
   else
     {
       struct expand_vec_perm_d d;
-      rtx qres = gen_lowpart (V16QImode, hres);
+      rtx qres = gen_lowpart (V16QImode, hdest);
       bool ok;
       int i;
 
       /* Merge the data back into the right place.  */
       d.target = qdest;
-      d.op0 = qres;
-      d.op1 = qres;
+      d.op0 = d.op1 = qres;
       d.vmode = V16QImode;
       d.nelt = 16;
       d.one_operand_p = false;
@@ -23386,6 +23324,116 @@  ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
     emit_move_insn (dest, gen_lowpart (qimode, qdest));
 }
 
+/* Emit instruction in 2x wider mode.  For example, optimize
+   vector MUL generation like
+
+   vpmovzxbw ymm2, xmm0
+   vpmovzxbw ymm3, xmm1
+   vpmullw   ymm4, ymm2, ymm3
+   vpmovwb   xmm0, ymm4
+
+   it would take less instructions than ix86_expand_vecop_qihi.
+   Return true if success.  */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  machine_mode wqimode;
+  rtx qop1, qop2, hop1, hop2, hdest;
+  rtx (*gen_truncate)(rtx, rtx) = NULL;
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+  bool uns_p = code != ASHIFTRT;
+
+  if ((qimode == V16QImode && !TARGET_AVX2)
+      || (qimode == V32QImode && !TARGET_AVX512BW)
+      /* There are no V64HImode instructions.  */
+      || qimode == V64QImode)
+     return false;
+
+  /* Do not generate ymm/zmm instructions when
+     target prefers 128/256 bit vector width.  */
+  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+      || (qimode == V32QImode && TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V16QImode:
+      himode = V16HImode;
+      if (TARGET_AVX512VL)
+	gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+  if (op2vec)
+    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+  else
+    qop2 = op2;
+
+  hop1 = gen_reg_rtx (himode);
+  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+  if (op2vec)
+    {
+      hop2 = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+    }
+  else
+    hop2 = qop2;
+
+    if (code != MULT && op2vec)
+      {
+	/* Expand vashr/vlshr/vashl.  */
+	hdest = gen_reg_rtx (himode);
+	emit_insn (gen_rtx_SET (hdest,
+				simplify_gen_binary (code, himode,
+						     hop1, hop2)));
+      }
+    else
+      /* Expand mult/ashr/lshr/ashl.  */
+      hdest = expand_simple_binop (himode, code, hop1, hop2,
+				   NULL_RTX, 1, OPTAB_DIRECT);
+
+  if (gen_truncate)
+    emit_insn (gen_truncate (dest, hdest));
+  else
+    {
+      struct expand_vec_perm_d d;
+      rtx wqdest = gen_reg_rtx (wqimode);
+      rtx wqres = gen_lowpart (wqimode, hdest);
+      bool ok;
+      int i;
+
+      /* Merge the data back into the right place.  */
+      d.target = wqdest;
+      d.op0 = d.op1 = wqres;
+      d.vmode = wqimode;
+      d.nelt = GET_MODE_NUNITS (wqimode);
+      d.one_operand_p = false;
+      d.testing_p = false;
+
+      for (i = 0; i < d.nelt; ++i)
+	d.perm[i] = i * 2;
+
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      gcc_assert (ok);
+
+      emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+    }
+
+  return true;
+}
+
 /* Expand a vector operation CODE for a V*QImode in terms of the
    same operation on V*HImode.  */
 
@@ -23400,7 +23448,7 @@  ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   struct expand_vec_perm_d d;
   bool full_interleave = true;
-  bool uns_p = true;
+  bool uns_p = code != ASHIFTRT;
   bool ok;
   int i;
 
@@ -23409,9 +23457,7 @@  ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
     return;
 
-  if (TARGET_AVX512BW
-      && VECTOR_MODE_P (GET_MODE (op2))
-      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
     return;
 
   switch (qimode)
@@ -23468,10 +23514,8 @@  ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       emit_insn (gen_ih (op1_h, op1, op1));
       break;
 
-    case ASHIFTRT:
-      uns_p = false;
-      /* FALLTHRU */
     case ASHIFT:
+    case ASHIFTRT:
     case LSHIFTRT:
       op1_l = gen_reg_rtx (himode);
       op1_h = gen_reg_rtx (himode);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 202abf0b39c..e548d658828 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20497,75 +20497,109 @@  ix86_multiplication_cost (const struct processor_costs *cost,
     return  ix86_vec_cost (mode,
 			   inner_mode == DFmode ? cost->mulsd : cost->mulss);
   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-    switch (mode)
-      {
-      case V4QImode:
-      case V8QImode:
-	/* Partial V*QImode is emulated with 4-6 insns.  */
-	if (TARGET_AVX512BW && TARGET_AVX512VL)
-	  return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
-	else if (TARGET_AVX2)
-	  return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 5);
-	else if (TARGET_XOP)
-	  return (ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3)
-		  + cost->sse_load[2]);
-	else
-	  return (ix86_vec_cost (mode, cost->mulss + cost->sse_op * 4)
-		  + cost->sse_load[2]);
-
-      case V16QImode:
-	/* V*QImode is emulated with 4-11 insns.  */
-	if (TARGET_AVX512BW && TARGET_AVX512VL)
-	  return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
-	else if (TARGET_AVX2)
-	  return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 8);
-	else if (TARGET_XOP)
-	  return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5)
-		  + cost->sse_load[2]);
-	else
-	  return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 7)
-		  + cost->sse_load[2]);
+    {
+      int nmults, nops;
+      /* Cost of reading the memory.  */
+      int extra;
 
-      case V32QImode:
-	if (TARGET_AVX512BW)
-	  return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
-	else
-	  return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 7)
-		  + cost->sse_load[3] * 2);
-
-      case V64QImode:
-	return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 9)
-		+ cost->sse_load[3] * 2
-		+ cost->sse_load[4] * 2);
-
-      case V4SImode:
-	/* pmulld is used in this case. No emulation is needed.  */
-	if (TARGET_SSE4_1)
-	  goto do_native;
-	/* V4SImode is emulated with 7 insns.  */
-	else
-	  return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
-
-      case V2DImode:
-      case V4DImode:
-	/* vpmullq is used in this case. No emulation is needed.  */
-	if (TARGET_AVX512DQ && TARGET_AVX512VL)
-	  goto do_native;
-	/* V*DImode is emulated with 6-8 insns.  */
-	else if (TARGET_XOP && mode == V2DImode)
-	  return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 4);
-	/* FALLTHRU */
-      case V8DImode:
-	/* vpmullq is used in this case. No emulation is needed.  */
-	if (TARGET_AVX512DQ && mode == V8DImode)
-	  goto do_native;
-	else
-	  return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
+      switch (mode)
+	{
+	case V4QImode:
+	case V8QImode:
+	  /* Partial V*QImode is emulated with 4-6 insns.  */
+	  nmults = 1;
+	  nops = 3;
+	  extra = 0;
 
-      default:
-      do_native:
-	return ix86_vec_cost (mode, cost->mulss);
-      }
+	  if (TARGET_AVX512BW && TARGET_AVX512VL)
+	    ;
+	  else if (TARGET_AVX2)
+	    nops += 2;
+	  else if (TARGET_XOP)
+	    extra += cost->sse_load[2];
+	  else
+	    {
+	      nops += 1;
+	      extra += cost->sse_load[2];
+	    }
+	  goto do_qimode;
+
+	case V16QImode:
+	  /* V*QImode is emulated with 4-11 insns.  */
+	  nmults = 1;
+	  nops = 3;
+	  extra = 0;
+
+	  if (TARGET_AVX2 && !TARGET_PREFER_AVX128)
+	    {
+	      if (!(TARGET_AVX512BW && TARGET_AVX512VL))
+		nops += 3;
+	    }
+	  else if (TARGET_XOP)
+	    {
+	      nmults += 1;
+	      nops += 2;
+	      extra += cost->sse_load[2];
+	    }
+	  else
+	    {
+	      nmults += 1;
+	      nops += 4;
+	      extra += cost->sse_load[2];
+	    }
+	  goto do_qimode;
+
+	case V32QImode:
+	  nmults = 1;
+	  nops = 3;
+	  extra = 0;
+
+	  if (!TARGET_AVX512BW || TARGET_PREFER_AVX256)
+	    {
+	      nmults += 1;
+	      nops += 4;
+	      extra += cost->sse_load[3] * 2;
+	    }
+	  goto do_qimode;
+
+	case V64QImode:
+	  nmults = 2;
+	  nops = 9;
+	  extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2;
+
+	do_qimode:
+	  return ix86_vec_cost (mode, cost->mulss * nmults
+				+ cost->sse_op * nops) + extra;
+
+	case V4SImode:
+	  /* pmulld is used in this case. No emulation is needed.  */
+	  if (TARGET_SSE4_1)
+	    goto do_native;
+	  /* V4SImode is emulated with 7 insns.  */
+	  else
+	    return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
+
+	case V2DImode:
+	case V4DImode:
+	  /* vpmullq is used in this case. No emulation is needed.  */
+	  if (TARGET_AVX512DQ && TARGET_AVX512VL)
+	    goto do_native;
+	  /* V*DImode is emulated with 6-8 insns.  */
+	  else if (TARGET_XOP && mode == V2DImode)
+	    return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 4);
+	  /* FALLTHRU */
+	case V8DImode:
+	  /* vpmullq is used in this case. No emulation is needed.  */
+	  if (TARGET_AVX512DQ && mode == V8DImode)
+	    goto do_native;
+	  else
+	    return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
+
+	default:
+	do_native:
+	  return ix86_vec_cost (mode, cost->mulss);
+	}
+    }
   else
     return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
 }
@@ -20637,16 +20671,13 @@  ix86_shift_rotate_cost (const struct processor_costs *cost,
 		count = 2;
 	    }
 	  else if (TARGET_AVX512BW && TARGET_AVX512VL)
-	    {
-	      count = 3;
-	      return ix86_vec_cost (mode, cost->sse_op * count);
-	    }
+	    return ix86_vec_cost (mode, cost->sse_op * 4);
 	  else if (TARGET_SSE4_1)
-	    count = 4;
-	  else if (code == ASHIFTRT)
 	    count = 5;
+	  else if (code == ASHIFTRT)
+	    count = 6;
 	  else
-	    count = 4;
+	    count = 5;
 	  return ix86_vec_cost (mode, cost->sse_op * count) + extra;
 
 	case V16QImode:
@@ -20663,7 +20694,7 @@  ix86_shift_rotate_cost (const struct processor_costs *cost,
 		}
 	      else
 		{
-		  count = (code == ASHIFT) ? 2 : 3;
+		  count = (code == ASHIFT) ? 3 : 4;
 		  return ix86_vec_cost (mode, cost->sse_op * count);
 		}
 	    }
@@ -20685,12 +20716,20 @@  ix86_shift_rotate_cost (const struct processor_costs *cost,
 	      else
 		count = 2;
 	    }
+	  else if (TARGET_AVX512BW
+		   && ((mode == V32QImode && !TARGET_PREFER_AVX256)
+		       || (mode == V16QImode && TARGET_AVX512VL
+			   && !TARGET_PREFER_AVX128)))
+	    return ix86_vec_cost (mode, cost->sse_op * 4);
+	  else if (TARGET_AVX2
+		   && mode == V16QImode && !TARGET_PREFER_AVX128)
+	    count = 6;
 	  else if (TARGET_SSE4_1)
-	    count = 8;
-	  else if (code == ASHIFTRT)
 	    count = 9;
+	  else if (code == ASHIFTRT)
+	    count = 10;
 	  else
-	    count = 8;
+	    count = 9;
 	  return ix86_vec_cost (mode, cost->sse_op * count) + extra;
 
 	case V2DImode:
@@ -20704,6 +20743,8 @@  ix86_shift_rotate_cost (const struct processor_costs *cost,
 		    count = TARGET_SSE4_2 ? 1 : 2;
 		  else if (TARGET_XOP)
 		    count = 2;
+		  else if (TARGET_SSE4_1)
+		    count = 3;
 		  else
 		    count = 4;
 		}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
index 5e9f4f2805c..dc684a167c8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
@@ -1,8 +1,7 @@ 
 /* PR target/pr95488  */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512bw -mavx512vl" }  */
-/* { dg-final { scan-assembler-times "vpmovzxbw" 4 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vpunpcklbw" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
 /* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmovwb" 4 { target { ! ia32 } } } } */