diff mbox series

RISC-V: Finalize VSETVL PASS implementation

Message ID 20230118032434.71273-1-juzhe.zhong@rivai.ai
State New
Headers show
Series RISC-V: Finalize VSETVL PASS implementation | expand

Commit Message

钟居哲 Jan. 18, 2023, 3:24 a.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

gcc/ChangeLog:

        * config/riscv/riscv-vsetvl.cc (vsetvl_insn_p): Add condition to avoid ICE.
        (vsetvl_discard_result_insn_p): New function.
        (reg_killed_by_bb_p): Rename to find_reg_killed_by.
        (find_reg_killed_by): New name.
        (get_vl): Allow it to be called by more functions.
        (has_vsetvl_killed_avl_p): Add condition.
        (get_avl): Allow it to be called by more functions.
        (insn_should_be_added_p): New function.
        (get_all_nonphi_defs): Refine function.
        (get_all_sets): Ditto.
        (get_same_bb_set): New function.
        (any_insn_in_bb_p): Ditto.
        (any_set_in_bb_p): Ditto.
        (get_vl_vtype_info): Add VLMAX forward optimization.
        (source_equal_p): Fix issues.
        (extract_single_source): Refine.
        (avl_info::multiple_source_equal_p): New function.
        (avl_info::operator==): Adjust for final version.
        (vl_vtype_info::operator==): Ditto.
        (vl_vtype_info::same_avl_p): Ditto.
        (vector_insn_info::parse_insn): Ditto.
        (vector_insn_info::available_p): New function.
        (vector_insn_info::merge): Adjust for final version.
        (vector_insn_info::dump): Add hard_empty.
        (pass_vsetvl::hard_empty_block_p): New function.
        (pass_vsetvl::backward_demand_fusion): Adjust for final version.
        (pass_vsetvl::forward_demand_fusion): Ditto.
        (pass_vsetvl::demand_fusion): Ditto.
        (pass_vsetvl::cleanup_illegal_dirty_blocks): New function.
        (pass_vsetvl::compute_local_properties): Adjust for final version.
        (pass_vsetvl::can_refine_vsetvl_p): Ditto.
        (pass_vsetvl::refine_vsetvls): Ditto.
        (pass_vsetvl::commit_vsetvls): Ditto.
        (pass_vsetvl::propagate_avl): New function.
        (pass_vsetvl::lazy_vsetvl): Adjust for new version.
        * config/riscv/riscv-vsetvl.h (enum def_type): New enum.

---
 gcc/config/riscv/riscv-vsetvl.cc | 930 +++++++++++++++++++++++--------
 gcc/config/riscv/riscv-vsetvl.h  |  30 +-
 2 files changed, 737 insertions(+), 223 deletions(-)

Comments

Kito Cheng Jan. 27, 2023, 12:31 p.m. UTC | #1
committed, thanks!

On Wed, Jan 18, 2023 at 11:25 AM <juzhe.zhong@rivai.ai> wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vsetvl.cc (vsetvl_insn_p): Add condition to
> avoid ICE.
>         (vsetvl_discard_result_insn_p): New function.
>         (reg_killed_by_bb_p): Rename to find_reg_killed_by.
>         (find_reg_killed_by): New name.
>         (get_vl): Allow it to be called by more functions.
>         (has_vsetvl_killed_avl_p): Add condition.
>         (get_avl): Allow it to be called by more functions.
>         (insn_should_be_added_p): New function.
>         (get_all_nonphi_defs): Refine function.
>         (get_all_sets): Ditto.
>         (get_same_bb_set): New function.
>         (any_insn_in_bb_p): Ditto.
>         (any_set_in_bb_p): Ditto.
>         (get_vl_vtype_info): Add VLMAX forward optimization.
>         (source_equal_p): Fix issues.
>         (extract_single_source): Refine.
>         (avl_info::multiple_source_equal_p): New function.
>         (avl_info::operator==): Adjust for final version.
>         (vl_vtype_info::operator==): Ditto.
>         (vl_vtype_info::same_avl_p): Ditto.
>         (vector_insn_info::parse_insn): Ditto.
>         (vector_insn_info::available_p): New function.
>         (vector_insn_info::merge): Adjust for final version.
>         (vector_insn_info::dump): Add hard_empty.
>         (pass_vsetvl::hard_empty_block_p): New function.
>         (pass_vsetvl::backward_demand_fusion): Adjust for final version.
>         (pass_vsetvl::forward_demand_fusion): Ditto.
>         (pass_vsetvl::demand_fusion): Ditto.
>         (pass_vsetvl::cleanup_illegal_dirty_blocks): New function.
>         (pass_vsetvl::compute_local_properties): Adjust for final version.
>         (pass_vsetvl::can_refine_vsetvl_p): Ditto.
>         (pass_vsetvl::refine_vsetvls): Ditto.
>         (pass_vsetvl::commit_vsetvls): Ditto.
>         (pass_vsetvl::propagate_avl): New function.
>         (pass_vsetvl::lazy_vsetvl): Adjust for new version.
>         * config/riscv/riscv-vsetvl.h (enum def_type): New enum.
>
> ---
>  gcc/config/riscv/riscv-vsetvl.cc | 930 +++++++++++++++++++++++--------
>  gcc/config/riscv/riscv-vsetvl.h  |  30 +-
>  2 files changed, 737 insertions(+), 223 deletions(-)
>
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc
> b/gcc/config/riscv/riscv-vsetvl.cc
> index b33c198bbd6..253bfc7b210 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -54,6 +54,8 @@ along with GCC; see the file COPYING3.  If not see
>         used any more and VL operand of VSETVL instruction if it is not
> used by
>         any non-debug instructions.
>
> +    -  Phase 6 - Propagate AVL between vsetvl instructions.
> +
>      Implementation:
>
>      -  The subroutine of optimize == 0 is simple_vsetvl.
> @@ -175,8 +177,20 @@ vector_config_insn_p (rtx_insn *rinsn)
>  static bool
>  vsetvl_insn_p (rtx_insn *rinsn)
>  {
> +  if (!vector_config_insn_p (rinsn))
> +    return false;
>    return (INSN_CODE (rinsn) == CODE_FOR_vsetvldi
> -        || INSN_CODE (rinsn) == CODE_FOR_vsetvlsi);
> +         || INSN_CODE (rinsn) == CODE_FOR_vsetvlsi);
> +}
> +
> +/* Return true if it is vsetvl zero, rs1.  */
> +static bool
> +vsetvl_discard_result_insn_p (rtx_insn *rinsn)
> +{
> +  if (!vector_config_insn_p (rinsn))
> +    return false;
> +  return (INSN_CODE (rinsn) == CODE_FOR_vsetvl_discard_resultdi
> +         || INSN_CODE (rinsn) == CODE_FOR_vsetvl_discard_resultsi);
>  }
>
>  static bool
> @@ -191,15 +205,27 @@ before_p (const insn_info *insn1, const insn_info
> *insn2)
>    return insn1->compare_with (insn2) < 0;
>  }
>
> -static bool
> -reg_killed_by_bb_p (const bb_info *bb, rtx x)
> +static insn_info *
> +find_reg_killed_by (const bb_info *bb, rtx x)
>  {
> -  if (!x || vlmax_avl_p (x))
> -    return false;
> -  for (const insn_info *insn : bb->real_nondebug_insns ())
> +  if (!x || vlmax_avl_p (x) || !REG_P (x))
> +    return nullptr;
> +  for (insn_info *insn : bb->reverse_real_nondebug_insns ())
>      if (find_access (insn->defs (), REGNO (x)))
> -      return true;
> -  return false;
> +      return insn;
> +  return nullptr;
> +}
> +
> +/* Helper function to get VL operand.  */
> +static rtx
> +get_vl (rtx_insn *rinsn)
> +{
> +  if (has_vl_op (rinsn))
> +    {
> +      extract_insn_cached (rinsn);
> +      return recog_data.operand[get_attr_vl_op_idx (rinsn)];
> +    }
> +  return SET_DEST (XVECEXP (PATTERN (rinsn), 0, 0));
>  }
>
>  static bool
> @@ -208,6 +234,9 @@ has_vsetvl_killed_avl_p (const bb_info *bb, const
> vector_insn_info &info)
>    if (info.dirty_with_killed_avl_p ())
>      {
>        rtx avl = info.get_avl ();
> +      if (vlmax_avl_p (avl))
> +       return find_reg_killed_by (bb, get_vl (info.get_insn ()->rtl ()))
> +              != nullptr;
>        for (const insn_info *insn : bb->reverse_real_nondebug_insns ())
>         {
>           def_info *def = find_access (insn->defs (), REGNO (avl));
> @@ -229,18 +258,6 @@ has_vsetvl_killed_avl_p (const bb_info *bb, const
> vector_insn_info &info)
>    return false;
>  }
>
> -/* Helper function to get VL operand.  */
> -static rtx
> -get_vl (rtx_insn *rinsn)
> -{
> -  if (has_vl_op (rinsn))
> -    {
> -      extract_insn_cached (rinsn);
> -      return recog_data.operand[get_attr_vl_op_idx (rinsn)];
> -    }
> -  return SET_DEST (XVECEXP (PATTERN (rinsn), 0, 0));
> -}
> -
>  /* An "anticipatable occurrence" is one that is the first occurrence in
> the
>     basic block, the operands are not modified in the basic block prior
>     to the occurrence and the output is not used between the start of
> @@ -419,30 +436,30 @@ backward_propagate_worthwhile_p (const basic_block
> cfg_bb,
>    return true;
>  }
>
> -/* Helper function to get AVL operand.  */
> -static rtx
> -get_avl (rtx_insn *rinsn)
> +static bool
> +insn_should_be_added_p (const insn_info *insn, unsigned int types)
>  {
> -  if (vsetvl_insn_p (rinsn))
> -    return XVECEXP (SET_SRC (XVECEXP (PATTERN (rinsn), 0, 0)), 0, 0);
> -
> -  if (!has_vl_op (rinsn))
> -    return NULL_RTX;
> -  if (get_attr_avl_type (rinsn) == VLMAX)
> -    return RVV_VLMAX;
> -  extract_insn_cached (rinsn);
> -  return recog_data.operand[get_attr_vl_op_idx (rinsn)];
> +  if (insn->is_real () && (types & REAL_SET))
> +    return true;
> +  if (insn->is_phi () && (types & PHI_SET))
> +    return true;
> +  if (insn->is_bb_head () && (types & BB_HEAD_SET))
> +    return true;
> +  if (insn->is_bb_end () && (types & BB_END_SET))
> +    return true;
> +  return false;
>  }
>
> -/* Recursively find all real define instructions if it is a real
> instruction. */
> -static hash_set<insn_info *>
> -get_all_nonphi_defs (phi_info *phi)
> +/* Recursively find all define instructions. The kind of instruction is
> +   specified by the DEF_TYPE.  */
> +static hash_set<set_info *>
> +get_all_sets (phi_info *phi, unsigned int types)
>  {
> -  hash_set<insn_info *> insns;
> +  hash_set<set_info *> insns;
>    auto_vec<phi_info *> work_list;
>    hash_set<phi_info *> visited_list;
>    if (!phi)
> -    return insns;
> +    return hash_set<set_info *> ();
>    work_list.safe_push (phi);
>
>    while (!work_list.is_empty ())
> @@ -452,20 +469,17 @@ get_all_nonphi_defs (phi_info *phi)
>        for (use_info *use : phi->inputs ())
>         {
>           def_info *def = use->def ();
> -         if (!def)
> -           {
> -             /* if def is null, treat undefined */
> -             insns.empty ();
> -             return insns;
> -           }
> +         set_info *set = safe_dyn_cast<set_info *> (def);
> +         if (!set)
> +           return hash_set<set_info *> ();
>
> -         gcc_assert (!def->insn ()->is_debug_insn ());
> +         gcc_assert (!set->insn ()->is_debug_insn ());
>
> -         if (!def->insn ()->is_phi ())
> -           insns.add (def->insn ());
> -         if (def->insn ()->is_phi ())
> +         if (insn_should_be_added_p (set->insn (), types))
> +           insns.add (set);
> +         if (set->insn ()->is_phi ())
>             {
> -             phi_info *new_phi = as_a<phi_info *> (def);
> +             phi_info *new_phi = as_a<phi_info *> (set);
>               if (!visited_list.contains (new_phi))
>                 work_list.safe_push (new_phi);
>             }
> @@ -474,6 +488,47 @@ get_all_nonphi_defs (phi_info *phi)
>    return insns;
>  }
>
> +static hash_set<set_info *>
> +get_all_sets (set_info *set, bool /* get_real_inst */ real_p,
> +             bool /*get_phi*/ phi_p, bool /* get_function_parameter*/
> param_p)
> +{
> +  if (real_p && phi_p && param_p)
> +    return get_all_sets (safe_dyn_cast<phi_info *> (set),
> +                        REAL_SET | PHI_SET | BB_HEAD_SET | BB_END_SET);
> +
> +  else if (real_p && param_p)
> +    return get_all_sets (safe_dyn_cast<phi_info *> (set),
> +                        REAL_SET | BB_HEAD_SET | BB_END_SET);
> +
> +  else if (real_p)
> +    return get_all_sets (safe_dyn_cast<phi_info *> (set), REAL_SET);
> +  return hash_set<set_info *> ();
> +}
> +
> +/* Helper function to get AVL operand.  */
> +static rtx
> +get_avl (rtx_insn *rinsn)
> +{
> +  if (vsetvl_insn_p (rinsn) || vsetvl_discard_result_insn_p (rinsn))
> +    return XVECEXP (SET_SRC (XVECEXP (PATTERN (rinsn), 0, 0)), 0, 0);
> +
> +  if (!has_vl_op (rinsn))
> +    return NULL_RTX;
> +  if (get_attr_avl_type (rinsn) == VLMAX)
> +    return RVV_VLMAX;
> +  extract_insn_cached (rinsn);
> +  return recog_data.operand[get_attr_vl_op_idx (rinsn)];
> +}
> +
> +static set_info *
> +get_same_bb_set (hash_set<set_info *> &sets, const basic_block cfg_bb)
> +{
> +  for (set_info *set : sets)
> +    if (set->bb ()->cfg_bb () == cfg_bb)
> +      return set;
> +  return nullptr;
> +}
> +
>  /* Recursively find all predecessor blocks for cfg_bb. */
>  static hash_set<basic_block>
>  get_all_predecessors (basic_block cfg_bb)
> @@ -501,10 +556,10 @@ get_all_predecessors (basic_block cfg_bb)
>
>  /* Return true if there is an INSN in insns staying in the block BB.  */
>  static bool
> -any_insn_in_bb_p (hash_set<insn_info *> insns, const bb_info *bb)
> +any_set_in_bb_p (hash_set<set_info *> sets, const bb_info *bb)
>  {
> -  for (const insn_info *insn : insns)
> -    if (insn->bb ()->index () == bb->index ())
> +  for (const set_info *set : sets)
> +    if (set->bb ()->index () == bb->index ())
>        return true;
>    return false;
>  }
> @@ -834,10 +889,6 @@ insert_insn_end_basic_block (rtx_insn *rinsn,
> basic_block cfg_bb)
>  static vl_vtype_info
>  get_vl_vtype_info (const insn_info *insn)
>  {
> -  if (vector_config_insn_p (insn->rtl ()))
> -    gcc_assert (vsetvl_insn_p (insn->rtl ())
> -               && "Can't handle X0, rs1 vsetvli yet");
> -
>    set_info *set = nullptr;
>    rtx avl = ::get_avl (insn->rtl ());
>    if (avl && REG_P (avl) && !vlmax_avl_p (avl))
> @@ -942,8 +993,12 @@ change_vsetvl_insn (const insn_info *insn, const
> vector_insn_info &info)
>  }
>
>  static bool
> -source_equal_p (rtx_insn *rinsn1, rtx_insn *rinsn2)
> +source_equal_p (insn_info *insn1, insn_info *insn2)
>  {
> +  if (!insn1 || !insn2)
> +    return false;
> +  rtx_insn *rinsn1 = insn1->rtl ();
> +  rtx_insn *rinsn2 = insn2->rtl ();
>    if (!rinsn1 || !rinsn2)
>      return false;
>    rtx note1 = find_reg_equal_equiv_note (rinsn1);
> @@ -953,40 +1008,70 @@ source_equal_p (rtx_insn *rinsn1, rtx_insn *rinsn2)
>
>    if (note1 && note2 && rtx_equal_p (note1, note2))
>      return true;
> -  if (single_set1 && single_set2
> -      && rtx_equal_p (SET_SRC (single_set1), SET_SRC (single_set2)))
> -    return true;
> -  return false;
> +
> +  /* Since vsetvl instruction is not single SET.
> +     We handle this case specially here.  */
> +  if (vsetvl_insn_p (insn1->rtl ()) && vsetvl_insn_p (insn2->rtl ()))
> +    {
> +      /* For example:
> +          vsetvl1 a6,a5,e32m1
> +          RVV 1 (use a6 as AVL)
> +          vsetvl2 a5,a5,e8mf4
> +          RVV 2 (use a5 as AVL)
> +        We consider AVL of RVV 1 and RVV 2 are same so that we can
> +        gain more optimization opportunities.
> +
> +        Note: insn1_info.compatible_avl_p (insn2_info)
> +        will make sure there is no instruction between vsetvl1 and vsetvl2
> +        modify a5 since their def will be different if there is
> instruction
> +        modify a5 and compatible_avl_p will return false.  */
> +      vector_insn_info insn1_info, insn2_info;
> +      insn1_info.parse_insn (insn1);
> +      insn2_info.parse_insn (insn2);
> +      if (insn1_info.same_vlmax_p (insn2_info)
> +         && insn1_info.compatible_avl_p (insn2_info))
> +       return true;
> +    }
> +
> +  /* We only handle AVL is set by instructions with no side effects.  */
> +  if (!single_set1 || !single_set2)
> +    return false;
> +  if (!rtx_equal_p (SET_SRC (single_set1), SET_SRC (single_set2)))
> +    return false;
> +  gcc_assert (insn1->uses ().size () == insn2->uses ().size ());
> +  for (size_t i = 0; i < insn1->uses ().size (); i++)
> +    if (insn1->uses ()[i] != insn2->uses ()[i])
> +      return false;
> +  return true;
>  }
>
>  /* Helper function to get single same real RTL source.
>     return NULL if it is not a single real RTL source.  */
> -static rtx_insn *
> +static insn_info *
>  extract_single_source (set_info *set)
>  {
>    if (!set)
>      return nullptr;
>    if (set->insn ()->is_real ())
> -    return set->insn ()->rtl ();
> +    return set->insn ();
>    if (!set->insn ()->is_phi ())
>      return nullptr;
> -  phi_info *phi = safe_dyn_cast<phi_info *> (set);
> -  hash_set<insn_info *> insns = get_all_nonphi_defs (phi);
> +  hash_set<set_info *> sets = get_all_sets (set, true, false, true);
>
> -  insn_info *first_insn = (*insns.begin ());
> +  insn_info *first_insn = (*sets.begin ())->insn ();
>    if (first_insn->is_artificial ())
>      return nullptr;
> -  for (const insn_info *insn : insns)
> +  for (const set_info *set : sets)
>      {
>        /* If there is a head or end insn, we conservative return
>          NULL so that VSETVL PASS will insert vsetvl directly.  */
> -      if (insn->is_artificial ())
> +      if (set->insn ()->is_artificial ())
>         return nullptr;
> -      if (!source_equal_p (insn->rtl (), first_insn->rtl ()))
> +      if (!source_equal_p (set->insn (), first_insn))
>         return nullptr;
>      }
>
> -  return (*insns.begin ())->rtl ();
> +  return first_insn;
>  }
>
>  avl_info::avl_info (const avl_info &other)
> @@ -1004,9 +1089,82 @@ avl_info::single_source_equal_p (const avl_info
> &other) const
>  {
>    set_info *set1 = m_source;
>    set_info *set2 = other.get_source ();
> -  rtx_insn *rinsn1 = extract_single_source (set1);
> -  rtx_insn *rinsn2 = extract_single_source (set2);
> -  return source_equal_p (rinsn1, rinsn2);
> +  insn_info *insn1 = extract_single_source (set1);
> +  insn_info *insn2 = extract_single_source (set2);
> +  if (!insn1 || !insn2)
> +    return false;
> +  return source_equal_p (insn1, insn2);
> +}
> +
> +bool
> +avl_info::multiple_source_equal_p (const avl_info &other) const
> +{
> +  /* TODO: We don't do too much optimization here since it's
> +     too complicated in case of analyzing the PHI node.
> +
> +     For example:
> +       void f (void * restrict in, void * restrict out, int n, int m, int
> cond)
> +       {
> +         size_t vl;
> +         switch (cond)
> +         {
> +         case 1:
> +           vl = 100;
> +           break;
> +         case 2:
> +           vl = *(size_t*)(in + 100);
> +           break;
> +         case 3:
> +           {
> +             size_t new_vl = *(size_t*)(in + 500);
> +             size_t new_vl2 = *(size_t*)(in + 600);
> +             vl = new_vl + new_vl2 + 777;
> +             break;
> +           }
> +         default:
> +           vl = 4000;
> +           break;
> +         }
> +         for (size_t i = 0; i < n; i++)
> +           {
> +             vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
> +             __riscv_vse8_v_i8mf8 (out + i, v, vl);
> +
> +             vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 100,
> vl);
> +             __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
> +           }
> +
> +         size_t vl2;
> +         switch (cond)
> +         {
> +         case 1:
> +           vl2 = 100;
> +           break;
> +         case 2:
> +           vl2 = *(size_t*)(in + 100);
> +           break;
> +         case 3:
> +           {
> +             size_t new_vl = *(size_t*)(in + 500);
> +             size_t new_vl2 = *(size_t*)(in + 600);
> +             vl2 = new_vl + new_vl2 + 777;
> +             break;
> +           }
> +         default:
> +           vl2 = 4000;
> +           break;
> +         }
> +         for (size_t i = 0; i < m; i++)
> +           {
> +             vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl2);
> +             __riscv_vse8_v_i8mf8 (out + i + 300, v, vl2);
> +             vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 200,
> vl2);
> +             __riscv_vse8_v_i8mf8 (out + i + 200, v2, vl2);
> +           }
> +       }
> +     Such a case may not be necessary to optimize since the code defining
> +     vl and vl2 is redundant.  */
> +  return m_source == other.get_source ();
>  }
>
>  avl_info &
> @@ -1025,11 +1183,6 @@ avl_info::operator== (const avl_info &other) const
>    if (!other.get_value ())
>      return false;
>
> -  /* It's safe to consider they are equal if their RTX value are
> -     strictly the same.  */
> -  if (m_value == other.get_value ())
> -    return true;
> -
>    if (GET_CODE (m_value) != GET_CODE (other.get_value ()))
>      return false;
>
> @@ -1041,10 +1194,6 @@ avl_info::operator== (const avl_info &other) const
>    if (vlmax_avl_p (m_value))
>      return vlmax_avl_p (other.get_value ());
>
> -  /* If Pseudo REGNO are same, it's safe to consider they are same.  */
> -  if (ORIGINAL_REGNO (m_value) == ORIGINAL_REGNO (other.get_value ()))
> -    return true;
> -
>    /* If any source is undef value, we think they are not equal.  */
>    if (!m_source || !other.get_source ())
>      return false;
> @@ -1054,9 +1203,7 @@ avl_info::operator== (const avl_info &other) const
>    if (single_source_equal_p (other))
>      return true;
>
> -  /* TODO: Support avl defined by PHI which includes multiple different
> insn
> -   * later.  */
> -  return false;
> +  return multiple_source_equal_p (other);
>  }
>
>  bool
> @@ -1078,7 +1225,7 @@ vl_vtype_info::vl_vtype_info (avl_info avl_in,
> uint8_t sew_in,
>  bool
>  vl_vtype_info::operator== (const vl_vtype_info &other) const
>  {
> -  return m_avl == other.get_avl_info () && m_sew == other.get_sew ()
> +  return same_avl_p (other) && m_sew == other.get_sew ()
>          && m_vlmul == other.get_vlmul () && m_ta == other.get_ta ()
>          && m_ma == other.get_ma () && m_ratio == other.get_ratio ();
>  }
> @@ -1102,7 +1249,12 @@ vl_vtype_info::has_non_zero_avl () const
>  bool
>  vl_vtype_info::same_avl_p (const vl_vtype_info &other) const
>  {
> -  return get_avl () == other.get_avl ();
> +  /* We need to compare both RTL and SET. If both AVL are CONST_INT.
> +     For example, const_int 3 and const_int 4, we need to compare
> +     RTL. If both AVL are REG and their REGNO are same, we need to
> +     compare SET.  */
> +  return get_avl () == other.get_avl ()
> +        && get_avl_source () == other.get_avl_source ();
>  }
>
>  bool
> @@ -1283,6 +1435,25 @@ vector_insn_info::parse_insn (insn_info *insn)
>      m_demands[DEMAND_TAIL_POLICY] = true;
>    if (get_attr_ma (insn->rtl ()) != INVALID_ATTRIBUTE)
>      m_demands[DEMAND_MASK_POLICY] = true;
> +
> +  if (vector_config_insn_p (insn->rtl ()))
> +    return;
> +
> +  if (!has_avl_reg () || !m_avl.get_source ()
> +      || !m_avl.get_source ()->insn ()->is_phi ())
> +    return;
> +
> +  insn_info *def_insn = extract_single_source (m_avl.get_source ());
> +  if (def_insn)
> +    {
> +      vector_insn_info new_info;
> +      new_info.parse_insn (def_insn);
> +      if (!same_vlmax_p (new_info))
> +       return;
> +      /* TODO: Currently, we don't forward AVL for non-VLMAX vsetvl.  */
> +      if (vlmax_avl_p (new_info.get_avl ()))
> +       set_avl_info (new_info.get_avl_info ());
> +    }
>  }
>
>  void
> @@ -1396,12 +1567,21 @@ vector_insn_info::compatible_p (const
> vl_vtype_info &curr_info) const
>    return compatible_avl_p (curr_info) && compatible_vtype_p (curr_info);
>  }
>
> +bool
> +vector_insn_info::available_p (const vector_insn_info &other) const
> +{
> +  if (*this >= other)
> +    return true;
> +  return false;
> +}
> +
>  vector_insn_info
>  vector_insn_info::merge (const vector_insn_info &merge_info,
>                          enum merge_type type = LOCAL_MERGE) const
>  {
> -  gcc_assert (this->compatible_p (merge_info)
> -             && "Can't merge incompatible demanded infos");
> +  if (!vsetvl_insn_p (get_insn ()->rtl ()))
> +    gcc_assert (this->compatible_p (merge_info)
> +               && "Can't merge incompatible demanded infos");
>
>    vector_insn_info new_info;
>    new_info.demand_vl_vtype ();
> @@ -1513,6 +1693,8 @@ vector_insn_info::dump (FILE *file) const
>      fprintf (file, "UNKNOWN,");
>    else if (empty_p ())
>      fprintf (file, "EMPTY,");
> +  else if (hard_empty_p ())
> +    fprintf (file, "HARD_EMPTY,");
>    else if (dirty_with_killed_avl_p ())
>      fprintf (file, "DIRTY_WITH_KILLED_AVL,");
>    else
> @@ -1606,7 +1788,7 @@ vector_infos_manager::get_all_available_exprs (
>  {
>    auto_vec<size_t> available_list;
>    for (size_t i = 0; i < vector_exprs.length (); i++)
> -    if (info >= *vector_exprs[i])
> +    if (info.available_p (*vector_exprs[i]))
>        available_list.safe_push (i);
>    return available_list;
>  }
> @@ -1862,14 +2044,16 @@ private:
>    /* Phase 3.  */
>    enum fusion_type get_backward_fusion_type (const bb_info *,
>                                              const vector_insn_info &);
> +  bool hard_empty_block_p (const bb_info *, const vector_insn_info &)
> const;
>    bool backward_demand_fusion (void);
>    bool forward_demand_fusion (void);
> +  bool cleanup_illegal_dirty_blocks (void);
>    void demand_fusion (void);
>
>    /* Phase 4.  */
>    void prune_expressions (void);
>    void compute_local_properties (void);
> -  bool can_refine_vsetvl_p (const basic_block, uint8_t) const;
> +  bool can_refine_vsetvl_p (const basic_block, const vector_insn_info &)
> const;
>    void refine_vsetvls (void) const;
>    void cleanup_vsetvls (void);
>    bool commit_vsetvls (void);
> @@ -1878,6 +2062,9 @@ private:
>    /* Phase 5.  */
>    void cleanup_insns (void) const;
>
> +  /* Phase 6.  */
> +  void propagate_avl (void) const;
> +
>    void init (void);
>    void done (void);
>    void compute_probabilities (void);
> @@ -2079,134 +2266,241 @@ pass_vsetvl::get_backward_fusion_type (const
> bb_info *bb,
>
>    gcc_assert (reg);
>    def_info *def = find_access (insn->uses (), REGNO (reg))->def ();
> -  if (def->insn ()->is_phi ())
> +  if (!def->insn ()->is_phi () && def->insn ()->bb () == insn->bb ())
> +    return INVALID_FUSION;
> +  hash_set<set_info *> sets
> +    = get_all_sets (prop.get_avl_source (), true, true, true);
> +  if (any_set_in_bb_p (sets, insn->bb ()))
> +    return INVALID_FUSION;
> +
> +  if (vlmax_avl_p (prop.get_avl ()))
>      {
> -      hash_set<insn_info *> insns
> -       = get_all_nonphi_defs (as_a<phi_info *> (def));
> -      if (any_insn_in_bb_p (insns, insn->bb ()))
> +      if (find_reg_killed_by (bb, reg))
>         return INVALID_FUSION;
> +      else
> +       return VALID_AVL_FUSION;
>      }
> -  else
> -    {
> -      if (def->insn ()->bb () == insn->bb ())
> -       return INVALID_FUSION;
> +
> +  /* By default, we always enable backward fusion so that we can
> +     gain more optimizations.  */
> +  if (!find_reg_killed_by (bb, reg))
> +    return VALID_AVL_FUSION;
> +  return KILLED_AVL_FUSION;
> +}
> +
> +/* We almost enable all cases in get_backward_fusion_type, this function
> +   disable the backward fusion by changing dirty blocks into hard empty
> +   blocks in forward dataflow. We can have more accurate optimization by
> +   this method.  */
> +bool
> +pass_vsetvl::hard_empty_block_p (const bb_info *bb,
> +                                const vector_insn_info &info) const
> +{
> +  if (!info.dirty_p () || !info.has_avl_reg ())
> +    return false;
> +
> +  basic_block cfg_bb = bb->cfg_bb ();
> +  sbitmap avin = m_vector_manager->vector_avin[cfg_bb->index];
> +  rtx avl = vlmax_avl_p (info.get_avl ()) ? get_vl (info.get_insn ()->rtl
> ())
> +                                         : get_avl (info.get_insn ()->rtl
> ());
> +  insn_info *insn = info.get_insn ();
> +  set_info *set = find_access (insn->uses (), REGNO (avl))->def ();
> +  hash_set<set_info *> sets = get_all_sets (set, true, false, false);
> +  hash_set<basic_block> pred_cfg_bbs = get_all_predecessors (cfg_bb);
> +
> +  if (find_reg_killed_by (bb, avl))
> +    {
> +      /* Condition 1:
> +        Dirty block with killed AVL means that the empty block (no RVV
> +        instructions) are polluted as Dirty blocks with the value of
> current
> +        AVL is killed. For example:
> +             bb 0:
> +               ...
> +             bb 1:
> +               def a5
> +             bb 2:
> +               RVV (use a5)
> +        In backward dataflow, we will pollute BB0 and BB1 as Dirty with AVL
> +        killed, since a5 is killed in BB1.
> +        In this case, let's take a look at this example:
> +
> +             bb 3:        bb 4:
> +               def3 a5       def4 a5
> +             bb 5:        bb 6:
> +               def1 a5       def2 a5
> +                   \         /
> +                    \       /
> +                     \     /
> +                      \   /
> +                       bb 7:
> +                   RVV (use a5)
> +        In this case, we can pollute BB5 and BB6 as dirty if the get-def
> +        of a5 from the RVV instruction in BB7 is def1 in BB5 and
> +        def2 in BB6, so we can return false early here for HARD_EMPTY_BLOCK_P.
> +        However, we are not sure whether BB3 and BB4 can be
> +        polluted as Dirty with AVL killed so we can't return false
> +        for HARD_EMPTY_BLOCK_P here since it's too early which will
> +        potentially produce issues.  */
> +      gcc_assert (info.dirty_with_killed_avl_p ());
> +      if (info.get_avl_source ()
> +         && get_same_bb_set (sets, bb->cfg_bb ()) == info.get_avl_source
> ())
> +       return false;
>      }
>
> -  rtx new_reg = gen_rtx_REG (GET_MODE (reg), REGNO (reg));
> -  gcc_assert (new_reg != reg);
> -  const avl_info info = avl_info (new_reg, safe_dyn_cast<set_info *>
> (def));
> -  if (prop.dirty_with_killed_avl_p ())
> +  /* Condition 2:
> +     Suppress the VL/VTYPE info backward propagation too early:
> +                        ________
> +                       |   BB0  |
> +                       |________|
> +                           |
> +                       ____|____
> +                       |   BB1  |
> +                       |________|
> +     In this case, suppose BB 1 has multiple predecessors, BB 0 is one
> +     of them. BB1 has VL/VTYPE info (may be VALID or DIRTY) to backward
> +     propagate.
> +     The AVIN (available in) which is calculated by LCM is empty only
> +     in these 2 circumstances:
> +       1. all predecessors of BB1 are empty (not VALID
> +         and can not be polluted in backward fusion flow)
> +       2. VL/VTYPE info of BB1 predecessors are conflict.
> +
> +     We keep it as dirty in 2nd circumstance and set it as HARD_EMPTY
> +     (can not be polluted as DIRTY any more) in 1st circumstance.
> +     We don't backward propagate in 1st circumstance since there is
> +     no VALID RVV instruction and no polluted blocks (dirty blocks)
> +     by backward propagation from other following blocks.
> +     It's meaningless to keep it as Dirty anymore.
> +
> +     However, since we keep it as dirty in the 2nd circumstance, where there
> +     are VALID or Dirty blocks in predecessors, we can still gain the benefits and
> +     optimization opportunities. For example, in this case:
> +       for (size_t i = 0; i < n; i++)
> +        {
> +          if (i != cond) {
> +            vint8mf8_t v = *(vint8mf8_t*)(in + i + 100);
> +            *(vint8mf8_t*)(out + i + 100) = v;
> +          } else {
> +            vbool1_t v = *(vbool1_t*)(in + i + 400);
> +            *(vbool1_t*)(out + i + 400) = v;
> +          }
> +        }
> +     VL/VTYPE in if-else are conflict which will produce empty AVIN LCM
> result
> +     but we can still keep dirty blocks if *(i != cond)* is very unlikely
> then
> +     we can preset vsetvl (VL/VTYPE) info from else (static propability
> model).
> +
> +     We don't want to backward propagate VL/VTYPE information too early
> +     which is not the optimal and may potentially produce issues.  */
> +  if (bitmap_empty_p (avin))
>      {
> -      unsigned int bb_index;
> -      sbitmap_iterator sbi;
> -      sbitmap bitdata = m_vector_manager->vector_avout[bb->index ()];
> -      bool has_valid_avl = false;
> -      EXECUTE_IF_SET_IN_BITMAP (bitdata, 0, bb_index, sbi)
> -      {
> -       const vector_insn_info *expr =
> m_vector_manager->vector_exprs[bb_index];
> -       if (expr->compatible_avl_p (info))
> -         {
> -           has_valid_avl = true;
> -           break;
> -         }
> -      }
> -      if (!has_valid_avl)
> -       return INVALID_FUSION;
> +      bool hard_empty_p = true;
> +      for (const basic_block pred_cfg_bb : pred_cfg_bbs)
> +       {
> +         if (pred_cfg_bb == ENTRY_BLOCK_PTR_FOR_FN (cfun))
> +           continue;
> +         sbitmap avout =
> m_vector_manager->vector_avout[pred_cfg_bb->index];
> +         if (!bitmap_empty_p (avout))
> +           {
> +             hard_empty_p = false;
> +             break;
> +           }
> +       }
> +      if (hard_empty_p)
> +       return true;
>      }
>
> -  if (reg_killed_by_bb_p (bb, reg))
> +  edge e;
> +  edge_iterator ei;
> +  bool has_avl_killed_insn_p = false;
> +  FOR_EACH_EDGE (e, ei, cfg_bb->succs)
>      {
> -      unsigned int bb_index;
> -      sbitmap_iterator sbi;
> -      sbitmap bitdata = m_vector_manager->vector_avin[bb->index ()];
> -      hash_set<basic_block> blocks = get_all_predecessors (bb->cfg_bb ());
> -      for (const auto block : blocks)
> -       if (block == insn->bb ()->cfg_bb ())
> -         return INVALID_FUSION;
> -      if (bitmap_empty_p (bitdata))
> +      const auto block_info
> +       = m_vector_manager->vector_block_infos[e->dest->index];
> +      if (block_info.local_dem.dirty_with_killed_avl_p ())
>         {
> -         /* void f (int8_t *restrict in, int8_t *restrict out, int n, int
> m,
> -                   unsigned cond, size_t vl)
> -         {
> -           vbool64_t mask = *(vbool64_t *) (in + 1000000);
> -
> -           vl = 101;
> -           if (cond > 0)
> -             {
> -             vint8mf8_t v = __riscv_vle8_v_i8mf8 (in, vl);
> -             __riscv_vse8_v_i8mf8 (out, v, vl);
> -             }
> -           else
> -             {
> -             out[100] = out[100] + 300;
> -             }
> -
> -           for (size_t i = 0; i < n; i++)
> -             {
> -             vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((in + i + 200),
> vl);
> -             __riscv_vse32_v_f32mf2 ((out + i + 200), v, vl);
> +         has_avl_killed_insn_p = true;
> +         break;
> +       }
> +    }
> +  if (!has_avl_killed_insn_p)
> +    return false;
>
> -             vfloat32mf2_t v2
> -               = __riscv_vle32_v_f32mf2_tumu (mask, v, (in + i + 300),
> vl);
> -             __riscv_vse32_v_f32mf2_m (mask, (out + i + 300), v2, vl);
> -             }
> -         }  */
> -         for (const auto block : blocks)
> +  bool any_set_in_bbs_p = false;
> +  for (const basic_block pred_cfg_bb : pred_cfg_bbs)
> +    {
> +      insn_info *def_insn = extract_single_source (set);
> +      if (def_insn)
> +       {
> +         /* Condition 3:
> +
> +           Case 1:                               Case 2:
> +               bb 0:                                 bb 0:
> +                 def a5 101                             ...
> +               bb 1:                                 bb 1:
> +                 ...                                    ...
> +               bb 2:                                 bb 2:
> +                 RVV 1 (use a5 with TAIL ANY)           ...
> +               bb 3:                                 bb 3:
> +                 def a5 101                             def a5 101
> +               bb 4:                                 bb 4:
> +                 ...                                    ...
> +               bb 5:                                 bb 5:
> +                 RVV 2 (use a5 with TU)                 RVV 1 (use a5)
> +
> +           Case 1: We can pollute BB3,BB2,BB1,BB0 to be all Dirty blocks
> +           with killed AVL so that we can merge TU demand info from RVV 2
> +           into RVV 1 and elide the vsetvl instruction in BB5.
> +
> +           TODO: We only optimize for single source def since multiple
> source
> +           def is quite complicated.
> +
> +           Case 2: We can only pollute bb 3 as dirty and it has been
> accepted
> +           in Condition 2, and we can't pollute BB3,BB2,BB1,BB0 like case
> 1. */
> +         insn_info *last_killed_insn
> +           = find_reg_killed_by (crtl->ssa->bb (pred_cfg_bb), avl);
> +         if (!last_killed_insn || pred_cfg_bb == def_insn->bb ()->cfg_bb
> ())
> +           continue;
> +         if (source_equal_p (last_killed_insn, def_insn))
>             {
> -             if (block == ENTRY_BLOCK_PTR_FOR_FN (cfun))
> -               continue;
> -             sbitmap avout = m_vector_manager->vector_avout[block->index];
> -             EXECUTE_IF_SET_IN_BITMAP (avout, 0, bb_index, sbi)
> -             {
> -               const vector_insn_info *expr
> -                 = m_vector_manager->vector_exprs[bb_index];
> -               if (expr->compatible_avl_p (info))
> -                 return KILLED_AVL_FUSION;
> -             }
> +             any_set_in_bbs_p = true;
> +             break;
>             }
> -         return INVALID_FUSION;
>         }
>        else
>         {
> -         /* void f (int8_t * restrict in, int8_t * restrict out, int n,
> int
> -             m, unsigned cond, size_t vl)
> -             {
> -               vbool64_t mask = *(vbool64_t *) (in + 1000000);
> -
> -               vl = 101;
> -               if (cond > 0)
> -                 {
> -                 vint8mf8_t v = __riscv_vle8_v_i8mf8 (in, vl);
> -                 __riscv_vse8_v_i8mf8 (out, v, vl);
> -                 }
> -               else
> -                 {
> -                 vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + 1000, vl);
> -                 __riscv_vse8_v_i8mf8 (out + 1000, v, vl);
> -                 }
> -
> -               for (size_t i = 0; i < n; i++)
> -                 {
> -                 vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((in + i +
> 200), vl);
> -                 __riscv_vse32_v_f32mf2 ((out + i + 200), v, vl);
> -
> -                 vfloat32mf2_t v2
> -                   = __riscv_vle32_v_f32mf2_tumu (mask, v, (in + i +
> 300), vl);
> -                 __riscv_vse32_v_f32mf2_m (mask, (out + i + 300), v2, vl);
> -                 }
> -         }  */
> -         EXECUTE_IF_SET_IN_BITMAP (bitdata, 0, bb_index, sbi)
> -         {
> -           const vector_insn_info *expr
> -             = m_vector_manager->vector_exprs[bb_index];
> -           if (expr->compatible_avl_p (info))
> -             return KILLED_AVL_FUSION;
> -         }
> +         /* Condition 4:
> +
> +             bb 0:        bb 1:         bb 3:
> +               def1 a5       def2 a5     ...
> +                   \         /            /
> +                    \       /            /
> +                     \     /            /
> +                      \   /            /
> +                       bb 4:          /
> +                        |            /
> +                        |           /
> +                       bb 5:       /
> +                        |         /
> +                        |        /
> +                       bb 6:    /
> +                        |      /
> +                        |     /
> +                         bb 8:
> +                       RVV 1 (use a5)
> +         If we get-def (REAL) of a5 from RVV 1 instruction, we will get
> +         def1 from BB0 and def2 from BB1. So we will pollute BB6,BB5,BB4,
> +         BB0,BB1 with DIRTY and set BB3 as HARD_EMPTY so that we won't
> +         propagate AVL to BB3.  */
> +         if (any_set_in_bb_p (sets, crtl->ssa->bb (pred_cfg_bb)))
> +           {
> +             any_set_in_bbs_p = true;
> +             break;
> +           }
>         }
> -      return INVALID_FUSION;
>      }
> -
> -  return prop.dirty_with_killed_avl_p () ? KILLED_AVL_FUSION :
> VALID_AVL_FUSION;
> +  if (!any_set_in_bbs_p)
> +    return true;
> +  return false;
>  }
>
>  /* Compute global backward demanded info.  */
> @@ -2272,6 +2566,8 @@ pass_vsetvl::backward_demand_fusion (void)
>
>           if (block_info.reaching_out.unknown_p ())
>             continue;
> +         else if (block_info.reaching_out.hard_empty_p ())
> +           continue;
>           else if (block_info.reaching_out.empty_p ())
>             {
>               enum fusion_type type
> @@ -2281,6 +2577,17 @@ pass_vsetvl::backward_demand_fusion (void)
>
>               block_info.reaching_out = prop;
>               block_info.reaching_out.set_dirty (type);
> +
> +             if (prop.has_avl_reg () && !vlmax_avl_p (prop.get_avl ()))
> +               {
> +                 hash_set<set_info *> sets
> +                   = get_all_sets (prop.get_avl_source (), true, true,
> true);
> +                 set_info *set = get_same_bb_set (sets, e->src);
> +                 if (set)
> +                   block_info.reaching_out.set_avl_info (
> +                     avl_info (prop.get_avl (), set));
> +               }
> +
>               block_info.local_dem = block_info.reaching_out;
>               block_info.probability = curr_block_info.probability;
>               changed_p = true;
> @@ -2294,22 +2601,28 @@ pass_vsetvl::backward_demand_fusion (void)
>                 {
>                   if (block_info.reaching_out >= prop)
>                     continue;
> -                 block_info.probability += curr_block_info.probability;
>                   new_info = block_info.reaching_out.merge (prop,
> GLOBAL_MERGE);
> +                 new_info.set_dirty (
> +                   block_info.reaching_out.dirty_with_killed_avl_p ());
> +                 block_info.probability += curr_block_info.probability;
>                 }
>               else
>                 {
>                   if (curr_block_info.probability > block_info.probability)
>                     {
> +                     enum fusion_type type
> +                       = get_backward_fusion_type (crtl->ssa->bb (e->src),
> +                                                   prop);
> +                     if (type == INVALID_FUSION)
> +                       continue;
>                       new_info = prop;
> +                     new_info.set_dirty (type);
>                       block_info.probability = curr_block_info.probability;
>                     }
>                   else
>                     continue;
>                 }
>
> -             new_info.set_dirty (
> -               block_info.reaching_out.dirty_with_killed_avl_p ());
>               block_info.local_dem = new_info;
>               block_info.reaching_out = new_info;
>               changed_p = true;
> @@ -2319,10 +2632,28 @@ pass_vsetvl::backward_demand_fusion (void)
>               /* We not only change the info during backward propagation,
>                  but also change the VSETVL instruction.  */
>               gcc_assert (block_info.reaching_out.valid_p ());
> -             if (!block_info.reaching_out.compatible_p (prop))
> -               continue;
> -             if (block_info.reaching_out >= prop)
> -               continue;
> +             hash_set<set_info *> sets
> +               = get_all_sets (prop.get_avl_source (), true, false,
> false);
> +             set_info *set = get_same_bb_set (sets, e->src);
> +             if (vsetvl_insn_p (block_info.reaching_out.get_insn ()->rtl
> ())
> +                 && prop.has_avl_reg () && !vlmax_avl_p (prop.get_avl ()))
> +               {
> +                 if (!block_info.reaching_out.same_vlmax_p (prop))
> +                   continue;
> +                 if (block_info.reaching_out.same_vtype_p (prop))
> +                   continue;
> +                 if (!set)
> +                   continue;
> +                 if (set->insn () != block_info.reaching_out.get_insn ())
> +                   continue;
> +               }
> +             else
> +               {
> +                 if (!block_info.reaching_out.compatible_p (prop))
> +                   continue;
> +                 if (block_info.reaching_out >= prop)
> +                   continue;
> +               }
>
>               vector_insn_info be_merged = block_info.reaching_out;
>               if (block_info.local_dem == block_info.reaching_out)
> @@ -2410,8 +2741,8 @@ pass_vsetvl::forward_demand_fusion (void)
>           if (local_dem.dirty_p ())
>             {
>               gcc_assert (local_dem == reaching_out);
> +             new_info.set_dirty (local_dem.dirty_with_killed_avl_p ());
>               local_dem = new_info;
> -             local_dem.set_dirty (local_dem.dirty_with_killed_avl_p ());
>               reaching_out = local_dem;
>             }
>           else
> @@ -2439,9 +2770,6 @@ pass_vsetvl::demand_fusion (void)
>    while (changed_p)
>      {
>        changed_p = false;
> -      prune_expressions ();
> -      m_vector_manager->create_bitmap_vectors ();
> -      compute_local_properties ();
>        /* To optimize the case like this:
>          void f2 (int8_t * restrict in, int8_t * restrict out, int n, int
> cond)
>            {
> @@ -2475,12 +2803,22 @@ pass_vsetvl::demand_fusion (void)
>         an AVL kill instruction in bb 2 that we can't backward fuse bb 3 or
>         forward bb 1 arbitrarily. We need available information of each
> block to
>         help for such cases.  */
> +      changed_p |= backward_demand_fusion ();
> +      changed_p |= forward_demand_fusion ();
> +    }
> +
> +  changed_p = true;
> +  while (changed_p)
> +    {
> +      changed_p = false;
> +      prune_expressions ();
> +      m_vector_manager->create_bitmap_vectors ();
> +      compute_local_properties ();
>        compute_available (m_vector_manager->vector_comp,
>                          m_vector_manager->vector_kill,
>                          m_vector_manager->vector_avout,
>                          m_vector_manager->vector_avin);
> -      changed_p |= backward_demand_fusion ();
> -      changed_p |= forward_demand_fusion ();
> +      changed_p |= cleanup_illegal_dirty_blocks ();
>        m_vector_manager->free_bitmap_vectors ();
>        if (!m_vector_manager->vector_exprs.is_empty ())
>         m_vector_manager->vector_exprs.release ();
> @@ -2498,6 +2836,34 @@ pass_vsetvl::demand_fusion (void)
>      }
>  }
>
> +/* Cleanup illegal dirty blocks.  */
> +bool
> +pass_vsetvl::cleanup_illegal_dirty_blocks (void)
> +{
> +  bool changed_p = false;
> +  for (const bb_info *bb : crtl->ssa->bbs ())
> +    {
> +      basic_block cfg_bb = bb->cfg_bb ();
> +      const auto &prop
> +       = m_vector_manager->vector_block_infos[cfg_bb->index].reaching_out;
> +
> +      /* If there is nothing to cleanup, just skip it.  */
> +      if (!prop.valid_or_dirty_p ())
> +       continue;
> +
> +      if (hard_empty_block_p (bb, prop))
> +       {
> +         m_vector_manager->vector_block_infos[cfg_bb->index].local_dem
> +           = vector_insn_info::get_hard_empty ();
> +         m_vector_manager->vector_block_infos[cfg_bb->index].reaching_out
> +           = vector_insn_info::get_hard_empty ();
> +         changed_p = true;
> +         continue;
> +       }
> +    }
> +  return changed_p;
> +}
> +
>  /* Assemble the candidates expressions for LCM.  */
>  void
>  pass_vsetvl::prune_expressions (void)
> @@ -2614,18 +2980,20 @@ pass_vsetvl::compute_local_properties (void)
>              as long as the all real def insn of avl do not come from this
>              block. This configuration may be still missing some
> optimization
>              opportunities.  */
> -         if (reg_killed_by_bb_p (bb, expr->get_avl ()))
> +         if (find_reg_killed_by (bb, expr->get_avl ()))
>             {
> -             hash_set<insn_info *> insns = get_all_nonphi_defs (
> -               safe_dyn_cast<phi_info *> (expr->get_avl_source ()));
> -             if (any_insn_in_bb_p (insns, bb))
> +             hash_set<set_info *> sets
> +               = get_all_sets (expr->get_avl_source (), true, false,
> false);
> +             if (any_set_in_bb_p (sets, bb))
>                 bitmap_clear_bit
> (m_vector_manager->vector_transp[curr_bb_idx],
>                                   i);
>             }
>         }
>
>        /* Compute anticipatable occurrences.  */
> -      if (local_dem.valid_p () || local_dem.real_dirty_p ())
> +      if (local_dem.valid_p () || local_dem.real_dirty_p ()
> +         || (has_vsetvl_killed_avl_p (bb, local_dem)
> +             && vlmax_avl_p (local_dem.get_avl ())))
>         if (anticipatable_occurrence_p (bb, local_dem))
>           bitmap_set_bit (m_vector_manager->vector_antic[curr_bb_idx],
>                           m_vector_manager->get_expr_id (local_dem));
> @@ -2693,7 +3061,8 @@ pass_vsetvl::compute_local_properties (void)
>
>  /* Return true if VSETVL in the block can be refined as vsetvl
> zero,zero.  */
>  bool
> -pass_vsetvl::can_refine_vsetvl_p (const basic_block cfg_bb, uint8_t
> ratio) const
> +pass_vsetvl::can_refine_vsetvl_p (const basic_block cfg_bb,
> +                                 const vector_insn_info &info) const
>  {
>    if (!m_vector_manager->all_same_ratio_p (
>         m_vector_manager->vector_avin[cfg_bb->index]))
> @@ -2705,7 +3074,9 @@ pass_vsetvl::can_refine_vsetvl_p (const basic_block
> cfg_bb, uint8_t ratio) const
>
>    size_t expr_id
>      = bitmap_first_set_bit (m_vector_manager->vector_avin[cfg_bb->index]);
> -  if (m_vector_manager->vector_exprs[expr_id]->get_ratio () != ratio)
> +  if (!m_vector_manager->vector_exprs[expr_id]->same_vlmax_p (info))
> +    return false;
> +  if (!m_vector_manager->vector_exprs[expr_id]->compatible_avl_p (info))
>      return false;
>
>    edge e;
> @@ -2748,7 +3119,7 @@ pass_vsetvl::refine_vsetvls (void) const
>         continue;
>
>        rtx_insn *rinsn = insn->rtl ();
> -      if (!can_refine_vsetvl_p (cfg_bb, info.get_ratio ()))
> +      if (!can_refine_vsetvl_p (cfg_bb, info))
>         continue;
>
>        if (!vector_config_insn_p (rinsn))
> @@ -2871,7 +3242,7 @@ pass_vsetvl::commit_vsetvls (void)
>         }
>
>        rtx new_pat;
> -      if (can_refine_vsetvl_p (cfg_bb, reaching_out.get_ratio ()))
> +      if (can_refine_vsetvl_p (cfg_bb, reaching_out))
>         new_pat
>           = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, reaching_out,
> NULL_RTX);
>        else if (vlmax_avl_p (reaching_out.get_avl ()))
> @@ -2965,6 +3336,115 @@ pass_vsetvl::cleanup_insns (void) const
>      }
>  }
>
> +void
> +pass_vsetvl::propagate_avl (void) const
> +{
> +  /* Rebuild the RTL_SSA according to the new CFG generated by LCM.  */
> +  /* Finalization of RTL_SSA.  */
> +  free_dominance_info (CDI_DOMINATORS);
> +  if (crtl->ssa->perform_pending_updates ())
> +    cleanup_cfg (0);
> +  delete crtl->ssa;
> +  crtl->ssa = nullptr;
> +  /* Initialization of RTL_SSA.  */
> +  calculate_dominance_info (CDI_DOMINATORS);
> +  df_analyze ();
> +  crtl->ssa = new function_info (cfun);
> +
> +  hash_set<rtx_insn *> to_delete;
> +  for (const bb_info *bb : crtl->ssa->bbs ())
> +    {
> +      for (insn_info *insn : bb->real_nondebug_insns ())
> +       {
> +         if (vsetvl_discard_result_insn_p (insn->rtl ()))
> +           {
> +             rtx avl = get_avl (insn->rtl ());
> +             if (!REG_P (avl))
> +               continue;
> +
> +             set_info *set = find_access (insn->uses (), REGNO
> (avl))->def ();
> +             insn_info *def_insn = extract_single_source (set);
> +             if (!def_insn)
> +               continue;
> +
> +             /* Handle this case:
> +                vsetvli        a6,zero,e32,m1,ta,mu
> +                li     a5,4096
> +                add    a7,a0,a5
> +                addi   a7,a7,-96
> +                vsetvli        t1,zero,e8,mf8,ta,ma
> +                vle8.v v24,0(a7)
> +                add    a5,a3,a5
> +                addi   a5,a5,-96
> +                vse8.v v24,0(a5)
> +                vsetvli        zero,a6,e32,m1,tu,ma
> +             */
> +             if (vsetvl_insn_p (def_insn->rtl ()))
> +               {
> +                 vl_vtype_info def_info = get_vl_vtype_info (def_insn);
> +                 vl_vtype_info info = get_vl_vtype_info (insn);
> +                 rtx avl = get_avl (def_insn->rtl ());
> +                 rtx vl = get_vl (def_insn->rtl ());
> +                 if (def_info.get_ratio () == info.get_ratio ())
> +                   {
> +                     if (vlmax_avl_p (def_info.get_avl ()))
> +                       {
> +                         info.set_avl_info (
> +                           avl_info (def_info.get_avl (), nullptr));
> +                         rtx new_pat
> +                           = gen_vsetvl_pat (VSETVL_NORMAL, info, vl);
> +                         validate_change (insn->rtl (),
> +                                          &PATTERN (insn->rtl ()),
> new_pat,
> +                                          false);
> +                         continue;
> +                       }
> +                     if (def_info.has_avl_imm () || rtx_equal_p (avl, vl))
> +                       {
> +                         info.set_avl_info (avl_info (avl, nullptr));
> +                         emit_vsetvl_insn (VSETVL_DISCARD_RESULT,
> EMIT_AFTER,
> +                                           info, NULL_RTX, insn->rtl ());
> +                         if (set->single_nondebug_insn_use ())
> +                           {
> +                             to_delete.add (insn->rtl ());
> +                             to_delete.add (def_insn->rtl ());
> +                           }
> +                         continue;
> +                       }
> +                   }
> +               }
> +           }
> +
> +         /* Change vsetvl rd, rs1 --> vsetvl zero, rs1,
> +            if rd is not used by any nondebug instructions.
> +            Even though this PASS runs after RA and it doesn't help
> +            reduce register pressure, it can help instruction scheduling
> +            since we remove the dependencies.  */
> +         if (vsetvl_insn_p (insn->rtl ()))
> +           {
> +             rtx vl = get_vl (insn->rtl ());
> +             rtx avl = get_avl (insn->rtl ());
> +             if (vlmax_avl_p (avl))
> +               continue;
> +             def_info *def = find_access (insn->defs (), REGNO (vl));
> +             set_info *set = safe_dyn_cast<set_info *> (def);
> +             gcc_assert (set);
> +             const vl_vtype_info info = get_vl_vtype_info (insn);
> +             rtx new_pat
> +               = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, info, NULL_RTX);
> +             if (!set->has_nondebug_insn_uses ())
> +               {
> +                 validate_change (insn->rtl (), &PATTERN (insn->rtl ()),
> +                                  new_pat, false);
> +                 continue;
> +               }
> +           }
> +       }
> +    }
> +
> +  for (rtx_insn *rinsn : to_delete)
> +    eliminate_insn (rinsn);
> +}
> +
>  void
>  pass_vsetvl::init (void)
>  {
> @@ -3083,6 +3563,12 @@ pass_vsetvl::lazy_vsetvl (void)
>    if (dump_file)
>      fprintf (dump_file, "\nPhase 5: Cleanup AVL and VL operands\n");
>    cleanup_insns ();
> +
> +  /* Phase 6 - Rebuild RTL_SSA to propagate AVL between vsetvls.  */
> +  if (dump_file)
> +    fprintf (dump_file,
> +            "\nPhase 6: Rebuild RTL_SSA to propagate AVL between
> vsetvls\n");
> +  propagate_avl ();
>  }
>
>  /* Main entry point for this pass.  */
> diff --git a/gcc/config/riscv/riscv-vsetvl.h
> b/gcc/config/riscv/riscv-vsetvl.h
> index 3b68bf638ae..4177b3e851b 100644
> --- a/gcc/config/riscv/riscv-vsetvl.h
> +++ b/gcc/config/riscv/riscv-vsetvl.h
> @@ -65,6 +65,21 @@ enum merge_type
>    GLOBAL_MERGE
>  };
>
> +enum def_type
> +{
> +  REAL_SET = 1 << 0,
> +  PHI_SET = 1 << 1,
> +  BB_HEAD_SET = 1 << 2,
> +  BB_END_SET = 1 << 3,
> +  /* ??? TODO: In RTL_SSA framework, we have REAL_SET,
> +     PHI_SET, BB_HEAD_SET, BB_END_SET and
> +     CLOBBER_DEF def_info types. Currently,
> +     we conservatively do not optimize clobber
> +     def since we don't see the case that we
> +     need to optimize it.  */
> +  CLOBBER_DEF = 1 << 4
> +};
> +
>  /* AVL info for RVV instruction. Most RVV instructions have AVL operand in
>     implicit dependency. The AVL comparison between 2 RVV instructions is
>     very important since it affects our decision whether we should insert
> @@ -143,6 +158,7 @@ public:
>    rtx get_value () const { return m_value; }
>    rtl_ssa::set_info *get_source () const { return m_source; }
>    bool single_source_equal_p (const avl_info &) const;
> +  bool multiple_source_equal_p (const avl_info &) const;
>    avl_info &operator= (const avl_info &);
>    bool operator== (const avl_info &) const;
>    bool operator!= (const avl_info &) const;
> @@ -210,6 +226,8 @@ private:
>      VALID,
>      UNKNOWN,
>      EMPTY,
> +    /* The empty block can not be polluted as dirty.  */
> +    HARD_EMPTY,
>
>      /* The block is polluted as containing VSETVL instruction during dem
>         backward propagation to gain better LCM optimization even though
> @@ -280,7 +298,8 @@ public:
>    bool uninit_p () const { return m_state == UNINITIALIZED; }
>    bool valid_p () const { return m_state == VALID; }
>    bool unknown_p () const { return m_state == UNKNOWN; }
> -  bool empty_p () const { return m_state == EMPTY; }
> +  bool empty_p () const { return m_state == EMPTY || m_state ==
> HARD_EMPTY; }
> +  bool hard_empty_p () const { return m_state == HARD_EMPTY; }
>    bool dirty_p () const
>    {
>      return m_state == DIRTY || m_state == DIRTY_WITH_KILLED_AVL;
> @@ -295,6 +314,7 @@ public:
>      return m_state == VALID || m_state == DIRTY
>            || m_state == DIRTY_WITH_KILLED_AVL;
>    }
> +  bool available_p (const vector_insn_info &) const;
>
>    static vector_insn_info get_unknown ()
>    {
> @@ -303,9 +323,17 @@ public:
>      return info;
>    }
>
> +  static vector_insn_info get_hard_empty ()
> +  {
> +    vector_insn_info info;
> +    info.set_hard_empty ();
> +    return info;
> +  }
> +
>    void set_valid () { m_state = VALID; }
>    void set_unknown () { m_state = UNKNOWN; }
>    void set_empty () { m_state = EMPTY; }
> +  void set_hard_empty () { m_state = HARD_EMPTY; }
>    void set_dirty (enum fusion_type type)
>    {
>      gcc_assert (type == VALID_AVL_FUSION || type == KILLED_AVL_FUSION);
> --
> 2.36.3
>
>
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index b33c198bbd6..253bfc7b210 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -54,6 +54,8 @@  along with GCC; see the file COPYING3.  If not see
        used any more and VL operand of VSETVL instruction if it is not used by
        any non-debug instructions.
 
+    -  Phase 6 - Propagate AVL between vsetvl instructions.
+
     Implementation:
 
     -  The subroutine of optimize == 0 is simple_vsetvl.
@@ -175,8 +177,20 @@  vector_config_insn_p (rtx_insn *rinsn)
 static bool
 vsetvl_insn_p (rtx_insn *rinsn)
 {
+  if (!vector_config_insn_p (rinsn))
+    return false;
   return (INSN_CODE (rinsn) == CODE_FOR_vsetvldi
-	 || INSN_CODE (rinsn) == CODE_FOR_vsetvlsi);
+	  || INSN_CODE (rinsn) == CODE_FOR_vsetvlsi);
+}
+
+/* Return true if it is vsetvl zero, rs1.  */
+static bool
+vsetvl_discard_result_insn_p (rtx_insn *rinsn)
+{
+  if (!vector_config_insn_p (rinsn))
+    return false;
+  return (INSN_CODE (rinsn) == CODE_FOR_vsetvl_discard_resultdi
+	  || INSN_CODE (rinsn) == CODE_FOR_vsetvl_discard_resultsi);
 }
 
 static bool
@@ -191,15 +205,27 @@  before_p (const insn_info *insn1, const insn_info *insn2)
   return insn1->compare_with (insn2) < 0;
 }
 
-static bool
-reg_killed_by_bb_p (const bb_info *bb, rtx x)
+static insn_info *
+find_reg_killed_by (const bb_info *bb, rtx x)
 {
-  if (!x || vlmax_avl_p (x))
-    return false;
-  for (const insn_info *insn : bb->real_nondebug_insns ())
+  if (!x || vlmax_avl_p (x) || !REG_P (x))
+    return nullptr;
+  for (insn_info *insn : bb->reverse_real_nondebug_insns ())
     if (find_access (insn->defs (), REGNO (x)))
-      return true;
-  return false;
+      return insn;
+  return nullptr;
+}
+
+/* Helper function to get VL operand.  */
+static rtx
+get_vl (rtx_insn *rinsn)
+{
+  if (has_vl_op (rinsn))
+    {
+      extract_insn_cached (rinsn);
+      return recog_data.operand[get_attr_vl_op_idx (rinsn)];
+    }
+  return SET_DEST (XVECEXP (PATTERN (rinsn), 0, 0));
 }
 
 static bool
@@ -208,6 +234,9 @@  has_vsetvl_killed_avl_p (const bb_info *bb, const vector_insn_info &info)
   if (info.dirty_with_killed_avl_p ())
     {
       rtx avl = info.get_avl ();
+      if (vlmax_avl_p (avl))
+	return find_reg_killed_by (bb, get_vl (info.get_insn ()->rtl ()))
+	       != nullptr;
       for (const insn_info *insn : bb->reverse_real_nondebug_insns ())
 	{
 	  def_info *def = find_access (insn->defs (), REGNO (avl));
@@ -229,18 +258,6 @@  has_vsetvl_killed_avl_p (const bb_info *bb, const vector_insn_info &info)
   return false;
 }
 
-/* Helper function to get VL operand.  */
-static rtx
-get_vl (rtx_insn *rinsn)
-{
-  if (has_vl_op (rinsn))
-    {
-      extract_insn_cached (rinsn);
-      return recog_data.operand[get_attr_vl_op_idx (rinsn)];
-    }
-  return SET_DEST (XVECEXP (PATTERN (rinsn), 0, 0));
-}
-
 /* An "anticipatable occurrence" is one that is the first occurrence in the
    basic block, the operands are not modified in the basic block prior
    to the occurrence and the output is not used between the start of
@@ -419,30 +436,30 @@  backward_propagate_worthwhile_p (const basic_block cfg_bb,
   return true;
 }
 
-/* Helper function to get AVL operand.  */
-static rtx
-get_avl (rtx_insn *rinsn)
+static bool
+insn_should_be_added_p (const insn_info *insn, unsigned int types)
 {
-  if (vsetvl_insn_p (rinsn))
-    return XVECEXP (SET_SRC (XVECEXP (PATTERN (rinsn), 0, 0)), 0, 0);
-
-  if (!has_vl_op (rinsn))
-    return NULL_RTX;
-  if (get_attr_avl_type (rinsn) == VLMAX)
-    return RVV_VLMAX;
-  extract_insn_cached (rinsn);
-  return recog_data.operand[get_attr_vl_op_idx (rinsn)];
+  if (insn->is_real () && (types & REAL_SET))
+    return true;
+  if (insn->is_phi () && (types & PHI_SET))
+    return true;
+  if (insn->is_bb_head () && (types & BB_HEAD_SET))
+    return true;
+  if (insn->is_bb_end () && (types & BB_END_SET))
+    return true;
+  return false;
 }
 
-/* Recursively find all real define instructions if it is a real instruction. */
-static hash_set<insn_info *>
-get_all_nonphi_defs (phi_info *phi)
+/* Recursively find all define instructions. The kind of instruction is
+   specified by the DEF_TYPE.  */
+static hash_set<set_info *>
+get_all_sets (phi_info *phi, unsigned int types)
 {
-  hash_set<insn_info *> insns;
+  hash_set<set_info *> insns;
   auto_vec<phi_info *> work_list;
   hash_set<phi_info *> visited_list;
   if (!phi)
-    return insns;
+    return hash_set<set_info *> ();
   work_list.safe_push (phi);
 
   while (!work_list.is_empty ())
@@ -452,20 +469,17 @@  get_all_nonphi_defs (phi_info *phi)
       for (use_info *use : phi->inputs ())
 	{
 	  def_info *def = use->def ();
-	  if (!def)
-	    {
-	      /* if def is null, treat undefined */
-	      insns.empty ();
-	      return insns;
-	    }
+	  set_info *set = safe_dyn_cast<set_info *> (def);
+	  if (!set)
+	    return hash_set<set_info *> ();
 
-	  gcc_assert (!def->insn ()->is_debug_insn ());
+	  gcc_assert (!set->insn ()->is_debug_insn ());
 
-	  if (!def->insn ()->is_phi ())
-	    insns.add (def->insn ());
-	  if (def->insn ()->is_phi ())
+	  if (insn_should_be_added_p (set->insn (), types))
+	    insns.add (set);
+	  if (set->insn ()->is_phi ())
 	    {
-	      phi_info *new_phi = as_a<phi_info *> (def);
+	      phi_info *new_phi = as_a<phi_info *> (set);
 	      if (!visited_list.contains (new_phi))
 		work_list.safe_push (new_phi);
 	    }
@@ -474,6 +488,47 @@  get_all_nonphi_defs (phi_info *phi)
   return insns;
 }
 
+static hash_set<set_info *>
+get_all_sets (set_info *set, bool /* get_real_inst */ real_p,
+	      bool /*get_phi*/ phi_p, bool /* get_function_parameter*/ param_p)
+{
+  if (real_p && phi_p && param_p)
+    return get_all_sets (safe_dyn_cast<phi_info *> (set),
+			 REAL_SET | PHI_SET | BB_HEAD_SET | BB_END_SET);
+
+  else if (real_p && param_p)
+    return get_all_sets (safe_dyn_cast<phi_info *> (set),
+			 REAL_SET | BB_HEAD_SET | BB_END_SET);
+
+  else if (real_p)
+    return get_all_sets (safe_dyn_cast<phi_info *> (set), REAL_SET);
+  return hash_set<set_info *> ();
+}
+
+/* Helper function to get AVL operand.  */
+static rtx
+get_avl (rtx_insn *rinsn)
+{
+  if (vsetvl_insn_p (rinsn) || vsetvl_discard_result_insn_p (rinsn))
+    return XVECEXP (SET_SRC (XVECEXP (PATTERN (rinsn), 0, 0)), 0, 0);
+
+  if (!has_vl_op (rinsn))
+    return NULL_RTX;
+  if (get_attr_avl_type (rinsn) == VLMAX)
+    return RVV_VLMAX;
+  extract_insn_cached (rinsn);
+  return recog_data.operand[get_attr_vl_op_idx (rinsn)];
+}
+
+static set_info *
+get_same_bb_set (hash_set<set_info *> &sets, const basic_block cfg_bb)
+{
+  for (set_info *set : sets)
+    if (set->bb ()->cfg_bb () == cfg_bb)
+      return set;
+  return nullptr;
+}
+
 /* Recursively find all predecessor blocks for cfg_bb. */
 static hash_set<basic_block>
 get_all_predecessors (basic_block cfg_bb)
@@ -501,10 +556,10 @@  get_all_predecessors (basic_block cfg_bb)
 
 /* Return true if there is an INSN in insns staying in the block BB.  */
 static bool
-any_insn_in_bb_p (hash_set<insn_info *> insns, const bb_info *bb)
+any_set_in_bb_p (hash_set<set_info *> sets, const bb_info *bb)
 {
-  for (const insn_info *insn : insns)
-    if (insn->bb ()->index () == bb->index ())
+  for (const set_info *set : sets)
+    if (set->bb ()->index () == bb->index ())
       return true;
   return false;
 }
@@ -834,10 +889,6 @@  insert_insn_end_basic_block (rtx_insn *rinsn, basic_block cfg_bb)
 static vl_vtype_info
 get_vl_vtype_info (const insn_info *insn)
 {
-  if (vector_config_insn_p (insn->rtl ()))
-    gcc_assert (vsetvl_insn_p (insn->rtl ())
-		&& "Can't handle X0, rs1 vsetvli yet");
-
   set_info *set = nullptr;
   rtx avl = ::get_avl (insn->rtl ());
   if (avl && REG_P (avl) && !vlmax_avl_p (avl))
@@ -942,8 +993,12 @@  change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info)
 }
 
 static bool
-source_equal_p (rtx_insn *rinsn1, rtx_insn *rinsn2)
+source_equal_p (insn_info *insn1, insn_info *insn2)
 {
+  if (!insn1 || !insn2)
+    return false;
+  rtx_insn *rinsn1 = insn1->rtl ();
+  rtx_insn *rinsn2 = insn2->rtl ();
   if (!rinsn1 || !rinsn2)
     return false;
   rtx note1 = find_reg_equal_equiv_note (rinsn1);
@@ -953,40 +1008,70 @@  source_equal_p (rtx_insn *rinsn1, rtx_insn *rinsn2)
 
   if (note1 && note2 && rtx_equal_p (note1, note2))
     return true;
-  if (single_set1 && single_set2
-      && rtx_equal_p (SET_SRC (single_set1), SET_SRC (single_set2)))
-    return true;
-  return false;
+
+  /* Since vsetvl instruction is not single SET.
+     We handle this case specially here.  */
+  if (vsetvl_insn_p (insn1->rtl ()) && vsetvl_insn_p (insn2->rtl ()))
+    {
+      /* For example:
+	   vsetvl1 a6,a5,e32m1
+	   RVV 1 (use a6 as AVL)
+	   vsetvl2 a5,a5,e8mf4
+	   RVV 2 (use a5 as AVL)
+	 We consider AVL of RVV 1 and RVV 2 are same so that we can
+	 gain more optimization opportunities.
+
+	 Note: insn1_info.compatible_avl_p (insn2_info)
+	 will make sure there is no instruction between vsetvl1 and vsetvl2
+	 modify a5 since their def will be different if there is instruction
+	 modify a5 and compatible_avl_p will return false.  */
+      vector_insn_info insn1_info, insn2_info;
+      insn1_info.parse_insn (insn1);
+      insn2_info.parse_insn (insn2);
+      if (insn1_info.same_vlmax_p (insn2_info)
+	  && insn1_info.compatible_avl_p (insn2_info))
+	return true;
+    }
+
+  /* We only handle AVL is set by instructions with no side effects.  */
+  if (!single_set1 || !single_set2)
+    return false;
+  if (!rtx_equal_p (SET_SRC (single_set1), SET_SRC (single_set2)))
+    return false;
+  gcc_assert (insn1->uses ().size () == insn2->uses ().size ());
+  for (size_t i = 0; i < insn1->uses ().size (); i++)
+    if (insn1->uses ()[i] != insn2->uses ()[i])
+      return false;
+  return true;
 }
 
 /* Helper function to get single same real RTL source.
    return NULL if it is not a single real RTL source.  */
-static rtx_insn *
+static insn_info *
 extract_single_source (set_info *set)
 {
   if (!set)
     return nullptr;
   if (set->insn ()->is_real ())
-    return set->insn ()->rtl ();
+    return set->insn ();
   if (!set->insn ()->is_phi ())
     return nullptr;
-  phi_info *phi = safe_dyn_cast<phi_info *> (set);
-  hash_set<insn_info *> insns = get_all_nonphi_defs (phi);
+  hash_set<set_info *> sets = get_all_sets (set, true, false, true);
 
-  insn_info *first_insn = (*insns.begin ());
+  insn_info *first_insn = (*sets.begin ())->insn ();
   if (first_insn->is_artificial ())
     return nullptr;
-  for (const insn_info *insn : insns)
+  for (const set_info *set : sets)
     {
       /* If there is a head or end insn, we conservative return
 	 NULL so that VSETVL PASS will insert vsetvl directly.  */
-      if (insn->is_artificial ())
+      if (set->insn ()->is_artificial ())
 	return nullptr;
-      if (!source_equal_p (insn->rtl (), first_insn->rtl ()))
+      if (!source_equal_p (set->insn (), first_insn))
 	return nullptr;
     }
 
-  return (*insns.begin ())->rtl ();
+  return first_insn;
 }
 
 avl_info::avl_info (const avl_info &other)
@@ -1004,9 +1089,82 @@  avl_info::single_source_equal_p (const avl_info &other) const
 {
   set_info *set1 = m_source;
   set_info *set2 = other.get_source ();
-  rtx_insn *rinsn1 = extract_single_source (set1);
-  rtx_insn *rinsn2 = extract_single_source (set2);
-  return source_equal_p (rinsn1, rinsn2);
+  insn_info *insn1 = extract_single_source (set1);
+  insn_info *insn2 = extract_single_source (set2);
+  if (!insn1 || !insn2)
+    return false;
+  return source_equal_p (insn1, insn2);
+}
+
+bool
+avl_info::multiple_source_equal_p (const avl_info &other) const
+{
+  /* TODO: We don't do too much optimization here since it's
+     too complicated in case of analyzing the PHI node.
+
+     For example:
+       void f (void * restrict in, void * restrict out, int n, int m, int cond)
+	{
+	  size_t vl;
+	  switch (cond)
+	  {
+	  case 1:
+	    vl = 100;
+	    break;
+	  case 2:
+	    vl = *(size_t*)(in + 100);
+	    break;
+	  case 3:
+	    {
+	      size_t new_vl = *(size_t*)(in + 500);
+	      size_t new_vl2 = *(size_t*)(in + 600);
+	      vl = new_vl + new_vl2 + 777;
+	      break;
+	    }
+	  default:
+	    vl = 4000;
+	    break;
+	  }
+	  for (size_t i = 0; i < n; i++)
+	    {
+	      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
+	      __riscv_vse8_v_i8mf8 (out + i, v, vl);
+
+	      vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 100, vl);
+	      __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
+	    }
+
+	  size_t vl2;
+	  switch (cond)
+	  {
+	  case 1:
+	    vl2 = 100;
+	    break;
+	  case 2:
+	    vl2 = *(size_t*)(in + 100);
+	    break;
+	  case 3:
+	    {
+	      size_t new_vl = *(size_t*)(in + 500);
+	      size_t new_vl2 = *(size_t*)(in + 600);
+	      vl2 = new_vl + new_vl2 + 777;
+	      break;
+	    }
+	  default:
+	    vl2 = 4000;
+	    break;
+	  }
+	  for (size_t i = 0; i < m; i++)
+	    {
+	      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl2);
+	      __riscv_vse8_v_i8mf8 (out + i + 300, v, vl2);
+	      vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 200, vl2);
+	      __riscv_vse8_v_i8mf8 (out + i + 200, v2, vl2);
+	    }
+	}
+     Such case may not be necessary to optimize since the codes of defining
+     vl and vl2 are redundant.  */
+  return m_source == other.get_source ();
 }
 
 avl_info &
@@ -1025,11 +1183,6 @@  avl_info::operator== (const avl_info &other) const
   if (!other.get_value ())
     return false;
 
-  /* It's safe to consider they are equal if their RTX value are
-     strictly the same.  */
-  if (m_value == other.get_value ())
-    return true;
-
   if (GET_CODE (m_value) != GET_CODE (other.get_value ()))
     return false;
 
@@ -1041,10 +1194,6 @@  avl_info::operator== (const avl_info &other) const
   if (vlmax_avl_p (m_value))
     return vlmax_avl_p (other.get_value ());
 
-  /* If Pseudo REGNO are same, it's safe to consider they are same.  */
-  if (ORIGINAL_REGNO (m_value) == ORIGINAL_REGNO (other.get_value ()))
-    return true;
-
   /* If any source is undef value, we think they are not equal.  */
   if (!m_source || !other.get_source ())
     return false;
@@ -1054,9 +1203,7 @@  avl_info::operator== (const avl_info &other) const
   if (single_source_equal_p (other))
     return true;
 
-  /* TODO: Support avl defined by PHI which includes multiple different insn
-   * later.  */
-  return false;
+  return multiple_source_equal_p (other);
 }
 
 bool
@@ -1078,7 +1225,7 @@  vl_vtype_info::vl_vtype_info (avl_info avl_in, uint8_t sew_in,
 bool
 vl_vtype_info::operator== (const vl_vtype_info &other) const
 {
-  return m_avl == other.get_avl_info () && m_sew == other.get_sew ()
+  return same_avl_p (other) && m_sew == other.get_sew ()
 	 && m_vlmul == other.get_vlmul () && m_ta == other.get_ta ()
 	 && m_ma == other.get_ma () && m_ratio == other.get_ratio ();
 }
@@ -1102,7 +1249,12 @@  vl_vtype_info::has_non_zero_avl () const
 bool
 vl_vtype_info::same_avl_p (const vl_vtype_info &other) const
 {
-  return get_avl () == other.get_avl ();
+  /* We need to compare both RTL and SET. If both AVL are CONST_INT.
+     For example, const_int 3 and const_int 4, we need to compare
+     RTL. If both AVL are REG and their REGNO are same, we need to
+     compare SET.  */
+  return get_avl () == other.get_avl ()
+	 && get_avl_source () == other.get_avl_source ();
 }
 
 bool
@@ -1283,6 +1435,25 @@  vector_insn_info::parse_insn (insn_info *insn)
     m_demands[DEMAND_TAIL_POLICY] = true;
   if (get_attr_ma (insn->rtl ()) != INVALID_ATTRIBUTE)
     m_demands[DEMAND_MASK_POLICY] = true;
+
+  if (vector_config_insn_p (insn->rtl ()))
+    return;
+
+  if (!has_avl_reg () || !m_avl.get_source ()
+      || !m_avl.get_source ()->insn ()->is_phi ())
+    return;
+
+  insn_info *def_insn = extract_single_source (m_avl.get_source ());
+  if (def_insn)
+    {
+      vector_insn_info new_info;
+      new_info.parse_insn (def_insn);
+      if (!same_vlmax_p (new_info))
+	return;
+      /* TODO: Currently, we don't forward AVL for non-VLMAX vsetvl.  */
+      if (vlmax_avl_p (new_info.get_avl ()))
+	set_avl_info (new_info.get_avl_info ());
+    }
 }
 
 void
@@ -1396,12 +1567,21 @@  vector_insn_info::compatible_p (const vl_vtype_info &curr_info) const
   return compatible_avl_p (curr_info) && compatible_vtype_p (curr_info);
 }
 
+bool
+vector_insn_info::available_p (const vector_insn_info &other) const
+{
+  if (*this >= other)
+    return true;
+  return false;
+}
+
 vector_insn_info
 vector_insn_info::merge (const vector_insn_info &merge_info,
 			 enum merge_type type = LOCAL_MERGE) const
 {
-  gcc_assert (this->compatible_p (merge_info)
-	      && "Can't merge incompatible demanded infos");
+  if (!vsetvl_insn_p (get_insn ()->rtl ()))
+    gcc_assert (this->compatible_p (merge_info)
+		&& "Can't merge incompatible demanded infos");
 
   vector_insn_info new_info;
   new_info.demand_vl_vtype ();
@@ -1513,6 +1693,8 @@  vector_insn_info::dump (FILE *file) const
     fprintf (file, "UNKNOWN,");
   else if (empty_p ())
     fprintf (file, "EMPTY,");
+  else if (hard_empty_p ())
+    fprintf (file, "HARD_EMPTY,");
   else if (dirty_with_killed_avl_p ())
     fprintf (file, "DIRTY_WITH_KILLED_AVL,");
   else
@@ -1606,7 +1788,7 @@  vector_infos_manager::get_all_available_exprs (
 {
   auto_vec<size_t> available_list;
   for (size_t i = 0; i < vector_exprs.length (); i++)
-    if (info >= *vector_exprs[i])
+    if (info.available_p (*vector_exprs[i]))
       available_list.safe_push (i);
   return available_list;
 }
@@ -1862,14 +2044,16 @@  private:
   /* Phase 3.  */
   enum fusion_type get_backward_fusion_type (const bb_info *,
 					     const vector_insn_info &);
+  bool hard_empty_block_p (const bb_info *, const vector_insn_info &) const;
   bool backward_demand_fusion (void);
   bool forward_demand_fusion (void);
+  bool cleanup_illegal_dirty_blocks (void);
   void demand_fusion (void);
 
   /* Phase 4.  */
   void prune_expressions (void);
   void compute_local_properties (void);
-  bool can_refine_vsetvl_p (const basic_block, uint8_t) const;
+  bool can_refine_vsetvl_p (const basic_block, const vector_insn_info &) const;
   void refine_vsetvls (void) const;
   void cleanup_vsetvls (void);
   bool commit_vsetvls (void);
@@ -1878,6 +2062,9 @@  private:
   /* Phase 5.  */
   void cleanup_insns (void) const;
 
+  /* Phase 6.  */
+  void propagate_avl (void) const;
+
   void init (void);
   void done (void);
   void compute_probabilities (void);
@@ -2079,134 +2266,241 @@  pass_vsetvl::get_backward_fusion_type (const bb_info *bb,
 
   gcc_assert (reg);
   def_info *def = find_access (insn->uses (), REGNO (reg))->def ();
-  if (def->insn ()->is_phi ())
+  if (!def->insn ()->is_phi () && def->insn ()->bb () == insn->bb ())
+    return INVALID_FUSION;
+  hash_set<set_info *> sets
+    = get_all_sets (prop.get_avl_source (), true, true, true);
+  if (any_set_in_bb_p (sets, insn->bb ()))
+    return INVALID_FUSION;
+
+  if (vlmax_avl_p (prop.get_avl ()))
     {
-      hash_set<insn_info *> insns
-	= get_all_nonphi_defs (as_a<phi_info *> (def));
-      if (any_insn_in_bb_p (insns, insn->bb ()))
+      if (find_reg_killed_by (bb, reg))
 	return INVALID_FUSION;
+      else
+	return VALID_AVL_FUSION;
     }
-  else
-    {
-      if (def->insn ()->bb () == insn->bb ())
-	return INVALID_FUSION;
+
+  /* By default, we always enable backward fusion so that we can
+     gain more optimizations.  */
+  if (!find_reg_killed_by (bb, reg))
+    return VALID_AVL_FUSION;
+  return KILLED_AVL_FUSION;
+}
+
+/* We almost enable all cases in get_backward_fusion_type, this function
+   disable the backward fusion by changing dirty blocks into hard empty
+   blocks in forward dataflow. We can have more accurate optimization by
+   this method.  */
+bool
+pass_vsetvl::hard_empty_block_p (const bb_info *bb,
+				 const vector_insn_info &info) const
+{
+  if (!info.dirty_p () || !info.has_avl_reg ())
+    return false;
+
+  basic_block cfg_bb = bb->cfg_bb ();
+  sbitmap avin = m_vector_manager->vector_avin[cfg_bb->index];
+  rtx avl = vlmax_avl_p (info.get_avl ()) ? get_vl (info.get_insn ()->rtl ())
+					  : get_avl (info.get_insn ()->rtl ());
+  insn_info *insn = info.get_insn ();
+  set_info *set = find_access (insn->uses (), REGNO (avl))->def ();
+  hash_set<set_info *> sets = get_all_sets (set, true, false, false);
+  hash_set<basic_block> pred_cfg_bbs = get_all_predecessors (cfg_bb);
+
+  if (find_reg_killed_by (bb, avl))
+    {
+      /* Condition 1:
+	 Dirty block with killed AVL means that the empty block (no RVV
+	 instructions) are polluted as Dirty blocks with the value of current
+	 AVL is killed. For example:
+	      bb 0:
+		...
+	      bb 1:
+		def a5
+	      bb 2:
+		RVV (use a5)
+	 In backward dataflow, we will pollute BB0 and BB1 as Dirty with AVL
+	 killed, since a5 is killed in BB1.
+	 In this case, let's take a look at this example:
+
+	      bb 3:        bb 4:
+		def3 a5       def4 a5
+	      bb 5:        bb 6:
+		def1 a5       def2 a5
+		    \         /
+		     \       /
+		      \     /
+		       \   /
+			bb 7:
+		    RVV (use a5)
+	 In this case, we can pollute BB5 and BB6 as dirty if get-def
+	 of a5 from RVV instruction in BB7 is the def1 in BB5 and
+	 def2 in BB6, so we can return false early here for HARD_EMPTY_BLOCK_P.
+	 However, we are not sure whether BB3 and BB4 can be
+	 polluted as Dirty with AVL killed so we can't return false
+	 for HARD_EMPTY_BLOCK_P here since it's too early which will
+	 potentially produce issues.  */
+      gcc_assert (info.dirty_with_killed_avl_p ());
+      if (info.get_avl_source ()
+	  && get_same_bb_set (sets, bb->cfg_bb ()) == info.get_avl_source ())
+	return false;
     }
 
-  rtx new_reg = gen_rtx_REG (GET_MODE (reg), REGNO (reg));
-  gcc_assert (new_reg != reg);
-  const avl_info info = avl_info (new_reg, safe_dyn_cast<set_info *> (def));
-  if (prop.dirty_with_killed_avl_p ())
+  /* Condition 2:
+     Suppress the VL/VTYPE info backward propagation too early:
+			 ________
+			|   BB0  |
+			|________|
+			    |
+			____|____
+			|   BB1  |
+			|________|
+     In this case, suppose BB 1 has multiple predecessors, BB 0 is one
+     of them. BB1 has VL/VTYPE info (may be VALID or DIRTY) to backward
+     propagate.
+     The AVIN (available in) which is calculated by LCM is empty only
+     in these 2 circumstances:
+       1. all predecessors of BB1 are empty (not VALID
+	  and can not be polluted in backward fusion flow)
+       2. VL/VTYPE info of BB1 predecessors are conflict.
+
+     We keep it as dirty in 2nd circumstance and set it as HARD_EMPTY
+     (can not be polluted as DIRTY any more) in 1st circumstance.
+     We don't backward propagate in 1st circumstance since there is
+     no VALID RVV instruction and no polluted blocks (dirty blocks)
+     by backward propagation from other following blocks.
+     It's meaningless to keep it as Dirty anymore.
+
+     However, since we keep it as dirty in the 2nd circumstance (there are
+     VALID or Dirty blocks in predecessors), we can still gain the benefits and
+     optimization opportunities. For example, in this case:
+	for (size_t i = 0; i < n; i++)
+	 {
+	   if (i != cond) {
+	     vint8mf8_t v = *(vint8mf8_t*)(in + i + 100);
+	     *(vint8mf8_t*)(out + i + 100) = v;
+	   } else {
+	     vbool1_t v = *(vbool1_t*)(in + i + 400);
+	     *(vbool1_t*)(out + i + 400) = v;
+	   }
+	 }
+     VL/VTYPE in if-else are conflicting, which will produce empty AVIN LCM result
+     but we can still keep dirty blocks if *(i != cond)* is very unlikely then
+     we can preset vsetvl (VL/VTYPE) info from else (static probability model).
+
+     We don't want to backward propagate VL/VTYPE information too early
+     which is not the optimal and may potentially produce issues.  */
+  if (bitmap_empty_p (avin))
     {
-      unsigned int bb_index;
-      sbitmap_iterator sbi;
-      sbitmap bitdata = m_vector_manager->vector_avout[bb->index ()];
-      bool has_valid_avl = false;
-      EXECUTE_IF_SET_IN_BITMAP (bitdata, 0, bb_index, sbi)
-      {
-	const vector_insn_info *expr = m_vector_manager->vector_exprs[bb_index];
-	if (expr->compatible_avl_p (info))
-	  {
-	    has_valid_avl = true;
-	    break;
-	  }
-      }
-      if (!has_valid_avl)
-	return INVALID_FUSION;
+      bool hard_empty_p = true;
+      for (const basic_block pred_cfg_bb : pred_cfg_bbs)
+	{
+	  if (pred_cfg_bb == ENTRY_BLOCK_PTR_FOR_FN (cfun))
+	    continue;
+	  sbitmap avout = m_vector_manager->vector_avout[pred_cfg_bb->index];
+	  if (!bitmap_empty_p (avout))
+	    {
+	      hard_empty_p = false;
+	      break;
+	    }
+	}
+      if (hard_empty_p)
+	return true;
     }
 
-  if (reg_killed_by_bb_p (bb, reg))
+  edge e;
+  edge_iterator ei;
+  bool has_avl_killed_insn_p = false;
+  FOR_EACH_EDGE (e, ei, cfg_bb->succs)
     {
-      unsigned int bb_index;
-      sbitmap_iterator sbi;
-      sbitmap bitdata = m_vector_manager->vector_avin[bb->index ()];
-      hash_set<basic_block> blocks = get_all_predecessors (bb->cfg_bb ());
-      for (const auto block : blocks)
-	if (block == insn->bb ()->cfg_bb ())
-	  return INVALID_FUSION;
-      if (bitmap_empty_p (bitdata))
+      const auto block_info
+	= m_vector_manager->vector_block_infos[e->dest->index];
+      if (block_info.local_dem.dirty_with_killed_avl_p ())
 	{
-	  /* void f (int8_t *restrict in, int8_t *restrict out, int n, int m,
-		    unsigned cond, size_t vl)
-	  {
-	    vbool64_t mask = *(vbool64_t *) (in + 1000000);
-
-	    vl = 101;
-	    if (cond > 0)
-	      {
-	      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in, vl);
-	      __riscv_vse8_v_i8mf8 (out, v, vl);
-	      }
-	    else
-	      {
-	      out[100] = out[100] + 300;
-	      }
-
-	    for (size_t i = 0; i < n; i++)
-	      {
-	      vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((in + i + 200), vl);
-	      __riscv_vse32_v_f32mf2 ((out + i + 200), v, vl);
+	  has_avl_killed_insn_p = true;
+	  break;
+	}
+    }
+  if (!has_avl_killed_insn_p)
+    return false;
 
-	      vfloat32mf2_t v2
-		= __riscv_vle32_v_f32mf2_tumu (mask, v, (in + i + 300), vl);
-	      __riscv_vse32_v_f32mf2_m (mask, (out + i + 300), v2, vl);
-	      }
-	  }  */
-	  for (const auto block : blocks)
+  bool any_set_in_bbs_p = false;
+  for (const basic_block pred_cfg_bb : pred_cfg_bbs)
+    {
+      insn_info *def_insn = extract_single_source (set);
+      if (def_insn)
+	{
+	  /* Condition 3:
+
+	    Case 1:                               Case 2:
+		bb 0:                                 bb 0:
+		  def a5 101                             ...
+		bb 1:                                 bb 1:
+		  ...                                    ...
+		bb 2:                                 bb 2:
+		  RVV 1 (use a5 with TAIL ANY)           ...
+		bb 3:                                 bb 3:
+		  def a5 101                             def a5 101
+		bb 4:                                 bb 4:
+		  ...                                    ...
+		bb 5:                                 bb 5:
+		  RVV 2 (use a5 with TU)                 RVV 1 (use a5)
+
+	    Case 1: We can pollute BB3,BB2,BB1,BB0 so they are all Dirty blocks
+	    with killed AVL so that we can merge TU demand info from RVV 2
+	    into RVV 1 and elide the vsetvl instruction in BB5.
+
+	    TODO: We only optimize for single source def since multiple source
+	    def is quite complicated.
+
+	    Case 2: We can only pollute bb 3 as dirty and it has been accepted
+	    in Condition 2 and we can't pollute BB3,BB2,BB1,BB0 like case 1. */
+	  insn_info *last_killed_insn
+	    = find_reg_killed_by (crtl->ssa->bb (pred_cfg_bb), avl);
+	  if (!last_killed_insn || pred_cfg_bb == def_insn->bb ()->cfg_bb ())
+	    continue;
+	  if (source_equal_p (last_killed_insn, def_insn))
 	    {
-	      if (block == ENTRY_BLOCK_PTR_FOR_FN (cfun))
-		continue;
-	      sbitmap avout = m_vector_manager->vector_avout[block->index];
-	      EXECUTE_IF_SET_IN_BITMAP (avout, 0, bb_index, sbi)
-	      {
-		const vector_insn_info *expr
-		  = m_vector_manager->vector_exprs[bb_index];
-		if (expr->compatible_avl_p (info))
-		  return KILLED_AVL_FUSION;
-	      }
+	      any_set_in_bbs_p = true;
+	      break;
 	    }
-	  return INVALID_FUSION;
 	}
       else
 	{
-	  /* void f (int8_t * restrict in, int8_t * restrict out, int n, int
-	      m, unsigned cond, size_t vl)
-	      {
-		vbool64_t mask = *(vbool64_t *) (in + 1000000);
-
-		vl = 101;
-		if (cond > 0)
-		  {
-		  vint8mf8_t v = __riscv_vle8_v_i8mf8 (in, vl);
-		  __riscv_vse8_v_i8mf8 (out, v, vl);
-		  }
-		else
-		  {
-		  vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + 1000, vl);
-		  __riscv_vse8_v_i8mf8 (out + 1000, v, vl);
-		  }
-
-		for (size_t i = 0; i < n; i++)
-		  {
-		  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((in + i + 200), vl);
-		  __riscv_vse32_v_f32mf2 ((out + i + 200), v, vl);
-
-		  vfloat32mf2_t v2
-		    = __riscv_vle32_v_f32mf2_tumu (mask, v, (in + i + 300), vl);
-		  __riscv_vse32_v_f32mf2_m (mask, (out + i + 300), v2, vl);
-		  }
-	  }  */
-	  EXECUTE_IF_SET_IN_BITMAP (bitdata, 0, bb_index, sbi)
-	  {
-	    const vector_insn_info *expr
-	      = m_vector_manager->vector_exprs[bb_index];
-	    if (expr->compatible_avl_p (info))
-	      return KILLED_AVL_FUSION;
-	  }
+	  /* Condition 4:
+
+	      bb 0:        bb 1:         bb 3:
+		def1 a5       def2 a5     ...
+		    \         /            /
+		     \       /            /
+		      \     /            /
+		       \   /            /
+			bb 4:          /
+			 |            /
+			 |           /
+			bb 5:       /
+			 |         /
+			 |        /
+			bb 6:    /
+			 |      /
+			 |     /
+			  bb 8:
+			RVV 1 (use a5)
+	  If we get-def (REAL) of a5 from RVV 1 instruction, we will get
+	  def1 from BB0 and def2 from BB1. So we will pollute BB6,BB5,BB4,
+	  BB0,BB1 with DIRTY and set BB3 as HARD_EMPTY so that we won't
+	  propagate AVL to BB3.  */
+	  if (any_set_in_bb_p (sets, crtl->ssa->bb (pred_cfg_bb)))
+	    {
+	      any_set_in_bbs_p = true;
+	      break;
+	    }
 	}
-      return INVALID_FUSION;
     }
-
-  return prop.dirty_with_killed_avl_p () ? KILLED_AVL_FUSION : VALID_AVL_FUSION;
+  if (!any_set_in_bbs_p)
+    return true;
+  return false;
 }
 
 /* Compute global backward demanded info.  */
@@ -2272,6 +2566,8 @@  pass_vsetvl::backward_demand_fusion (void)
 
 	  if (block_info.reaching_out.unknown_p ())
 	    continue;
+	  else if (block_info.reaching_out.hard_empty_p ())
+	    continue;
 	  else if (block_info.reaching_out.empty_p ())
 	    {
 	      enum fusion_type type
@@ -2281,6 +2577,17 @@  pass_vsetvl::backward_demand_fusion (void)
 
 	      block_info.reaching_out = prop;
 	      block_info.reaching_out.set_dirty (type);
+
+	      if (prop.has_avl_reg () && !vlmax_avl_p (prop.get_avl ()))
+		{
+		  hash_set<set_info *> sets
+		    = get_all_sets (prop.get_avl_source (), true, true, true);
+		  set_info *set = get_same_bb_set (sets, e->src);
+		  if (set)
+		    block_info.reaching_out.set_avl_info (
+		      avl_info (prop.get_avl (), set));
+		}
+
 	      block_info.local_dem = block_info.reaching_out;
 	      block_info.probability = curr_block_info.probability;
 	      changed_p = true;
@@ -2294,22 +2601,28 @@  pass_vsetvl::backward_demand_fusion (void)
 		{
 		  if (block_info.reaching_out >= prop)
 		    continue;
-		  block_info.probability += curr_block_info.probability;
 		  new_info = block_info.reaching_out.merge (prop, GLOBAL_MERGE);
+		  new_info.set_dirty (
+		    block_info.reaching_out.dirty_with_killed_avl_p ());
+		  block_info.probability += curr_block_info.probability;
 		}
 	      else
 		{
 		  if (curr_block_info.probability > block_info.probability)
 		    {
+		      enum fusion_type type
+			= get_backward_fusion_type (crtl->ssa->bb (e->src),
+						    prop);
+		      if (type == INVALID_FUSION)
+			continue;
 		      new_info = prop;
+		      new_info.set_dirty (type);
 		      block_info.probability = curr_block_info.probability;
 		    }
 		  else
 		    continue;
 		}
 
-	      new_info.set_dirty (
-		block_info.reaching_out.dirty_with_killed_avl_p ());
 	      block_info.local_dem = new_info;
 	      block_info.reaching_out = new_info;
 	      changed_p = true;
@@ -2319,10 +2632,28 @@  pass_vsetvl::backward_demand_fusion (void)
 	      /* We not only change the info during backward propagation,
 		 but also change the VSETVL instruction.  */
 	      gcc_assert (block_info.reaching_out.valid_p ());
-	      if (!block_info.reaching_out.compatible_p (prop))
-		continue;
-	      if (block_info.reaching_out >= prop)
-		continue;
+	      hash_set<set_info *> sets
+		= get_all_sets (prop.get_avl_source (), true, false, false);
+	      set_info *set = get_same_bb_set (sets, e->src);
+	      if (vsetvl_insn_p (block_info.reaching_out.get_insn ()->rtl ())
+		  && prop.has_avl_reg () && !vlmax_avl_p (prop.get_avl ()))
+		{
+		  if (!block_info.reaching_out.same_vlmax_p (prop))
+		    continue;
+		  if (block_info.reaching_out.same_vtype_p (prop))
+		    continue;
+		  if (!set)
+		    continue;
+		  if (set->insn () != block_info.reaching_out.get_insn ())
+		    continue;
+		}
+	      else
+		{
+		  if (!block_info.reaching_out.compatible_p (prop))
+		    continue;
+		  if (block_info.reaching_out >= prop)
+		    continue;
+		}
 
 	      vector_insn_info be_merged = block_info.reaching_out;
 	      if (block_info.local_dem == block_info.reaching_out)
@@ -2410,8 +2741,8 @@  pass_vsetvl::forward_demand_fusion (void)
 	  if (local_dem.dirty_p ())
 	    {
 	      gcc_assert (local_dem == reaching_out);
+	      new_info.set_dirty (local_dem.dirty_with_killed_avl_p ());
 	      local_dem = new_info;
-	      local_dem.set_dirty (local_dem.dirty_with_killed_avl_p ());
 	      reaching_out = local_dem;
 	    }
 	  else
@@ -2439,9 +2770,6 @@  pass_vsetvl::demand_fusion (void)
   while (changed_p)
     {
       changed_p = false;
-      prune_expressions ();
-      m_vector_manager->create_bitmap_vectors ();
-      compute_local_properties ();
       /* To optimize the case like this:
 	 void f2 (int8_t * restrict in, int8_t * restrict out, int n, int cond)
 	   {
@@ -2475,12 +2803,22 @@  pass_vsetvl::demand_fusion (void)
 	an AVL kill instruction in bb 2 that we can't backward fuse bb 3 or
 	forward bb 1 arbitrarily. We need available information of each block to
 	help for such cases.  */
+      changed_p |= backward_demand_fusion ();
+      changed_p |= forward_demand_fusion ();
+    }
+
+  changed_p = true;
+  while (changed_p)
+    {
+      changed_p = false;
+      prune_expressions ();
+      m_vector_manager->create_bitmap_vectors ();
+      compute_local_properties ();
       compute_available (m_vector_manager->vector_comp,
 			 m_vector_manager->vector_kill,
 			 m_vector_manager->vector_avout,
 			 m_vector_manager->vector_avin);
-      changed_p |= backward_demand_fusion ();
-      changed_p |= forward_demand_fusion ();
+      changed_p |= cleanup_illegal_dirty_blocks ();
       m_vector_manager->free_bitmap_vectors ();
       if (!m_vector_manager->vector_exprs.is_empty ())
 	m_vector_manager->vector_exprs.release ();
@@ -2498,6 +2836,34 @@  pass_vsetvl::demand_fusion (void)
     }
 }
 
+/* Cleanup illegal dirty blocks.  */
+bool
+pass_vsetvl::cleanup_illegal_dirty_blocks (void)
+{
+  bool changed_p = false;
+  for (const bb_info *bb : crtl->ssa->bbs ())
+    {
+      basic_block cfg_bb = bb->cfg_bb ();
+      const auto &prop
+	= m_vector_manager->vector_block_infos[cfg_bb->index].reaching_out;
+
+      /* If there is nothing to cleanup, just skip it.  */
+      if (!prop.valid_or_dirty_p ())
+	continue;
+
+      if (hard_empty_block_p (bb, prop))
+	{
+	  m_vector_manager->vector_block_infos[cfg_bb->index].local_dem
+	    = vector_insn_info::get_hard_empty ();
+	  m_vector_manager->vector_block_infos[cfg_bb->index].reaching_out
+	    = vector_insn_info::get_hard_empty ();
+	  changed_p = true;
+	  continue;
+	}
+    }
+  return changed_p;
+}
+
 /* Assemble the candidates expressions for LCM.  */
 void
 pass_vsetvl::prune_expressions (void)
@@ -2614,18 +2980,20 @@  pass_vsetvl::compute_local_properties (void)
 	     as long as the all real def insn of avl do not come from this
 	     block. This configuration may be still missing some optimization
 	     opportunities.  */
-	  if (reg_killed_by_bb_p (bb, expr->get_avl ()))
+	  if (find_reg_killed_by (bb, expr->get_avl ()))
 	    {
-	      hash_set<insn_info *> insns = get_all_nonphi_defs (
-		safe_dyn_cast<phi_info *> (expr->get_avl_source ()));
-	      if (any_insn_in_bb_p (insns, bb))
+	      hash_set<set_info *> sets
+		= get_all_sets (expr->get_avl_source (), true, false, false);
+	      if (any_set_in_bb_p (sets, bb))
 		bitmap_clear_bit (m_vector_manager->vector_transp[curr_bb_idx],
 				  i);
 	    }
 	}
 
       /* Compute anticipatable occurrences.  */
-      if (local_dem.valid_p () || local_dem.real_dirty_p ())
+      if (local_dem.valid_p () || local_dem.real_dirty_p ()
+	  || (has_vsetvl_killed_avl_p (bb, local_dem)
+	      && vlmax_avl_p (local_dem.get_avl ())))
 	if (anticipatable_occurrence_p (bb, local_dem))
 	  bitmap_set_bit (m_vector_manager->vector_antic[curr_bb_idx],
 			  m_vector_manager->get_expr_id (local_dem));
@@ -2693,7 +3061,8 @@  pass_vsetvl::compute_local_properties (void)
 
 /* Return true if VSETVL in the block can be refined as vsetvl zero,zero.  */
 bool
-pass_vsetvl::can_refine_vsetvl_p (const basic_block cfg_bb, uint8_t ratio) const
+pass_vsetvl::can_refine_vsetvl_p (const basic_block cfg_bb,
+				  const vector_insn_info &info) const
 {
   if (!m_vector_manager->all_same_ratio_p (
 	m_vector_manager->vector_avin[cfg_bb->index]))
@@ -2705,7 +3074,9 @@  pass_vsetvl::can_refine_vsetvl_p (const basic_block cfg_bb, uint8_t ratio) const
 
   size_t expr_id
     = bitmap_first_set_bit (m_vector_manager->vector_avin[cfg_bb->index]);
-  if (m_vector_manager->vector_exprs[expr_id]->get_ratio () != ratio)
+  if (!m_vector_manager->vector_exprs[expr_id]->same_vlmax_p (info))
+    return false;
+  if (!m_vector_manager->vector_exprs[expr_id]->compatible_avl_p (info))
     return false;
 
   edge e;
@@ -2748,7 +3119,7 @@  pass_vsetvl::refine_vsetvls (void) const
 	continue;
 
       rtx_insn *rinsn = insn->rtl ();
-      if (!can_refine_vsetvl_p (cfg_bb, info.get_ratio ()))
+      if (!can_refine_vsetvl_p (cfg_bb, info))
 	continue;
 
       if (!vector_config_insn_p (rinsn))
@@ -2871,7 +3242,7 @@  pass_vsetvl::commit_vsetvls (void)
 	}
 
       rtx new_pat;
-      if (can_refine_vsetvl_p (cfg_bb, reaching_out.get_ratio ()))
+      if (can_refine_vsetvl_p (cfg_bb, reaching_out))
 	new_pat
 	  = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, reaching_out, NULL_RTX);
       else if (vlmax_avl_p (reaching_out.get_avl ()))
@@ -2965,6 +3336,115 @@  pass_vsetvl::cleanup_insns (void) const
     }
 }
 
+void
+pass_vsetvl::propagate_avl (void) const
+{
+  /* Rebuild the RTL_SSA according to the new CFG generated by LCM.  */
+  /* Finalization of RTL_SSA.  */
+  free_dominance_info (CDI_DOMINATORS);
+  if (crtl->ssa->perform_pending_updates ())
+    cleanup_cfg (0);
+  delete crtl->ssa;
+  crtl->ssa = nullptr;
+  /* Initialization of RTL_SSA.  */
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_analyze ();
+  crtl->ssa = new function_info (cfun);
+
+  hash_set<rtx_insn *> to_delete;
+  for (const bb_info *bb : crtl->ssa->bbs ())
+    {
+      for (insn_info *insn : bb->real_nondebug_insns ())
+	{
+	  if (vsetvl_discard_result_insn_p (insn->rtl ()))
+	    {
+	      rtx avl = get_avl (insn->rtl ());
+	      if (!REG_P (avl))
+		continue;
+
+	      set_info *set = find_access (insn->uses (), REGNO (avl))->def ();
+	      insn_info *def_insn = extract_single_source (set);
+	      if (!def_insn)
+		continue;
+
+	      /* Handle this case:
+		 vsetvli	a6,zero,e32,m1,ta,mu
+		 li	a5,4096
+		 add	a7,a0,a5
+		 addi	a7,a7,-96
+		 vsetvli	t1,zero,e8,mf8,ta,ma
+		 vle8.v	v24,0(a7)
+		 add	a5,a3,a5
+		 addi	a5,a5,-96
+		 vse8.v	v24,0(a5)
+		 vsetvli	zero,a6,e32,m1,tu,ma
+	      */
+	      if (vsetvl_insn_p (def_insn->rtl ()))
+		{
+		  vl_vtype_info def_info = get_vl_vtype_info (def_insn);
+		  vl_vtype_info info = get_vl_vtype_info (insn);
+		  rtx avl = get_avl (def_insn->rtl ());
+		  rtx vl = get_vl (def_insn->rtl ());
+		  if (def_info.get_ratio () == info.get_ratio ())
+		    {
+		      if (vlmax_avl_p (def_info.get_avl ()))
+			{
+			  info.set_avl_info (
+			    avl_info (def_info.get_avl (), nullptr));
+			  rtx new_pat
+			    = gen_vsetvl_pat (VSETVL_NORMAL, info, vl);
+			  validate_change (insn->rtl (),
+					   &PATTERN (insn->rtl ()), new_pat,
+					   false);
+			  continue;
+			}
+		      if (def_info.has_avl_imm () || rtx_equal_p (avl, vl))
+			{
+			  info.set_avl_info (avl_info (avl, nullptr));
+			  emit_vsetvl_insn (VSETVL_DISCARD_RESULT, EMIT_AFTER,
+					    info, NULL_RTX, insn->rtl ());
+			  if (set->single_nondebug_insn_use ())
+			    {
+			      to_delete.add (insn->rtl ());
+			      to_delete.add (def_insn->rtl ());
+			    }
+			  continue;
+			}
+		    }
+		}
+	    }
+
+	  /* Change vsetvl rd, rs1 --> vsetvl zero, rs1,
+	     if rd is not used by any nondebug instructions.
+	     Even though this PASS runs after RA and doesn't help
+	     reduce register pressure, it can help instruction scheduling
+	     since we remove the dependencies.  */
+	  if (vsetvl_insn_p (insn->rtl ()))
+	    {
+	      rtx vl = get_vl (insn->rtl ());
+	      rtx avl = get_avl (insn->rtl ());
+	      if (vlmax_avl_p (avl))
+		continue;
+	      def_info *def = find_access (insn->defs (), REGNO (vl));
+	      set_info *set = safe_dyn_cast<set_info *> (def);
+	      gcc_assert (set);
+	      const vl_vtype_info info = get_vl_vtype_info (insn);
+	      rtx new_pat
+		= gen_vsetvl_pat (VSETVL_DISCARD_RESULT, info, NULL_RTX);
+	      if (!set->has_nondebug_insn_uses ())
+		{
+		  validate_change (insn->rtl (), &PATTERN (insn->rtl ()),
+				   new_pat, false);
+		  continue;
+		}
+	    }
+	}
+    }
+
+  for (rtx_insn *rinsn : to_delete)
+    eliminate_insn (rinsn);
+}
+
 void
 pass_vsetvl::init (void)
 {
@@ -3083,6 +3563,12 @@  pass_vsetvl::lazy_vsetvl (void)
   if (dump_file)
     fprintf (dump_file, "\nPhase 5: Cleanup AVL and VL operands\n");
   cleanup_insns ();
+
+  /* Phase 6 - Rebuild RTL_SSA to propagate AVL between vsetvls.  */
+  if (dump_file)
+    fprintf (dump_file,
+	     "\nPhase 6: Rebuild RTL_SSA to propagate AVL between vsetvls\n");
+  propagate_avl ();
 }
 
 /* Main entry point for this pass.  */
diff --git a/gcc/config/riscv/riscv-vsetvl.h b/gcc/config/riscv/riscv-vsetvl.h
index 3b68bf638ae..4177b3e851b 100644
--- a/gcc/config/riscv/riscv-vsetvl.h
+++ b/gcc/config/riscv/riscv-vsetvl.h
@@ -65,6 +65,21 @@  enum merge_type
   GLOBAL_MERGE
 };
 
+enum def_type
+{
+  REAL_SET = 1 << 0,
+  PHI_SET = 1 << 1,
+  BB_HEAD_SET = 1 << 2,
+  BB_END_SET = 1 << 3,
+  /* ??? TODO: In the RTL_SSA framework, we have REAL_SET,
+     PHI_SET, BB_HEAD_SET, BB_END_SET and
+     CLOBBER_DEF def_info types.  Currently,
+     we conservatively do not optimize clobber
+     defs since we have not seen a case that
+     needs such optimization.  */
+  CLOBBER_DEF = 1 << 4
+};
+
 /* AVL info for RVV instruction. Most RVV instructions have AVL operand in
    implicit dependency. The AVL comparison between 2 RVV instructions is
    very important since it affects our decision whether we should insert
@@ -143,6 +158,7 @@  public:
   rtx get_value () const { return m_value; }
   rtl_ssa::set_info *get_source () const { return m_source; }
   bool single_source_equal_p (const avl_info &) const;
+  bool multiple_source_equal_p (const avl_info &) const;
   avl_info &operator= (const avl_info &);
   bool operator== (const avl_info &) const;
   bool operator!= (const avl_info &) const;
@@ -210,6 +226,8 @@  private:
     VALID,
     UNKNOWN,
     EMPTY,
+    /* The empty block cannot be polluted as dirty.  */
+    HARD_EMPTY,
 
     /* The block is polluted as containing VSETVL instruction during dem
        backward propagation to gain better LCM optimization even though
@@ -280,7 +298,8 @@  public:
   bool uninit_p () const { return m_state == UNINITIALIZED; }
   bool valid_p () const { return m_state == VALID; }
   bool unknown_p () const { return m_state == UNKNOWN; }
-  bool empty_p () const { return m_state == EMPTY; }
+  bool empty_p () const { return m_state == EMPTY || m_state == HARD_EMPTY; }
+  bool hard_empty_p () const { return m_state == HARD_EMPTY; }
   bool dirty_p () const
   {
     return m_state == DIRTY || m_state == DIRTY_WITH_KILLED_AVL;
@@ -295,6 +314,7 @@  public:
     return m_state == VALID || m_state == DIRTY
 	   || m_state == DIRTY_WITH_KILLED_AVL;
   }
+  bool available_p (const vector_insn_info &) const;
 
   static vector_insn_info get_unknown ()
   {
@@ -303,9 +323,17 @@  public:
     return info;
   }
 
+  static vector_insn_info get_hard_empty ()
+  {
+    vector_insn_info info;
+    info.set_hard_empty ();
+    return info;
+  }
+
   void set_valid () { m_state = VALID; }
   void set_unknown () { m_state = UNKNOWN; }
   void set_empty () { m_state = EMPTY; }
+  void set_hard_empty () { m_state = HARD_EMPTY; }
   void set_dirty (enum fusion_type type)
   {
     gcc_assert (type == VALID_AVL_FUSION || type == KILLED_AVL_FUSION);