diff mbox series

[v2,1/3] Vect: Support loop len in vectorizable early exit

Message ID 20240516040542.2734412-1-pan2.li@intel.com
State New
Headers show
Series [v2,1/3] Vect: Support loop len in vectorizable early exit | expand

Commit Message

Li, Pan2 May 16, 2024, 4:05 a.m. UTC
From: Pan Li <pan2.li@intel.com>

This patch adds early break auto-vectorization support for targets that
use length-based partial vectorization.  Consider the following example:

unsigned vect_a[802];
unsigned vect_b[802];

void test (unsigned x, int n)
{
  for (int i = 0; i < n; i++)
  {
    vect_b[i] = x + i;

    if (vect_a[i] > x)
      break;

    vect_a[i] = x;
  }
}

We use VCOND_MASK_LEN to simulate the generation of (mask && i < len + bias).
The resulting IR for RVV then looks like below:

  ...
  _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
  _55 = (int) _87;
  ...
  mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
  vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
    {0, ... }, _87, 0);
  if (vec_len_mask_72 != { 0, ... })
    goto <bb 6>; [5.50%]
  else
    goto <bb 7>; [94.50%]

The tests below passed for this patch:
1. The riscv full regression tests.
2. The x86 bootstrap tests.
3. The x86 full regression tests.

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
	handling for one or multiple stmt.

gcc/ChangeLog:

	* tree-vect-loop.cc (vect_gen_loop_len_mask): New func to gen
	the loop len mask.
	* tree-vect-stmts.cc (vectorizable_early_exit): Invoke the
	vect_gen_loop_len_mask for 1 or more stmt(s).
	* tree-vectorizer.h (vect_gen_loop_len_mask): New func decl
	for vect_gen_loop_len_mask.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/tree-vect-loop.cc  | 27 +++++++++++++++++++++++++++
 gcc/tree-vect-stmts.cc | 17 +++++++++++++++--
 gcc/tree-vectorizer.h  |  4 ++++
 3 files changed, 46 insertions(+), 2 deletions(-)

Comments

Tamar Christina May 16, 2024, 6:49 a.m. UTC | #1
> -----Original Message-----
> From: pan2.li@intel.com <pan2.li@intel.com>
> Sent: Thursday, May 16, 2024 5:06 AM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; Tamar Christina
> <Tamar.Christina@arm.com>; richard.guenther@gmail.com; Richard Sandiford
> <Richard.Sandiford@arm.com>; Pan Li <pan2.li@intel.com>
> Subject: [PATCH v2 1/3] Vect: Support loop len in vectorizable early exit
> 
> From: Pan Li <pan2.li@intel.com>
> 
> This patch adds early break auto-vectorization support for target which
> use length on partial vectorization.  Consider this following example:
> 
> unsigned vect_a[802];
> unsigned vect_b[802];
> 
> void test (unsigned x, int n)
> {
>   for (int i = 0; i < n; i++)
>   {
>     vect_b[i] = x + i;
> 
>     if (vect_a[i] > x)
>       break;
> 
>     vect_a[i] = x;
>   }
> }
> 
> We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias).
> And then the IR of RVV looks like below:
> 
>   ...
>   _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
>   _55 = (int) _87;
>   ...
>   mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
>   vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
>     {0, ... }, _87, 0);
>   if (vec_len_mask_72 != { 0, ... })
>     goto <bb 6>; [5.50%]
>   else
>     goto <bb 7>; [94.50%]
> 
> The below tests are passed for this patch:
> 1. The riscv fully regression tests.
> 2. The x86 bootstrap tests.
> 3. The x86 fully regression tests.
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
> 	handling for one or multiple stmt.
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop.cc (vect_gen_loop_len_mask): New func to gen
> 	the loop len mask.
> 	* tree-vect-stmts.cc (vectorizable_early_exit): Invoke the
> 	vect_gen_loop_len_mask for 1 or more stmt(s).
> 	* tree-vectorizer.h (vect_gen_loop_len_mask): New func decl
> 	for vect_gen_loop_len_mask.
> 

Thanks, this version looks good to me!

You'll need Richi's review still.

Cheers,
Tamar

> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/tree-vect-loop.cc  | 27 +++++++++++++++++++++++++++
>  gcc/tree-vect-stmts.cc | 17 +++++++++++++++--
>  gcc/tree-vectorizer.h  |  4 ++++
>  3 files changed, 46 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 361aec06488..83c0544b6aa 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -11416,6 +11416,33 @@ vect_get_loop_len (loop_vec_info loop_vinfo,
> gimple_stmt_iterator *gsi,
>    return loop_len;
>  }
> 
> +/* Generate the tree for the loop len mask and return it.  Given the lens,
> +   nvectors, vectype, index and factor to gen the len mask as below.
> +
> +   tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
> +*/
> +tree
> +vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
> +			gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
> +			unsigned int nvectors, tree vectype, tree stmt,
> +			unsigned int index, unsigned int factor)
> +{
> +  tree all_one_mask = build_all_ones_cst (vectype);
> +  tree all_zero_mask = build_zero_cst (vectype);
> +  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
> +				factor);
> +  tree bias = build_int_cst (intQI_type_node,
> +			     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> (loop_vinfo));
> +  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL,
> "vec_len_mask");
> +  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
> +					    all_one_mask, all_zero_mask, len,
> +					    bias);
> +  gimple_call_set_lhs (call, len_mask);
> +  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
> +
> +  return len_mask;
> +}
> +
>  /* Scale profiling counters by estimation for LOOP which is vectorized
>     by factor VF.
>     If FLAT is true, the loop we started with had unrealistically flat
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index b8a71605f1b..672959501bb 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -12895,7 +12895,9 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>      ncopies = vect_get_num_copies (loop_vinfo, vectype);
> 
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> +  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
> 
>    /* Now build the new conditional.  Pattern gimple_conds get dropped during
>       codegen so we must replace the original insn.  */
> @@ -12959,12 +12961,11 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>  	{
>  	  if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
>  					      OPTIMIZE_FOR_SPEED))
> -	    return false;
> +	    vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
>  	  else
>  	    vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
>  	}
> 
> -
>        return true;
>      }
> 
> @@ -13017,6 +13018,15 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>  				  stmts[i], &cond_gsi);
>  	    workset.quick_push (stmt_mask);
>  	  }
> +      else if (len_loop_p)
> +	for (unsigned i = 0; i < stmts.length (); i++)
> +	  {
> +	    tree len_mask = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi,
> +						    lens, ncopies, vectype,
> +						    stmts[i], i, 1);
> +
> +	    workset.quick_push (len_mask);
> +	  }
>        else
>  	workset.splice (stmts);
> 
> @@ -13041,6 +13051,9 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>  	  new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
>  				       new_temp, &cond_gsi);
>  	}
> +      else if (len_loop_p)
> +	new_temp = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
> +					   ncopies, vectype, new_temp, 0, 1);
>      }
> 
>    gcc_assert (new_temp);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index db44d730b70..93bc30ef660 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2408,6 +2408,10 @@ extern void vect_record_loop_len (loop_vec_info,
> vec_loop_lens *, unsigned int,
>  extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
>  			       vec_loop_lens *, unsigned int, tree,
>  			       unsigned int, unsigned int);
> +extern tree vect_gen_loop_len_mask (loop_vec_info, gimple_stmt_iterator *,
> +				    gimple_stmt_iterator *, vec_loop_lens *,
> +				    unsigned int, tree, tree, unsigned int,
> +				    unsigned int);
>  extern gimple_seq vect_gen_len (tree, tree, tree, tree);
>  extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
>  extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> --
> 2.34.1
Richard Biener May 16, 2024, 12:13 p.m. UTC | #2
On Thu, May 16, 2024 at 8:50 AM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: pan2.li@intel.com <pan2.li@intel.com>
> > Sent: Thursday, May 16, 2024 5:06 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; Tamar Christina
> > <Tamar.Christina@arm.com>; richard.guenther@gmail.com; Richard Sandiford
> > <Richard.Sandiford@arm.com>; Pan Li <pan2.li@intel.com>
> > Subject: [PATCH v2 1/3] Vect: Support loop len in vectorizable early exit
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > This patch adds early break auto-vectorization support for target which
> > use length on partial vectorization.  Consider this following example:
> >
> > unsigned vect_a[802];
> > unsigned vect_b[802];
> >
> > void test (unsigned x, int n)
> > {
> >   for (int i = 0; i < n; i++)
> >   {
> >     vect_b[i] = x + i;
> >
> >     if (vect_a[i] > x)
> >       break;
> >
> >     vect_a[i] = x;
> >   }
> > }
> >
> > We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias).
> > And then the IR of RVV looks like below:
> >
> >   ...
> >   _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
> >   _55 = (int) _87;
> >   ...
> >   mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
> >   vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
> >     {0, ... }, _87, 0);
> >   if (vec_len_mask_72 != { 0, ... })
> >     goto <bb 6>; [5.50%]
> >   else
> >     goto <bb 7>; [94.50%]
> >
> > The below tests are passed for this patch:
> > 1. The riscv fully regression tests.
> > 2. The x86 bootstrap tests.
> > 3. The x86 fully regression tests.
> >
> > gcc/ChangeLog:
> >
> >       * tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
> >       handling for one or multiple stmt.
> >
> > gcc/ChangeLog:
> >
> >       * tree-vect-loop.cc (vect_gen_loop_len_mask): New func to gen
> >       the loop len mask.
> >       * tree-vect-stmts.cc (vectorizable_early_exit): Invoke the
> >       vect_gen_loop_len_mask for 1 or more stmt(s).
> >       * tree-vectorizer.h (vect_gen_loop_len_mask): New func decl
> >       for vect_gen_loop_len_mask.
> >
>
> Thanks, this version looks good to me!
>
> You'll need Richi's review still.

OK.

Thanks,
Richard.

> Cheers,
> Tamar
>
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/tree-vect-loop.cc  | 27 +++++++++++++++++++++++++++
> >  gcc/tree-vect-stmts.cc | 17 +++++++++++++++--
> >  gcc/tree-vectorizer.h  |  4 ++++
> >  3 files changed, 46 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 361aec06488..83c0544b6aa 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -11416,6 +11416,33 @@ vect_get_loop_len (loop_vec_info loop_vinfo,
> > gimple_stmt_iterator *gsi,
> >    return loop_len;
> >  }
> >
> > +/* Generate the tree for the loop len mask and return it.  Given the lens,
> > +   nvectors, vectype, index and factor to gen the len mask as below.
> > +
> > +   tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
> > +*/
> > +tree
> > +vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
> > +                     gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
> > +                     unsigned int nvectors, tree vectype, tree stmt,
> > +                     unsigned int index, unsigned int factor)
> > +{
> > +  tree all_one_mask = build_all_ones_cst (vectype);
> > +  tree all_zero_mask = build_zero_cst (vectype);
> > +  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
> > +                             factor);
> > +  tree bias = build_int_cst (intQI_type_node,
> > +                          LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> > (loop_vinfo));
> > +  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL,
> > "vec_len_mask");
> > +  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
> > +                                         all_one_mask, all_zero_mask, len,
> > +                                         bias);
> > +  gimple_call_set_lhs (call, len_mask);
> > +  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
> > +
> > +  return len_mask;
> > +}
> > +
> >  /* Scale profiling counters by estimation for LOOP which is vectorized
> >     by factor VF.
> >     If FLAT is true, the loop we started with had unrealistically flat
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index b8a71605f1b..672959501bb 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -12895,7 +12895,9 @@ vectorizable_early_exit (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> >      ncopies = vect_get_num_copies (loop_vinfo, vectype);
> >
> >    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> > +  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> >    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> > +  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
> >
> >    /* Now build the new conditional.  Pattern gimple_conds get dropped during
> >       codegen so we must replace the original insn.  */
> > @@ -12959,12 +12961,11 @@ vectorizable_early_exit (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> >       {
> >         if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
> >                                             OPTIMIZE_FOR_SPEED))
> > -         return false;
> > +         vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
> >         else
> >           vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
> >       }
> >
> > -
> >        return true;
> >      }
> >
> > @@ -13017,6 +13018,15 @@ vectorizable_early_exit (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> >                                 stmts[i], &cond_gsi);
> >           workset.quick_push (stmt_mask);
> >         }
> > +      else if (len_loop_p)
> > +     for (unsigned i = 0; i < stmts.length (); i++)
> > +       {
> > +         tree len_mask = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi,
> > +                                                 lens, ncopies, vectype,
> > +                                                 stmts[i], i, 1);
> > +
> > +         workset.quick_push (len_mask);
> > +       }
> >        else
> >       workset.splice (stmts);
> >
> > @@ -13041,6 +13051,9 @@ vectorizable_early_exit (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> >         new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
> >                                      new_temp, &cond_gsi);
> >       }
> > +      else if (len_loop_p)
> > +     new_temp = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
> > +                                        ncopies, vectype, new_temp, 0, 1);
> >      }
> >
> >    gcc_assert (new_temp);
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index db44d730b70..93bc30ef660 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2408,6 +2408,10 @@ extern void vect_record_loop_len (loop_vec_info,
> > vec_loop_lens *, unsigned int,
> >  extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
> >                              vec_loop_lens *, unsigned int, tree,
> >                              unsigned int, unsigned int);
> > +extern tree vect_gen_loop_len_mask (loop_vec_info, gimple_stmt_iterator *,
> > +                                 gimple_stmt_iterator *, vec_loop_lens *,
> > +                                 unsigned int, tree, tree, unsigned int,
> > +                                 unsigned int);
> >  extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> >  extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> >  extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> > --
> > 2.34.1
>
Li, Pan2 May 16, 2024, 12:27 p.m. UTC | #3
Committed, thanks Richard.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Thursday, May 16, 2024 8:13 PM
To: Tamar Christina <Tamar.Christina@arm.com>
Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; Richard Sandiford <Richard.Sandiford@arm.com>
Subject: Re: [PATCH v2 1/3] Vect: Support loop len in vectorizable early exit

On Thu, May 16, 2024 at 8:50 AM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: pan2.li@intel.com <pan2.li@intel.com>
> > Sent: Thursday, May 16, 2024 5:06 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; Tamar Christina
> > <Tamar.Christina@arm.com>; richard.guenther@gmail.com; Richard Sandiford
> > <Richard.Sandiford@arm.com>; Pan Li <pan2.li@intel.com>
> > Subject: [PATCH v2 1/3] Vect: Support loop len in vectorizable early exit
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > This patch adds early break auto-vectorization support for target which
> > use length on partial vectorization.  Consider this following example:
> >
> > unsigned vect_a[802];
> > unsigned vect_b[802];
> >
> > void test (unsigned x, int n)
> > {
> >   for (int i = 0; i < n; i++)
> >   {
> >     vect_b[i] = x + i;
> >
> >     if (vect_a[i] > x)
> >       break;
> >
> >     vect_a[i] = x;
> >   }
> > }
> >
> > We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias).
> > And then the IR of RVV looks like below:
> >
> >   ...
> >   _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
> >   _55 = (int) _87;
> >   ...
> >   mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
> >   vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
> >     {0, ... }, _87, 0);
> >   if (vec_len_mask_72 != { 0, ... })
> >     goto <bb 6>; [5.50%]
> >   else
> >     goto <bb 7>; [94.50%]
> >
> > The below tests are passed for this patch:
> > 1. The riscv fully regression tests.
> > 2. The x86 bootstrap tests.
> > 3. The x86 fully regression tests.
> >
> > gcc/ChangeLog:
> >
> >       * tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
> >       handling for one or multiple stmt.
> >
> > gcc/ChangeLog:
> >
> >       * tree-vect-loop.cc (vect_gen_loop_len_mask): New func to gen
> >       the loop len mask.
> >       * tree-vect-stmts.cc (vectorizable_early_exit): Invoke the
> >       vect_gen_loop_len_mask for 1 or more stmt(s).
> >       * tree-vectorizer.h (vect_gen_loop_len_mask): New func decl
> >       for vect_gen_loop_len_mask.
> >
>
> Thanks, this version looks good to me!
>
> You'll need Richi's review still.

OK.

Thanks,
Richard.

> Cheers,
> Tamar
>
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/tree-vect-loop.cc  | 27 +++++++++++++++++++++++++++
> >  gcc/tree-vect-stmts.cc | 17 +++++++++++++++--
> >  gcc/tree-vectorizer.h  |  4 ++++
> >  3 files changed, 46 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 361aec06488..83c0544b6aa 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -11416,6 +11416,33 @@ vect_get_loop_len (loop_vec_info loop_vinfo,
> > gimple_stmt_iterator *gsi,
> >    return loop_len;
> >  }
> >
> > +/* Generate the tree for the loop len mask and return it.  Given the lens,
> > +   nvectors, vectype, index and factor to gen the len mask as below.
> > +
> > +   tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
> > +*/
> > +tree
> > +vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
> > +                     gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
> > +                     unsigned int nvectors, tree vectype, tree stmt,
> > +                     unsigned int index, unsigned int factor)
> > +{
> > +  tree all_one_mask = build_all_ones_cst (vectype);
> > +  tree all_zero_mask = build_zero_cst (vectype);
> > +  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
> > +                             factor);
> > +  tree bias = build_int_cst (intQI_type_node,
> > +                          LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> > (loop_vinfo));
> > +  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL,
> > "vec_len_mask");
> > +  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
> > +                                         all_one_mask, all_zero_mask, len,
> > +                                         bias);
> > +  gimple_call_set_lhs (call, len_mask);
> > +  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
> > +
> > +  return len_mask;
> > +}
> > +
> >  /* Scale profiling counters by estimation for LOOP which is vectorized
> >     by factor VF.
> >     If FLAT is true, the loop we started with had unrealistically flat
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index b8a71605f1b..672959501bb 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -12895,7 +12895,9 @@ vectorizable_early_exit (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> >      ncopies = vect_get_num_copies (loop_vinfo, vectype);
> >
> >    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> > +  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> >    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> > +  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
> >
> >    /* Now build the new conditional.  Pattern gimple_conds get dropped during
> >       codegen so we must replace the original insn.  */
> > @@ -12959,12 +12961,11 @@ vectorizable_early_exit (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> >       {
> >         if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
> >                                             OPTIMIZE_FOR_SPEED))
> > -         return false;
> > +         vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
> >         else
> >           vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
> >       }
> >
> > -
> >        return true;
> >      }
> >
> > @@ -13017,6 +13018,15 @@ vectorizable_early_exit (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> >                                 stmts[i], &cond_gsi);
> >           workset.quick_push (stmt_mask);
> >         }
> > +      else if (len_loop_p)
> > +     for (unsigned i = 0; i < stmts.length (); i++)
> > +       {
> > +         tree len_mask = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi,
> > +                                                 lens, ncopies, vectype,
> > +                                                 stmts[i], i, 1);
> > +
> > +         workset.quick_push (len_mask);
> > +       }
> >        else
> >       workset.splice (stmts);
> >
> > @@ -13041,6 +13051,9 @@ vectorizable_early_exit (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> >         new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
> >                                      new_temp, &cond_gsi);
> >       }
> > +      else if (len_loop_p)
> > +     new_temp = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
> > +                                        ncopies, vectype, new_temp, 0, 1);
> >      }
> >
> >    gcc_assert (new_temp);
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index db44d730b70..93bc30ef660 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2408,6 +2408,10 @@ extern void vect_record_loop_len (loop_vec_info,
> > vec_loop_lens *, unsigned int,
> >  extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
> >                              vec_loop_lens *, unsigned int, tree,
> >                              unsigned int, unsigned int);
> > +extern tree vect_gen_loop_len_mask (loop_vec_info, gimple_stmt_iterator *,
> > +                                 gimple_stmt_iterator *, vec_loop_lens *,
> > +                                 unsigned int, tree, tree, unsigned int,
> > +                                 unsigned int);
> >  extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> >  extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> >  extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
> > --
> > 2.34.1
>
diff mbox series

Patch

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 361aec06488..83c0544b6aa 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -11416,6 +11416,33 @@  vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
   return loop_len;
 }
 
+/* Generate the tree for the loop len mask and return it.  Given the lens,
+   nvectors, vectype, index and factor to gen the len mask as below.
+
+   tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
+*/
+tree
+vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+			gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
+			unsigned int nvectors, tree vectype, tree stmt,
+			unsigned int index, unsigned int factor)
+{
+  tree all_one_mask = build_all_ones_cst (vectype);
+  tree all_zero_mask = build_zero_cst (vectype);
+  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
+				factor);
+  tree bias = build_int_cst (intQI_type_node,
+			     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
+  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
+  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
+					    all_one_mask, all_zero_mask, len,
+					    bias);
+  gimple_call_set_lhs (call, len_mask);
+  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
+
+  return len_mask;
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
    by factor VF.
    If FLAT is true, the loop we started with had unrealistically flat
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b8a71605f1b..672959501bb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12895,7 +12895,9 @@  vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
     ncopies = vect_get_num_copies (loop_vinfo, vectype);
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* Now build the new conditional.  Pattern gimple_conds get dropped during
      codegen so we must replace the original insn.  */
@@ -12959,12 +12961,11 @@  vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 	{
 	  if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
 					      OPTIMIZE_FOR_SPEED))
-	    return false;
+	    vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
 	  else
 	    vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
 	}
 
-
       return true;
     }
 
@@ -13017,6 +13018,15 @@  vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 				  stmts[i], &cond_gsi);
 	    workset.quick_push (stmt_mask);
 	  }
+      else if (len_loop_p)
+	for (unsigned i = 0; i < stmts.length (); i++)
+	  {
+	    tree len_mask = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi,
+						    lens, ncopies, vectype,
+						    stmts[i], i, 1);
+
+	    workset.quick_push (len_mask);
+	  }
       else
 	workset.splice (stmts);
 
@@ -13041,6 +13051,9 @@  vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 	  new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
 				       new_temp, &cond_gsi);
 	}
+      else if (len_loop_p)
+	new_temp = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
+					   ncopies, vectype, new_temp, 0, 1);
     }
 
   gcc_assert (new_temp);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index db44d730b70..93bc30ef660 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2408,6 +2408,10 @@  extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
 extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
 			       vec_loop_lens *, unsigned int, tree,
 			       unsigned int, unsigned int);
+extern tree vect_gen_loop_len_mask (loop_vec_info, gimple_stmt_iterator *,
+				    gimple_stmt_iterator *, vec_loop_lens *,
+				    unsigned int, tree, tree, unsigned int,
+				    unsigned int);
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
 extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);