Message ID | 20240513145428.148553-1-pan2.li@intel.com |
---|---|
State | New |
Series | [v1,1/3] Vect: Support loop len in vectorizable early exit |
> -----Original Message-----
> From: pan2.li@intel.com <pan2.li@intel.com>
> Sent: Monday, May 13, 2024 3:54 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> Tamar Christina <Tamar.Christina@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>; Pan Li <pan2.li@intel.com>
> Subject: [PATCH v1 1/3] Vect: Support loop len in vectorizable early exit
>
> From: Pan Li <pan2.li@intel.com>
>
> This patch adds early break auto-vectorization support for targets which
> use length on partial vectorization.  Consider the following example:
>
> unsigned vect_a[802];
> unsigned vect_b[802];
>
> void test (unsigned x, int n)
> {
>   for (int i = 0; i < n; i++)
>     {
>       vect_b[i] = x + i;
>
>       if (vect_a[i] > x)
>         break;
>
>       vect_a[i] = x;
>     }
> }
>
> We use VCOND_MASK_LEN to simulate the generated (mask && i < len + bias),
> and then the RVV IR looks like below:
>
>   ...
>   _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
>   _55 = (int) _87;
>   ...
>   mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
>   vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
>                                      { 0, ... }, _87, 0);
>   if (vec_len_mask_72 != { 0, ... })
>     goto <bb 6>; [5.50%]
>   else
>     goto <bb 7>; [94.50%]
>
> The below tests are passed for this patch:
> 1. The riscv fully regression tests.
> 2. The aarch64 fully regression tests.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
>         * tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
>         handling for one or multiple stmts.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/tree-vect-stmts.cc | 47 ++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 45 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 21e8fe98e44..bfd9d66568f 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -12896,7 +12896,9 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>    ncopies = vect_get_num_copies (loop_vinfo, vectype);
>
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> +  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
>
>    /* Now build the new conditional.  Pattern gimple_conds get dropped during
>       codegen so we must replace the original insn.  */
> @@ -12960,12 +12962,11 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>      {
>        if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
>                                            OPTIMIZE_FOR_SPEED))
> -        return false;
> +        vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
>        else
>          vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
>      }
>
> -
>        return true;
>      }
>
> @@ -13018,6 +13019,25 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>                                             stmts[i], &cond_gsi);
>              workset.quick_push (stmt_mask);
>            }
> +      else if (len_loop_p)
> +        for (unsigned i = 0; i < stmts.length (); i++)
> +          {
> +            tree all_ones_mask = build_all_ones_cst (vectype);
> +            tree all_zero_mask = build_zero_cst (vectype);
> +            tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
> +                                          vectype, i, 1);
> +            signed char cst = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +            tree bias = build_int_cst (intQI_type_node, cst);
> +            tree len_mask
> +              = make_temp_ssa_name (TREE_TYPE (stmts[i]), NULL, "vec_len_mask");
> +            gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
> +                                                      stmts[i], all_ones_mask,
> +                                                      all_zero_mask, len, bias);
> +            gimple_call_set_lhs (call, len_mask);
> +            gsi_insert_before (&cond_gsi, call, GSI_SAME_STMT);
> +
> +            workset.quick_push (len_mask);
> +          }
>        else
>          workset.splice (stmts);
>
> @@ -13042,6 +13062,29 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
>        new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
>                                     new_temp, &cond_gsi);
>      }
> +  else if (len_loop_p)
> +    {
> +      /* len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
> +
> +         which is equivalent to:
> +
> +         len_mask = compare_mask mask && i < len ? 1 : 0
> +      */
> +      tree all_ones_mask = build_all_ones_cst (vectype);
> +      tree all_zero_mask = build_zero_cst (vectype);
> +      tree len
> +        = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies, vectype, 0, 1);
> +      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +      tree bias = build_int_cst (intQI_type_node, biasval);
> +      tree len_mask
> +        = make_temp_ssa_name (TREE_TYPE (new_temp), NULL, "vec_len_mask");
> +      gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
> +                                                new_temp, all_ones_mask,
> +                                                all_zero_mask, len, bias);
> +      gimple_call_set_lhs (call, len_mask);
> +      gsi_insert_before (&cond_gsi, call, GSI_SAME_STMT);
> +      new_temp = len_mask;
> +    }

Could we perhaps factor these out into a helper?  Something similar to what
we have with prepare_vec_mask.  It looks like the only difference between
these two blocks of code is the index being passed to vect_get_loop_len.

It would be nice to have the shape the same way as the masked_loop_p case,
i.e. keep one call to get the loop len and one call to build the mask using
a helper.

Thanks,
Tamar

>    }
>
>    gcc_assert (new_temp);
> --
> 2.34.1
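For reference, a minimal sketch of the kind of helper Tamar asks for, assuming
it mirrors prepare_vec_mask; the name prepare_vec_len_mask and its exact
parameter list are hypothetical and not part of the posted series.  It only
reuses the calls that already appear in the two blocks above and turns the
loop-len index into a parameter:

/* Hypothetical helper (not in the posted patch): wrap the compare result MASK
   in an IFN_VCOND_MASK_LEN call so lanes at or beyond the loop length become
   zero.  INDEX selects which recorded length to use: the multi-statement loop
   would pass I, the single-statement path would pass 0.  */

static tree
prepare_vec_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
                      gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
                      unsigned int ncopies, tree vectype, tree mask,
                      unsigned int index)
{
  tree all_ones_mask = build_all_ones_cst (vectype);
  tree all_zero_mask = build_zero_cst (vectype);
  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
                                vectype, index, 1);
  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
  tree bias = build_int_cst (intQI_type_node, biasval);
  tree len_mask = make_temp_ssa_name (TREE_TYPE (mask), NULL, "vec_len_mask");
  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, mask,
                                            all_ones_mask, all_zero_mask,
                                            len, bias);
  gimple_call_set_lhs (call, len_mask);
  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
  return len_mask;
}

With something along these lines, both sites in the patch could collapse to a
single call, e.g. new_temp = prepare_vec_len_mask (loop_vinfo, gsi, &cond_gsi,
lens, ncopies, vectype, new_temp, 0), matching the shape of the masked_loop_p
path.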
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 21e8fe98e44..bfd9d66568f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12896,7 +12896,9 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
   ncopies = vect_get_num_copies (loop_vinfo, vectype);
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* Now build the new conditional.  Pattern gimple_conds get dropped during
      codegen so we must replace the original insn.  */
@@ -12960,12 +12962,11 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
     {
       if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
                                           OPTIMIZE_FOR_SPEED))
-        return false;
+        vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
       else
         vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
     }
 
-
       return true;
     }
 
@@ -13018,6 +13019,25 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
                                            stmts[i], &cond_gsi);
             workset.quick_push (stmt_mask);
           }
+      else if (len_loop_p)
+        for (unsigned i = 0; i < stmts.length (); i++)
+          {
+            tree all_ones_mask = build_all_ones_cst (vectype);
+            tree all_zero_mask = build_zero_cst (vectype);
+            tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
+                                          vectype, i, 1);
+            signed char cst = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+            tree bias = build_int_cst (intQI_type_node, cst);
+            tree len_mask
+              = make_temp_ssa_name (TREE_TYPE (stmts[i]), NULL, "vec_len_mask");
+            gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
+                                                      stmts[i], all_ones_mask,
+                                                      all_zero_mask, len, bias);
+            gimple_call_set_lhs (call, len_mask);
+            gsi_insert_before (&cond_gsi, call, GSI_SAME_STMT);
+
+            workset.quick_push (len_mask);
+          }
       else
         workset.splice (stmts);
 
@@ -13042,6 +13062,29 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
       new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
                                    new_temp, &cond_gsi);
     }
+  else if (len_loop_p)
+    {
+      /* len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
+
+         which is equivalent to:
+
+         len_mask = compare_mask mask && i < len ? 1 : 0
+      */
+      tree all_ones_mask = build_all_ones_cst (vectype);
+      tree all_zero_mask = build_zero_cst (vectype);
+      tree len
+        = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies, vectype, 0, 1);
+      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+      tree bias = build_int_cst (intQI_type_node, biasval);
+      tree len_mask
+        = make_temp_ssa_name (TREE_TYPE (new_temp), NULL, "vec_len_mask");
+      gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
+                                                new_temp, all_ones_mask,
+                                                all_zero_mask, len, bias);
+      gimple_call_set_lhs (call, len_mask);
+      gsi_insert_before (&cond_gsi, call, GSI_SAME_STMT);
+      new_temp = len_mask;
+    }
   }
 
   gcc_assert (new_temp);
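To make the comment in the last hunk concrete, here is a hedged scalar model of
the lane-wise behaviour the patch relies on when it passes all-ones and
all-zeros as the two value operands of VCOND_MASK_LEN; the function name and
the fixed int32_t element type are illustrative only, not GCC API:

#include <cstddef>
#include <cstdint>

/* Illustrative model of
     vec_len_mask = .VCOND_MASK_LEN (mask, { -1, ... }, { 0, ... }, len, bias);
   lanes whose index is below len + bias keep the compare result (-1 for true),
   every later lane is forced to 0, so an inactive lane can never make the
   early-break test vec_len_mask != { 0, ... } succeed.  */
void
vcond_mask_len_model (const int32_t *mask, int32_t *out, size_t nunits,
                      size_t len, int8_t bias)
{
  for (size_t i = 0; i < nunits; i++)
    {
      bool in_range = (ptrdiff_t) i < (ptrdiff_t) len + bias;
      out[i] = (in_range && mask[i] != 0) ? -1 : 0;
    }
}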