@@ -13389,6 +13389,13 @@ by the copy loop headers pass.
@item vect-epilogues-nomask
Enable loop epilogue vectorization using smaller vector size.
+@item vect-with-length-scope
+Control the scope of vector memory access with length exploitation. 0 means we
+don't exploit any vector memory access with length, 1 means we only exploit
+vector memory access with length for those loops whose iteration count is
+less than VF, such as very small loops or epilogues, 2 means we want to exploit
+vector memory access with length for any loops if possible.
+
@item slp-max-insns-in-bb
Maximum number of instructions in basic block to be
considered for SLP vectorization.
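As an illustration of the new parameter (a minimal sketch, assuming a target
with length-based loads and stores such as rs6000 with lxvl/stxvl), consider
a simple copy loop:

    void
    copy (int *restrict dst, int *restrict src, int n)
    {
      for (int i = 0; i < n; i++)
	dst[i] = src[i];
    }

With --param vect-with-length-scope=2 the main loop itself can be vectorized
with partial vectors; with =1 only loops whose iteration count is known to be
smaller than VF, such as the epilogue, use them; with =0 (the default) the
length-based approach is not used at all.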
@@ -968,4 +968,8 @@ Bound on number of runtime checks inserted by the vectorizer's loop versioning f
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
+-param=vect-with-length-scope=
+Common Joined UInteger Var(param_vect_with_length_scope) Init(0) IntegerRange(0, 2) Param Optimization
+Control the scope of vector memory access with length exploitation.
+
; This comment is to ensure we retain the blank line above.
@@ -399,19 +399,20 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
It is known that:
- NITERS * RGC->max_nscalars_per_iter
+ NITERS * RGC->max_nscalars_per_iter * RGC->factor
does not overflow. However, MIGHT_WRAP_P says whether an induction
variable that starts at 0 and has step:
- VF * RGC->max_nscalars_per_iter
+ VF * RGC->max_nscalars_per_iter * RGC->factor
might overflow before hitting a value above:
- (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter
+ (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
This means that we cannot guarantee that such an induction variable
- would ever hit a value that produces a set of all-false masks for RGC. */
+ would ever hit a value that produces a set of all-false masks or zero
+ lengths for RGC. */
static tree
vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
@@ -422,10 +423,20 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
{
tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+ bool vect_for_masking = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+
tree ctrl_type = rgc->type;
- unsigned int nscalars_per_iter = rgc->max_nscalars_per_iter;
+  /* Scale up nscalars per iteration by the rgroup's factor.  */
+ unsigned int nscalars_per_iter_ft = rgc->max_nscalars_per_iter * rgc->factor;
poly_uint64 nscalars_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ tree length_limit = NULL_TREE;
+  /* For length-based controls, we need LENGTH_LIMIT to clamp each
+     generated length to the maximum one vector can handle.  */
+ if (!vect_for_masking)
+ {
+ poly_uint64 len_limit = nscalars_per_ctrl * rgc->factor;
+ length_limit = build_int_cst (compare_type, len_limit);
+ }
/* Calculate the maximum number of scalar values that the rgroup
handles in total, the number that it handles for each iteration
@@ -434,12 +445,12 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
tree nscalars_total = niters;
tree nscalars_step = build_int_cst (iv_type, vf);
tree nscalars_skip = niters_skip;
- if (nscalars_per_iter != 1)
+ if (nscalars_per_iter_ft != 1)
{
/* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
these multiplications don't overflow. */
- tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
- tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
+ tree compare_factor = build_int_cst (compare_type, nscalars_per_iter_ft);
+ tree iv_factor = build_int_cst (iv_type, nscalars_per_iter_ft);
nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
nscalars_total, compare_factor);
nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
@@ -509,7 +520,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
NSCALARS_SKIP to that cannot overflow. */
tree const_limit = build_int_cst (compare_type,
LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- * nscalars_per_iter);
+ * nscalars_per_iter_ft);
first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
nscalars_total, const_limit);
first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
@@ -549,16 +560,16 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
{
/* Previous controls will cover BIAS scalars. This control covers the
next batch. */
- poly_uint64 bias = nscalars_per_ctrl * i;
+ poly_uint64 batch_nscalars_ft = nscalars_per_ctrl * rgc->factor;
+ poly_uint64 bias = batch_nscalars_ft * i;
tree bias_tree = build_int_cst (compare_type, bias);
- gimple *tmp_stmt;
/* See whether the first iteration of the vector loop is known
to have a full control. */
poly_uint64 const_limit;
bool first_iteration_full
= (poly_int_tree_p (first_limit, &const_limit)
- && known_ge (const_limit, (i + 1) * nscalars_per_ctrl));
+ && known_ge (const_limit, (i + 1) * batch_nscalars_ft));
/* Rather than have a new IV that starts at BIAS and goes up to
TEST_LIMIT, prefer to use the same 0-based IV for each control
@@ -598,9 +609,19 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
end = first_limit;
}
- init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
- tmp_stmt = vect_gen_while (init_ctrl, start, end);
- gimple_seq_add_stmt (preheader_seq, tmp_stmt);
+ if (vect_for_masking)
+ {
+ init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
+ gimple *tmp_stmt = vect_gen_while (init_ctrl, start, end);
+ gimple_seq_add_stmt (preheader_seq, tmp_stmt);
+ }
+ else
+ {
+ init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
+ gimple_seq seq = vect_gen_len (init_ctrl, start,
+ end, length_limit);
+ gimple_seq_add_seq (preheader_seq, seq);
+ }
}
/* Now AND out the bits that are within the number of skipped
@@ -617,16 +638,32 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
init_ctrl, unskipped_mask);
else
init_ctrl = unskipped_mask;
+ gcc_assert (vect_for_masking);
}
+ /* First iteration is full. */
if (!init_ctrl)
- /* First iteration is full. */
- init_ctrl = build_minus_one_cst (ctrl_type);
+ {
+ if (vect_for_masking)
+ init_ctrl = build_minus_one_cst (ctrl_type);
+ else
+ init_ctrl = length_limit;
+ }
/* Get the control value for the next iteration of the loop. */
- next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
- gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
- gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
+ if (vect_for_masking)
+ {
+ next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
+ gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
+ gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
+ }
+ else
+ {
+ next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
+ gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
+ length_limit);
+ gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
+ }
vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
}
@@ -652,6 +689,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
gimple_seq preheader_seq = NULL;
gimple_seq header_seq = NULL;
+ bool vect_for_masking = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
unsigned int compare_precision = TYPE_PRECISION (compare_type);
tree orig_niters = niters;
@@ -686,28 +724,30 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
tree test_ctrl = NULL_TREE;
rgroup_controls *rgc;
unsigned int i;
- vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
- FOR_EACH_VEC_ELT (*masks, i, rgc)
+ auto_vec<rgroup_controls> *controls = vect_for_masking
+ ? &LOOP_VINFO_MASKS (loop_vinfo)
+ : &LOOP_VINFO_LENS (loop_vinfo);
+ FOR_EACH_VEC_ELT (*controls, i, rgc)
if (!rgc->controls.is_empty ())
{
/* First try using permutes. This adds a single vector
instruction to the loop for each mask, but needs no extra
loop invariants or IVs. */
unsigned int nmasks = i + 1;
- if ((nmasks & 1) == 0)
+ if (vect_for_masking && (nmasks & 1) == 0)
{
- rgroup_controls *half_rgc = &(*masks)[nmasks / 2 - 1];
+ rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
if (!half_rgc->controls.is_empty ()
&& vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
continue;
}
/* See whether zero-based IV would ever generate all-false masks
- before wrapping around. */
+	 or zero lengths before wrapping around.  */
+ unsigned nscalars_ft = rgc->max_nscalars_per_iter * rgc->factor;
bool might_wrap_p
= (iv_limit == -1
- || (wi::min_precision (iv_limit * rgc->max_nscalars_per_iter,
- UNSIGNED)
+ || (wi::min_precision (iv_limit * nscalars_ft, UNSIGNED)
> compare_precision));
/* Set up all controls for this group. */
@@ -2568,7 +2608,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
if (vect_epilogues
&& LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
&& prolog_peeling >= 0
- && known_eq (vf, lowest_vf))
+ && known_eq (vf, lowest_vf)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (epilogue_vinfo))
{
unsigned HOST_WIDE_INT eiters
= (LOOP_VINFO_INT_NITERS (loop_vinfo)
@@ -816,6 +816,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
vectorizable (false),
can_use_partial_vectors_p (true),
using_partial_vectors_p (false),
+ epil_using_partial_vectors_p (false),
peeling_for_gaps (false),
peeling_for_niter (false),
no_data_dependencies (false),
@@ -898,6 +899,7 @@ _loop_vec_info::~_loop_vec_info ()
free (bbs);
release_vec_loop_controls (&masks);
+ release_vec_loop_controls (&lens);
delete ivexpr_map;
delete scan_map;
epilogue_vinfos.release ();
@@ -1072,6 +1074,88 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
return true;
}
+/* Check whether we can use vector access with length based on precision
+   comparison.  So far, to keep it simple, we only allow the case in which
+   the precision of the length that the target supports is larger than the
+   precision required by the loop niters.  */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+ vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+
+ if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+ return false;
+
+  /* The rgroup with the largest nV should have the maximum bytes per
+     iteration.  */
+ rgroup_controls *rgl = &(*lens)[lens->length () - 1];
+
+ /* Work out how many bits we need to represent the length limit. */
+ unsigned int nscalars_per_iter_ft = rgl->max_nscalars_per_iter * rgl->factor;
+ unsigned int min_ni_prec
+ = vect_min_prec_for_max_niters (loop_vinfo, nscalars_per_iter_ft);
+
+  /* Now use the maximum of the precisions below for one suitable IV type:
+ - the IV's natural precision
+ - the precision needed to hold: the maximum number of scalar
+ iterations multiplied by the scale factor (min_ni_prec above)
+ - the Pmode precision
+ */
+
+  /* If min_ni_prec is less than the precision of the current niters,
+     we prefer to still use the niters type.  */
+ unsigned int ni_prec
+ = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
+ /* Prefer to use Pmode and wider IV to avoid narrow conversions. */
+ unsigned int pmode_prec = GET_MODE_BITSIZE (Pmode);
+
+ unsigned int required_prec = ni_prec;
+ if (required_prec < pmode_prec)
+ required_prec = pmode_prec;
+
+ tree iv_type = NULL_TREE;
+ if (min_ni_prec > required_prec)
+ {
+ opt_scalar_int_mode tmode_iter;
+ unsigned standard_bits = 0;
+ FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
+ {
+ scalar_mode tmode = tmode_iter.require ();
+ unsigned int tbits = GET_MODE_BITSIZE (tmode);
+
+	  /* ??? Do we really want to construct an IV whose precision exceeds
+	     BITS_PER_WORD?  */
+ if (tbits > BITS_PER_WORD)
+ break;
+
+ /* Find the first available standard integral type. */
+ if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
+ {
+ standard_bits = tbits;
+ break;
+ }
+ }
+ if (standard_bits != 0)
+ iv_type = build_nonstandard_integer_type (standard_bits, true);
+ }
+ else
+ iv_type = build_nonstandard_integer_type (required_prec, true);
+
+ if (!iv_type)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize with length-based partial vectors"
+ " due to no suitable iv type.\n");
+ return false;
+ }
+
+ LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+
+ return true;
+}
+
/* Calculate the cost of one scalar iteration of the loop. */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
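As a worked example of the precision selection in vect_verify_loop_lens
above: on a 64-bit target (64-bit Pmode) with a 32-bit niters type,
required_prec is max (32, 64) = 64.  If the largest rgroup needs, say,
min_ni_prec = 36 bits, that is already covered, so a 64-bit unsigned type
serves as both the compare type and the IV type.  Only if min_ni_prec
exceeded BITS_PER_WORD would the mode search fail and length-based partial
vectors be rejected.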
@@ -2170,11 +2254,64 @@ start_over:
return ok;
}
- /* Decide whether to use a fully-masked loop for this vectorization
- factor. */
- LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
- = (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
- && vect_verify_full_masking (loop_vinfo));
+  /* For now, we don't expect to mix the masking and length approaches for
+     one loop, so disable the use of partial vectors if both are recorded.  */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+ && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize a loop with partial vectors"
+ " because we don't expect to mix different"
+ " approaches with partial vectors for the"
+ " same loop.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+
+ /* Decide whether to vectorize a loop with partial vectors for
+ this vectorization factor. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ /* Decide whether to use fully-masked approach. */
+ if (vect_verify_full_masking (loop_vinfo))
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ /* Decide whether to use length-based approach. */
+ else if (vect_verify_loop_lens (loop_vinfo))
+ {
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize this loop with length-based"
+			       " partial vectors approach because peeling"
+ " for alignment or gaps is required.\n");
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else if (param_vect_with_length_scope == 0)
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+	  /* The epilogue and other loops whose known iteration count is
+	     smaller than VF can still fully use vector access with length.  */
+ else if (param_vect_with_length_scope == 1
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && !vect_known_niters_smaller_than_vf (loop_vinfo))
+ {
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ }
+ else
+ {
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+
if (dump_enabled_p ())
{
if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
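The length-based decision above (reached when full masking is unavailable)
boils down to the following sketch:

    peeling for gaps/alignment required -> partial vectors rejected
    scope 0: USING_PARTIAL_VECTORS_P = false
    scope 1: epilogue or known niters < VF -> true;
	     otherwise false, with EPIL_USING_PARTIAL_VECTORS_P set so
	     that the epilogue is retried with partial vectors
    scope 2: USING_PARTIAL_VECTORS_P = true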
@@ -2183,6 +2320,15 @@ start_over:
else
dump_printf_loc (MSG_NOTE, vect_location,
"not using a fully-masked loop.\n");
+
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "using length-based partial"
+			 " vectors for the loop fully.\n");
+      else
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "not using length-based partial"
+			 " vectors for the loop fully.\n");
}
/* If epilog loop is required because of data accesses with gaps,
@@ -2406,6 +2552,7 @@ again:
= init_cost (LOOP_VINFO_LOOP (loop_vinfo));
/* Reset accumulated rgroup information. */
release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+ release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
/* Reset assorted flags. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
@@ -2692,7 +2839,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
lowest_th = ordered_min (lowest_th, th);
}
else
- delete loop_vinfo;
+ {
+ delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
+ }
/* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2717,6 +2867,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
else
{
delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
if (fatal)
{
gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2724,6 +2875,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
}
}
+  /* Handle the case in which the original loop can use partial
+     vectorization, but we only want to adopt it for the epilogue.
+     The retry should use the same vector mode as the original.  */
+ if (vect_epilogues
+ && loop_vinfo
+ && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "***** Re-trying analysis with same vector mode"
+ " %s for epilogue with partial vectors.\n",
+ GET_MODE_NAME (loop_vinfo->vector_mode));
+ continue;
+ }
+
if (mode_i < vector_modes.length ()
&& VECTOR_MODE_P (autodetected_vector_mode)
&& (related_vector_mode (vector_modes[mode_i],
@@ -3564,6 +3732,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
target_cost_data, num_masks - 1, vector_stmt,
NULL, NULL_TREE, 0, vect_body);
}
+ else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+ peel_iters_prologue = 0;
+ peel_iters_epilogue = 0;
+ }
else if (npeel < 0)
{
peel_iters_prologue = assumed_vf / 2;
@@ -8197,6 +8370,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
{
rgm->max_nscalars_per_iter = nscalars_per_iter;
rgm->type = truth_type_for (vectype);
+ rgm->factor = 1;
}
}
@@ -8249,6 +8423,64 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
return mask;
}
+/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
+   lengths for vector access with length, each of which controls a vector
+   of type VECTYPE.  */
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+ unsigned int nvectors, tree vectype)
+{
+ gcc_assert (nvectors != 0);
+ if (lens->length () < nvectors)
+ lens->safe_grow_cleared (nvectors);
+ rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* The number of scalars per iteration, the bytes occupied by each
+     scalar, and the number of vectors are all compile-time constants.  */
+ unsigned int nscalars_per_iter
+ = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+ if (rgl->max_nscalars_per_iter < nscalars_per_iter)
+ {
+ rgl->max_nscalars_per_iter = nscalars_per_iter;
+ rgl->type = vectype;
+      /* For now, the length-based approach measures the length in bytes.
+	 FIXME: adjust this if the length-based approach ever supports
+	 more, e.g. lengths in scalar counts.  */
+ rgl->factor = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+ }
+}
+
+/* Given a complete set of lengths LENS, extract length number INDEX for an
+ rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
+
+tree
+vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+ unsigned int nvectors, unsigned int index)
+{
+ rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* Populate the rgroup's control array, if this is the first time we've
+     used it.  */
+ if (rgl->controls.is_empty ())
+ {
+ rgl->controls.safe_grow_cleared (nvectors);
+ for (unsigned int i = 0; i < nvectors; ++i)
+ {
+ tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ gcc_assert (len_type != NULL_TREE);
+ tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+ /* Provide a dummy definition until the real one is available. */
+ SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+ rgl->controls[i] = len;
+ }
+ }
+
+ return rgl->controls[index];
+}
+
/* Scale profiling counters by estimation for LOOP which is vectorized
by factor VF. */
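A worked instance of the bookkeeping in vect_record_loop_len above:
recording a V4SI access with VF = 4 and nvectors = 1 yields
nscalars_per_iter = exact_div (1 * 4, 4) = 1 and factor = 4 (the size of
int in bytes), so this rgroup accounts for 4 bytes per scalar iteration
and its per-control length limit works out to 4 (subparts) * 4 (factor)
= 16 bytes, i.e. the vector size.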
@@ -1742,29 +1742,56 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
return;
}
- machine_mode mask_mode;
- if (!VECTOR_MODE_P (vecmode)
- || !targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
- || !can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
+ if (!VECTOR_MODE_P (vecmode))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because the target"
- " doesn't have the appropriate masked load or"
- " store.\n");
+			 "can't operate on partial vectors because of"
+			 " an unexpected vector mode.\n");
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
return;
}
- /* We might load more scalars than we need for permuting SLP loads.
- We checked in get_group_load_store_type that the extra elements
- don't leak into a new vector. */
+
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
unsigned int nvectors;
- if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
- vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
- else
- gcc_unreachable ();
+
+ machine_mode mask_mode;
+ bool with_partial_vectors_p = false;
+ if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
+ && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
+ {
+ /* We might load more scalars than we need for permuting SLP loads.
+ We checked in get_group_load_store_type that the extra elements
+ don't leak into a new vector. */
+ if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
+ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
+ scalar_mask);
+ else
+ gcc_unreachable ();
+ with_partial_vectors_p = true;
+ }
+
+ optab op = is_load ? lenload_optab : lenstore_optab;
+ if (optab_handler (op, vecmode) != CODE_FOR_nothing)
+ {
+ vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+ if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
+ vect_record_loop_len (loop_vinfo, lens, nvectors, vectype);
+ else
+ gcc_unreachable ();
+ with_partial_vectors_p = true;
+ }
+
+ if (!with_partial_vectors_p)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't operate on partial vectors because the"
+ " target doesn't have the appropriate partial"
+			 " vectorization load or store.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
}
/* Return the mask input to a masked load or store. VEC_MASK is the vectorized
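When only the length-based path is available, vectorizable_load and
vectorizable_store below emit internal calls of the following shape
(illustrative GIMPLE; the SSA names are invented):

    # loop_len_9 is set by the sequence generated in vect_gen_len
    vect__4.8_20 = .LEN_LOAD (vectp_src.6_18, 4B, loop_len_9);
    .LEN_STORE (vectp_dst.10_24, 4B, loop_len_9, vect__4.8_20);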
@@ -7655,6 +7682,14 @@ vectorizable_store (vec_info *vinfo,
= (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
? &LOOP_VINFO_MASKS (loop_vinfo)
: NULL);
+ vec_loop_lens *loop_lens
+ = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
+ ? &LOOP_VINFO_LENS (loop_vinfo)
+ : NULL);
+
+  /* We shouldn't use the length-based approach if the loop is fully
+     masked.  */
+  gcc_assert (!loop_lens || !loop_masks);
+
/* Targets with store-lane instructions must not require explicit
realignment. vect_supportable_dr_alignment always returns either
dr_aligned or dr_unaligned_supported for masked operations. */
@@ -7911,10 +7946,16 @@ vectorizable_store (vec_info *vinfo,
unsigned HOST_WIDE_INT align;
tree final_mask = NULL_TREE;
+ tree final_len = NULL_TREE;
if (loop_masks)
final_mask = vect_get_loop_mask (gsi, loop_masks,
vec_num * ncopies,
vectype, vec_num * j + i);
+ else if (loop_lens)
+ final_len = vect_get_loop_len (loop_vinfo, loop_lens,
+ vec_num * ncopies,
+ vec_num * j + i);
+
if (vec_mask)
final_mask = prepare_load_store_mask (mask_vectype, final_mask,
vec_mask, gsi);
@@ -7994,6 +8035,17 @@ vectorizable_store (vec_info *vinfo,
vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
new_stmt = call;
}
+ else if (final_len)
+ {
+ align = least_bit_hwi (misalign | align);
+ tree ptr = build_int_cst (ref_type, align);
+ gcall *call
+ = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
+ ptr, final_len, vec_oprnd);
+ gimple_call_set_nothrow (call, true);
+ vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+ new_stmt = call;
+ }
else
{
data_ref = fold_build2 (MEM_REF, vectype,
@@ -8531,6 +8583,7 @@ vectorizable_load (vec_info *vinfo,
tree dr_offset;
gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+ gcc_assert (!LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
gcc_assert (!nested_in_vect_loop);
if (grouped_load)
@@ -8819,6 +8872,14 @@ vectorizable_load (vec_info *vinfo,
= (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
? &LOOP_VINFO_MASKS (loop_vinfo)
: NULL);
+ vec_loop_lens *loop_lens
+ = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
+ ? &LOOP_VINFO_LENS (loop_vinfo)
+ : NULL);
+
+  /* We shouldn't use the length-based approach if the loop is fully
+     masked.  */
+  gcc_assert (!loop_lens || !loop_masks);
+
/* Targets with store-lane instructions must not require explicit
realignment. vect_supportable_dr_alignment always returns either
dr_aligned or dr_unaligned_supported for masked operations. */
@@ -9134,11 +9195,18 @@ vectorizable_load (vec_info *vinfo,
for (i = 0; i < vec_num; i++)
{
tree final_mask = NULL_TREE;
+ tree final_len = NULL_TREE;
if (loop_masks
&& memory_access_type != VMAT_INVARIANT)
final_mask = vect_get_loop_mask (gsi, loop_masks,
vec_num * ncopies,
vectype, vec_num * j + i);
+ else if (loop_lens
+ && memory_access_type != VMAT_INVARIANT)
+ final_len = vect_get_loop_len (loop_vinfo, loop_lens,
+ vec_num * ncopies,
+ vec_num * j + i);
+
if (vec_mask)
final_mask = prepare_load_store_mask (mask_vectype, final_mask,
vec_mask, gsi);
@@ -9207,6 +9275,18 @@ vectorizable_load (vec_info *vinfo,
new_stmt = call;
data_ref = NULL_TREE;
}
+ else if (final_len)
+ {
+ align = least_bit_hwi (misalign | align);
+ tree ptr = build_int_cst (ref_type, align);
+ gcall *call
+ = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+ dataref_ptr, ptr,
+ final_len);
+ gimple_call_set_nothrow (call, true);
+ new_stmt = call;
+ data_ref = NULL_TREE;
+ }
else
{
tree ltype = vectype;
@@ -9850,11 +9930,30 @@ vectorizable_condition (vec_info *vinfo,
return false;
}
- if (loop_vinfo
- && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
- && reduction_type == EXTRACT_LAST_REDUCTION)
- vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
- ncopies * vec_num, vectype, NULL);
+ if (loop_vinfo && for_reduction
+ && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ if (reduction_type == EXTRACT_LAST_REDUCTION)
+ vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
+ ncopies * vec_num, vectype, NULL);
+      /* Using partial vectors can introduce inactive lanes in the last
+	 iteration, and operating on a full vector of condition results
+	 is unsafe then.  But if we can AND the condition mask with the
+	 loop mask, it is safe.  */
+ else if (!loop_vinfo->scalar_cond_masked_set.is_empty ())
+ {
+ scalar_cond_masked_key cond (cond_expr, ncopies * vec_num);
+ if (!loop_vinfo->scalar_cond_masked_set.contains (cond))
+ {
+ bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+ cond.code = invert_tree_comparison (cond.code, honor_nans);
+ if (!loop_vinfo->scalar_cond_masked_set.contains (cond))
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ }
+ else
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
vect_model_simple_cost (vinfo, stmt_info, ncopies, dts, ndts, slp_node,
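A concrete case of the hazard guarded against above: in a conditional
reduction such as

    for (int i = 0; i < n; i++)
      res = a[i] < b[i] ? res + c[i] : res;

the comparison is computed on a full vector, so in the last iteration the
inactive lanes could select the "res + c[i]" arm and corrupt the reduction
result unless the condition mask is ANDed with the loop mask; hence partial
vectors are kept only when the scalar condition is already in
scalar_cond_masked_set, directly or in inverted form.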
@@ -11910,3 +12009,36 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
*nunits_vectype_out = nunits_vectype;
return opt_result::success ();
}
+
+/* Generate and return a statement sequence that sets vector length LEN, that is:
+
+   min_of_start_and_end = min (START_INDEX, END_INDEX);
+   left_len = END_INDEX - min_of_start_and_end;
+   rhs = min (left_len, LEN_LIMIT);
+   LEN = rhs;
+
+   TODO: for now, the rs6000 vector-with-length support only cares about the
+   low 8 bits of the length, which means a left_len in bytes larger than 255
+   can't be saturated to the vector limit (vector size).  One target hook can
+   be provided if other ports don't suffer from this.  */
+
+gimple_seq
+vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)
+{
+ gimple_seq stmts = NULL;
+ tree len_type = TREE_TYPE (len);
+ gcc_assert (TREE_TYPE (start_index) == len_type);
+
+ tree min = fold_build2 (MIN_EXPR, len_type, start_index, end_index);
+ tree left_len = fold_build2 (MINUS_EXPR, len_type, end_index, min);
+ left_len = fold_build2 (MIN_EXPR, len_type, left_len, len_limit);
+
+ tree rhs = force_gimple_operand (left_len, &stmts, true, NULL_TREE);
+ gimple *new_stmt = gimple_build_assign (len, rhs);
+ gimple_stmt_iterator i = gsi_last (stmts);
+ gsi_insert_after_without_update (&i, new_stmt, GSI_CONTINUE_LINKING);
+
+ return stmts;
+}
+
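For instance, with LEN_LIMIT = 16 (one 16-byte vector) and a total of 10
bytes to process: the first iteration has START_INDEX = 0 and
END_INDEX = 10, giving min = 0, left_len = 10 and LEN = min (10, 16) = 10;
an iteration starting at byte 16 would give min = 10, left_len = 0 and
LEN = 0, i.e. no active lanes.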
@@ -417,6 +417,16 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i)
are compile-time constants but VF and nL can be variable (if the target
supports variable-length vectors).
+   Moreover, an approach with partial vectors that is controlled by lengths
+   (in bytes) cares about the number of bytes each scalar occupies.  Provided
+   that each scalar occupies "factor" bytes, the total number of scalar bytes
+   becomes factor * N, and the above equation becomes:
+
+       factor * N = factor * nS * VF = factor * nV * nL
+
+   where factor * nS is the number of bytes accessed per scalar iteration
+   and factor * nL is the vector size in bytes.
+
In classical vectorization, each iteration of the vector loop would
handle exactly VF iterations of the original scalar loop. However,
in vector loops that are able to operate on partial vectors, a
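As a concrete instance of the factor equation above: an int copy loop
vectorized with V4SI has factor = 4, nS = 1, VF = 4, nV = 1 and nL = 4,
so factor * N = 4 * 4 = 16 bytes per vector iteration, which matches
factor * nL, the vector size in bytes.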
@@ -473,14 +483,19 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i)
first level being indexed by nV - 1 (since nV == 0 doesn't exist) and
the second being indexed by the mask index 0 <= i < nV. */
-/* The controls (like masks) needed by rgroups with nV vectors,
+/* The controls (like masks, lengths) needed by rgroups with nV vectors,
according to the description above. */
struct rgroup_controls {
/* The largest nS for all rgroups that use these controls. */
unsigned int max_nscalars_per_iter;
- /* The type of control to use, based on the highest nS recorded above.
- For mask-based approach, it's used for mask_type. */
+  /* For now, this is mainly used by the length-based (in bytes) approach;
+     it records the number of bytes occupied by each scalar.  */
+ unsigned int factor;
+
+  /* This type is based on the highest nS recorded above.
+     For the mask-based approach, it records the mask type to use.
+     For the length-based approach, it records the appropriate vector type.  */
tree type;
/* A vector of nV controls, in iteration order. */
@@ -489,6 +504,8 @@ struct rgroup_controls {
typedef auto_vec<rgroup_controls> vec_loop_masks;
+typedef auto_vec<rgroup_controls> vec_loop_lens;
+
typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
/*-----------------------------------------------------------------*/
@@ -536,6 +553,10 @@ public:
on inactive scalars. */
vec_loop_masks masks;
+  /* The lengths that a loop with length-based partial vectors should use
+     to avoid operating on inactive scalars.  */
+ vec_loop_lens lens;
+
/* Set of scalar conditions that have loop mask applied. */
scalar_cond_masked_set_type scalar_cond_masked_set;
@@ -644,6 +665,10 @@ public:
the vector loop can handle fewer than VF scalars. */
bool using_partial_vectors_p;
+  /* True if we've decided to use partially-populated vectors for the
+     epilogue of this loop; for now this is supported only by the
+     length-based approach.  */
+ bool epil_using_partial_vectors_p;
+
/* When we have grouped data accesses with gaps, we may introduce invalid
memory accesses. We peel the last iteration of the loop to prevent
this. */
@@ -707,9 +732,12 @@ public:
#define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
#define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
#define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
+ (L)->epil_using_partial_vectors_p
#define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor
#define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor
#define LOOP_VINFO_MASKS(L) (L)->masks
+#define LOOP_VINFO_LENS(L) (L)->lens
#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
#define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type
#define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type
@@ -747,6 +775,10 @@ public:
(LOOP_VINFO_USING_PARTIAL_VECTORS_P (L) \
&& !LOOP_VINFO_MASKS (L).is_empty ())
+#define LOOP_VINFO_FULLY_WITH_LENGTH_P(L) \
+ (LOOP_VINFO_USING_PARTIAL_VECTORS_P (L) \
+ && !LOOP_VINFO_LENS (L).is_empty ())
+
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
((L)->may_misalign_stmts.length () > 0)
#define LOOP_REQUIRES_VERSIONING_FOR_ALIAS(L) \
@@ -1866,6 +1898,11 @@ extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
unsigned int, tree, tree);
extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
+extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
+ tree);
+extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
+ unsigned int);
+extern gimple_seq vect_gen_len (tree, tree, tree, tree);
extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
/* Drive for loop transformation stage. */