@@ -120,6 +120,7 @@ _slp_tree::_slp_tree ()
SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
SLP_TREE_CODE (this) = ERROR_MARK;
+ this->ldst_lanes = false;
SLP_TREE_VECTYPE (this) = NULL_TREE;
SLP_TREE_REPRESENTATIVE (this) = NULL;
SLP_TREE_REF_COUNT (this) = 1;
@@ -3601,9 +3602,22 @@ vect_build_slp_instance (vec_info *vinfo,
size >= 1. */
else if (is_a <loop_vec_info> (vinfo)
&& (i > 0 && i < group_size)
- && !vect_slp_prefer_store_lanes_p (vinfo,
- stmt_info, group_size, i))
- {
+ /*&& !vect_slp_prefer_store_lanes_p (vinfo,
+ stmt_info, group_size, i)*/)
+ {
+ /* There are targets that cannot do even/odd interleaving schemes
+ so they absolutely need to use load/store-lanes. For now
+ force single-lane SLP for them - they would be happy with
+ uniform power-of-two lanes (but depending on element size),
+ but even if we could use 'i' as an indicator we would need to
+ backtrack when later lanes fail to be discovered with the same
+ granularity. */
+ bool want_store_lanes
+ = vect_slp_prefer_store_lanes_p (vinfo,
+ stmt_info, group_size, 1 /*i*/);
+ if (want_store_lanes)
+ i = 1;
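+ /* Splitting at 1 gives a single-lane head group; the remaining
+ lanes are split into single-lane pieces below as well. */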
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
@@ -3637,7 +3651,10 @@ vect_build_slp_instance (vec_info *vinfo,
(max_nunits, end - start));
rhs_nodes.safe_push (node);
start = end;
- end = group_size;
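+ /* When using store-lanes keep splitting the remaining group
+ into single-lane pieces. */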
+ if (want_store_lanes)
+ end = start + 1;
+ else
+ end = group_size;
}
else
{
@@ -3711,7 +3728,14 @@ vect_build_slp_instance (vec_info *vinfo,
we end up with two inputs. That scheme makes sure we end
up with permutes satisfying the restriction of requiring at
most two vector inputs to produce a single vector output
- when the number of lanes is even. */
+ when the number of lanes is even.
+ Leave the merge as is when we want to use store-lanes. */
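+ /* Mark both the permute and the store node so the store side
+ can select the load/store-lanes access scheme and the permute
+ is code generated as stores into a vector array. */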
+ if (want_store_lanes)
+ {
+ perm->ldst_lanes = true;
+ node->ldst_lanes = true;
+ }
+ else
while (SLP_TREE_CHILDREN (perm).length () > 2)
{
/* When we have three equal sized groups left the pairwise
@@ -4057,6 +4081,36 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
return;
+ /* Verify if all load permutations can be implemented with a suitably
+ large element load-lanes operation. */
+ unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
+ if (exact_log2 (ld_lanes_lanes) == -1
+ /* ??? Need to handle ld_lanes_lanes != 1 correctly. */
+ || !vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
+ group_lanes, false))
+ ld_lanes_lanes = 0;
+ else
+ for (slp_tree load : loads)
+ {
+ if (SLP_TREE_LANES (load) != ld_lanes_lanes)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
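+ /* Each load has to cover an aligned, contiguous chunk of
+ ld_lanes_lanes lanes of the group. */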
+ unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
+ if (first % ld_lanes_lanes != 0)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
+ if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ }
+
for (slp_tree load : loads)
{
/* Leave masked or gather loads alone for now. */
@@ -4071,7 +4125,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
with a non-1:1 load permutation around instead of canonicalizing
those into a load and a permute node. Removing this early
check would do such canonicalization. */
- if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+ if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
+ && ld_lanes_lanes == 0)
continue;
/* First build (and possibly re-use) a load node for the
@@ -4104,6 +4159,12 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
final_perm.quick_push
(std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
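+ /* When using load-lanes mark the full-group load l0 and this
+ load; the former is code generated as a .LOAD_LANES call and
+ the latter as extraction of its lane group from the resulting
+ vector array. */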
+ if (ld_lanes_lanes != 0)
+ {
+ l0->ldst_lanes = true;
+ load->ldst_lanes = true;
+ }
+ else
while (1)
{
unsigned group_lanes = SLP_TREE_LANES (l0);
@@ -9705,6 +9766,8 @@ vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
node->push_vec_def (perm_stmt);
}
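+/* ??? create_vector_array is made non-static in tree-vect-stmts.cc
+ below instead of exporting it via tree-vectorizer.h. */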
+extern tree create_vector_array (tree elem_type, unsigned HOST_WIDE_INT nelems);
+
/* Subroutine of vectorizable_slp_permutation. Check whether the target
can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
If GSI is nonnull, emit the permutation there.
@@ -9758,6 +9821,77 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
gcc_assert (perm.length () == SLP_TREE_LANES (node));
+ /* Load/store-lanes permute. */
+ if (node->ldst_lanes)
+ {
+ if (!gsi)
+ /* ??? We should have this flag set on the permute before a store
+ or the permutes after a load. Check vect_load_lanes_supported
+ or leave that to the store? */
+ return 1;
+ /* For the load case there should be a single child and
+ its vector def should be the virtual SSA def of the
+ .LOAD_LANES call. Create the vect__5.7_48 = vect_array.6[n]
+ load here plus punning of the vector type (in case an element
+ represents multiple lanes). */
+ if (children.length () == 1)
+ {
+ slp_tree child = children[0];
+ unsigned vec_num = (SLP_TREE_LANE_PERMUTATION (node)[0].second
+ / SLP_TREE_LANES (node));
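+ /* The lane permutation selects which element of the .LOAD_LANES
+ result array this node extracts. */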
+ for (tree def : SLP_TREE_VEC_DEFS (child))
+ {
+ gcc_assert (virtual_operand_p (def));
+ tree decl = gimple_call_lhs (SSA_NAME_DEF_STMT (def));
+ tree vec = make_ssa_name (TREE_TYPE (TREE_TYPE (decl)));
+ gassign *new_stmt
+ = gimple_build_assign (vec,
+ build4 (ARRAY_REF, TREE_TYPE (vec),
+ decl, build_int_cstu
+ (unsigned_type_node, vec_num),
+ NULL_TREE, NULL_TREE));
+ vect_finish_stmt_generation (vinfo, NULL, new_stmt, gsi);
+ if (!useless_type_conversion_p (vectype, TREE_TYPE (vec)))
+ {
+ vec = make_ssa_name (vectype);
+ new_stmt = gimple_build_assign (vec,
+ build1 (VIEW_CONVERT_EXPR, vectype,
+ gimple_assign_lhs (new_stmt)));
+ vect_finish_stmt_generation (vinfo, NULL, new_stmt, gsi);
+ }
+ node->push_vec_def (vec);
+ }
+ return SLP_TREE_VEC_DEFS (node).length ();
+ }
+ /* For the store case create the array and the set of stores from
+ all children's vector defs. The vector def should then be the
+ virtual def of the last store into the array. */
+ else
+ {
+ unsigned vec_num = SLP_TREE_VEC_DEFS (children[0]).length ();
+ for (unsigned i = 0; i < vec_num; ++i)
+ {
+ tree decl = create_vector_array (op_vectype,
+ SLP_TREE_LANES (node)
+ / SLP_TREE_LANES (children[0]));
+ tree last_vdef = NULL_TREE;
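+ /* Store each child's vector def into its slot of the array and
+ record the virtual def of the store; vectorizable_store picks
+ up the array from that def for the .STORE_LANES call. */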
+ for (unsigned j = 0; j < children.length (); ++j)
+ {
+ slp_tree child = children[j];
+ tree ref = build4 (ARRAY_REF, op_vectype, decl,
+ build_int_cstu (unsigned_type_node, j),
+ NULL_TREE, NULL_TREE);
+ gimple *new_stmt
+ = gimple_build_assign (ref, SLP_TREE_VEC_DEFS (child)[i]);
+ vect_finish_stmt_generation (vinfo, NULL, new_stmt, gsi);
+ last_vdef = make_ssa_name (gimple_vop (cfun));
+ SSA_NAME_DEF_STMT (last_vdef) = new_stmt;
+ gimple_set_vdef (new_stmt, last_vdef);
+ }
+ node->push_vec_def (last_vdef);
+ }
+ return vec_num;
+ }
+ }
+
/* REPEATING_P is true if every output vector is guaranteed to use the
same permute vector. We can handle that case for both variable-length
and constant-length vectors, but we only handle other cases for
@@ -146,7 +146,7 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
/* Return a variable of type ELEM_TYPE[NELEMS]. */
-static tree
+tree
create_vector_array (tree elem_type, unsigned HOST_WIDE_INT nelems)
{
return create_tmp_var (build_array_type_nelts (elem_type, nelems),
@@ -2069,6 +2069,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
is irrelevant for them. */
*alignment_support_scheme = dr_unaligned_supported;
}
+ /* Try using LOAD/STORE_LANES. */
+ else if (slp_node->ldst_lanes
+ && (*lanes_ifn
+ = (vls_type == VLS_LOAD
+ ? vect_load_lanes_supported (vectype, group_size, masked_p)
+ : vect_store_lanes_supported (vectype, group_size,
+ masked_p))) != IFN_LAST)
+ *memory_access_type = VMAT_LOAD_STORE_LANES;
else
*memory_access_type = VMAT_CONTIGUOUS;
@@ -8762,7 +8770,7 @@ vectorizable_store (vec_info *vinfo,
if (memory_access_type == VMAT_LOAD_STORE_LANES)
{
- gcc_assert (!slp && grouped_store);
+ //gcc_assert (!slp && grouped_store);
unsigned inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector stores, we'd like to cost with
the total number of them once instead of cost each one by one. */
@@ -8784,7 +8792,7 @@ vectorizable_store (vec_info *vinfo,
op = vect_get_store_rhs (next_stmt_info);
if (costing_p)
update_prologue_cost (&prologue_cost, op);
- else
+ else if (!slp)
{
vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
ncopies, op,
@@ -8799,15 +8807,15 @@ vectorizable_store (vec_info *vinfo,
{
if (mask)
{
- vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
- mask, &vec_masks,
- mask_vectype);
+ if (slp_node)
+ vect_get_slp_defs (mask_node, &vec_masks);
+ else
+ vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
+ mask, &vec_masks,
+ mask_vectype);
vec_mask = vec_masks[0];
}
- /* We should have catched mismatched types earlier. */
- gcc_assert (
- useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
dataref_ptr
= vect_create_data_ref_ptr (vinfo, first_stmt_info,
aggr_type, NULL, offset, &dummy,
@@ -8819,11 +8827,17 @@ vectorizable_store (vec_info *vinfo,
gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
/* DR_CHAIN is then used as an input to
vect_permute_store_chain(). */
+ if (!slp)
+ {
+ /* We should have caught mismatched types earlier. */
+ gcc_assert (
+ useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
for (i = 0; i < group_size; i++)
{
vec_oprnd = (*gvec_oprnds[i])[j];
dr_chain[i] = vec_oprnd;
}
+ }
if (mask)
vec_mask = vec_masks[j];
dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
@@ -8836,8 +8850,16 @@ vectorizable_store (vec_info *vinfo,
continue;
}
+ tree vec_array;
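+ /* For SLP the array was created and filled by the store-lanes
+ permute node; recover it from the defining store of the
+ recorded virtual def. */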
+ if (slp)
+ {
+ tree def = SLP_TREE_VEC_DEFS (SLP_TREE_CHILDREN (slp_node)[0])[j];
+ vec_array
+ = TREE_OPERAND (gimple_assign_lhs (SSA_NAME_DEF_STMT (def)), 0);
+ }
+ else
+ {
/* Get an array into which we can store the individual vectors. */
- tree vec_array = create_vector_array (vectype, vec_num);
+ vec_array = create_vector_array (vectype, vec_num);
/* Invalidate the current contents of VEC_ARRAY. This should
become an RTL clobber too, which prevents the vector registers
@@ -8851,6 +8873,7 @@ vectorizable_store (vec_info *vinfo,
write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
i);
}
+ }
tree final_mask = NULL;
tree final_len = NULL;
@@ -8917,9 +8940,10 @@ vectorizable_store (vec_info *vinfo,
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- if (j == 0)
+ if (j == 0 && !slp)
*vec_stmt = new_stmt;
- STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ if (!slp)
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
if (costing_p)
@@ -10765,7 +10789,7 @@ vectorizable_load (vec_info *vinfo,
{
gcc_assert (alignment_support_scheme == dr_aligned
|| alignment_support_scheme == dr_unaligned_supported);
- gcc_assert (grouped_load && !slp);
+ //gcc_assert (grouped_load && !slp);
unsigned int inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector loads, we'd like to cost with
@@ -10884,6 +10908,15 @@ vectorizable_load (vec_info *vinfo,
gimple_call_set_nothrow (call, true);
vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
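+ /* For SLP just record a virtual def for the .LOAD_LANES call;
+ the lane extraction is code generated by the permute nodes
+ using this node as child. */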
+ if (slp)
+ {
+ tree def = copy_ssa_name (gimple_vuse (call));
+ gimple_set_vdef (call, def);
+ SSA_NAME_DEF_STMT (def) = call;
+ slp_node->push_vec_def (def);
+ }
+ else
+ {
dr_chain.create (vec_num);
/* Extract each vector into an SSA_NAME. */
for (i = 0; i < vec_num; i++)
@@ -10900,8 +10933,10 @@ vectorizable_load (vec_info *vinfo,
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
dr_chain.release ();
+ }
- *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
if (costing_p)
@@ -222,6 +222,9 @@ struct _slp_tree {
unsigned int lanes;
/* The operation of this node. */
enum tree_code code;
+ /* Whether uses of this load or feeders of this store are suitable
+ for load/store-lanes. */
+ bool ldst_lanes;
int vertex;