new file mode 100644
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_int_mult } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
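+/* foo is an interleaved (even/odd) store fed from two contiguous loads,
+   bar an interleaved load feeding a reduction and baz an interleaved
+   load combined with an interleaved store.  All three loops should be
+   SLP vectorized, using store-lanes and/or load-lanes for the
+   interleaved accesses on targets that support them.  */
+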
+void foo (int * __restrict a, int *b, int *c)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[2*i] = b[i] + 7;
+ a[2*i+1] = c[i] * 3;
+ }
+}
+
+int bar (int *b)
+{
+ int res = 0;
+ for (int i = 0; i < 1024; ++i)
+ {
+ res += b[2*i] + 7;
+ res += b[2*i+1] * 3;
+ }
+ return res;
+}
+
+void baz (int * __restrict a, int *b)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[2*i] = b[2*i] + 7;
+ a[2*i+1] = b[2*i+1] * 3;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "LOAD_LANES" 2 "optimized" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "STORE_LANES" 2 "optimized" { target vect_load_lanes } } } */
@@ -2957,82 +2957,6 @@ start_over:
"unsupported SLP instances\n");
goto again;
}
-
- /* Check whether any load in ALL SLP instances is possibly permuted. */
- slp_tree load_node, slp_root;
- unsigned i, x;
- slp_instance instance;
- bool can_use_lanes = true;
- FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
- {
- slp_root = SLP_INSTANCE_TREE (instance);
- int group_size = SLP_TREE_LANES (slp_root);
- tree vectype = SLP_TREE_VECTYPE (slp_root);
- bool loads_permuted = false;
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
- {
- if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
- continue;
- unsigned j;
- stmt_vec_info load_info;
- FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
- if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
- {
- loads_permuted = true;
- break;
- }
- }
-
- /* If the loads and stores can be handled with load/store-lane
- instructions record it and move on to the next instance. */
- if (loads_permuted
- && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
- && vect_store_lanes_supported (vectype, group_size, false)
- != IFN_LAST)
- {
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
- if (STMT_VINFO_GROUPED_ACCESS
- (SLP_TREE_REPRESENTATIVE (load_node)))
- {
- stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
- (SLP_TREE_REPRESENTATIVE (load_node));
- /* Use SLP for strided accesses (or if we can't
- load-lanes). */
- if (STMT_VINFO_STRIDED_P (stmt_vinfo)
- || vect_load_lanes_supported
- (STMT_VINFO_VECTYPE (stmt_vinfo),
- DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
- break;
- }
-
- can_use_lanes
- = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
-
- if (can_use_lanes && dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "SLP instance %p can use load/store-lanes\n",
- (void *) instance);
- }
- else
- {
- can_use_lanes = false;
- break;
- }
- }
-
- /* If all SLP instances can use load/store-lanes abort SLP and try again
- with SLP disabled. */
- if (can_use_lanes)
- {
- ok = opt_result::failure_at (vect_location,
- "Built SLP cancelled: can use "
- "load/store-lanes\n");
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "Built SLP cancelled: all SLP instances support "
- "load/store-lanes\n");
- goto again;
- }
}
/* Dissolve SLP-only groups. */
@@ -120,6 +120,7 @@ _slp_tree::_slp_tree ()
SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
SLP_TREE_CODE (this) = ERROR_MARK;
+ this->ldst_lanes = false;
SLP_TREE_VECTYPE (this) = NULL_TREE;
SLP_TREE_REPRESENTATIVE (this) = NULL;
SLP_TREE_REF_COUNT (this) = 1;
@@ -3600,10 +3601,24 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split the RHS into arbitrary pieces of
size >= 1. */
else if (is_a <loop_vec_info> (vinfo)
- && (i > 0 && i < group_size)
- && !vect_slp_prefer_store_lanes_p (vinfo,
- stmt_info, group_size, i))
- {
+ && (i > 0 && i < group_size))
+ {
+ /* Some targets cannot do even/odd interleaving schemes and thus
+ absolutely need load/store-lanes.  For now force single-lane SLP
+ for them - they would be happy with uniform power-of-two lane
+ counts (depending on the element size), but even if we could use
+ 'i' as an indicator we would need to backtrack when later lanes
+ fail to be discovered with the same granularity.  We cannot turn
+ .MASK_STORE or scatter stores into store-lanes.  */
+ bool want_store_lanes
+ = (! is_a <gcall *> (stmt_info->stmt)
+ && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+ && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
+ group_size, 1));
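+ /* Forcing the split position to 1 peels off single-lane pieces so
+ each lane of the store group ends up in its own RHS node feeding
+ the store-lanes store node built below.  */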
+ if (want_store_lanes)
+ i = 1;
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
@@ -3637,7 +3652,10 @@ vect_build_slp_instance (vec_info *vinfo,
(max_nunits, end - start));
rhs_nodes.safe_push (node);
start = end;
- end = group_size;
+ if (want_store_lanes)
+ end = start + 1;
+ else
+ end = group_size;
}
else
{
@@ -3676,6 +3694,18 @@ vect_build_slp_instance (vec_info *vinfo,
SLP_TREE_CHILDREN
(rhs_nodes[0]).length ());
SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ if (want_store_lanes)
+ {
+ /* For store-lanes feed the store node with all RHS nodes
+ in order. We cannot handle .MASK_STORE here. */
+ gcc_assert (SLP_TREE_CHILDREN (rhs_nodes[0]).length () == 1);
+ node->ldst_lanes = true;
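+ /* Child J then provides the vector defs for member J of each
+ store-lanes store.  */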
+ SLP_TREE_CHILDREN (node).reserve_exact (rhs_nodes.length ());
+ for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+ SLP_TREE_CHILDREN (node)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ }
+ else
for (unsigned l = 0;
l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
{
@@ -4057,6 +4087,42 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
return;
+ /* Check whether all load permutations can be implemented with a
+ suitably large element load-lanes operation.  */
+ unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
+ if (exact_log2 (ld_lanes_lanes) == -1
+ /* ??? For now only support the single-lane case as there is
+ missing support on the store-lane side and code generation
+ isn't up to the task yet. */
+ || ld_lanes_lanes != 1
+ || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
+ group_lanes / ld_lanes_lanes,
+ false) == IFN_LAST)
+ ld_lanes_lanes = 0;
+ else
+ /* Verify the loads access the same number of lanes aligned to
+ ld_lanes_lanes. */
+ for (slp_tree load : loads)
+ {
+ if (SLP_TREE_LANES (load) != ld_lanes_lanes)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
+ if (first % ld_lanes_lanes != 0)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
+ if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ }
+
for (slp_tree load : loads)
{
/* Leave masked or gather loads alone for now. */
@@ -4071,7 +4137,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
with a non-1:1 load permutation around instead of canonicalizing
those into a load and a permute node. Removing this early
check would do such canonicalization. */
- if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+ if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
+ && ld_lanes_lanes == 0)
continue;
/* First build (and possibly re-use) a load node for the
@@ -4104,6 +4171,12 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
final_perm.quick_push
(std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+ if (ld_lanes_lanes != 0)
+ {
+ l0->ldst_lanes = true;
+ load->ldst_lanes = true;
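+ /* L0 is then code generated as a load-lanes; the lane permute
+ built from FINAL_PERM below merely forwards the vector defs
+ of the desired member (see vectorizable_slp_permutation_1).  */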
+ }
+ else
while (1)
{
unsigned group_lanes = SLP_TREE_LANES (l0);
@@ -9758,6 +9831,28 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
gcc_assert (perm.length () == SLP_TREE_LANES (node));
+ /* Load-lanes permute.  This permute only acts as a forwarder to
+ select the correct vector def of the load-lanes load, which has
+ the permuted vectors in its vector defs like
+ { v0, w0, r0, v1, w1, r1 ... } for a ld3.  */
+ if (node->ldst_lanes)
+ {
+ gcc_assert (children.length () == 1);
+ if (!gsi)
+ /* This is a trivial op always supported. */
+ return 1;
+ slp_tree child = children[0];
+ unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
+ / SLP_TREE_LANES (node));
+ unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
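+ /* The vector defs of the child are grouped per load-lanes
+ instruction; select the VEC_IDX'th def out of each group of
+ VEC_NUM defs.  */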
+ for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
+ {
+ tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
+ node->push_vec_def (def);
+ }
+ return 1;
+ }
+
/* REPEATING_P is true if every output vector is guaranteed to use the
same permute vector. We can handle that case for both variable-length
and constant-length vectors, but we only handle other cases for
@@ -1508,7 +1508,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
unsigned int nvectors;
if (slp_node)
- nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ /* ??? Incorrect for multi-lane load/store-lanes members. */
+ nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
else
nvectors = vect_get_num_copies (loop_vinfo, vectype);
@@ -2069,6 +2070,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
is irrelevant for them. */
*alignment_support_scheme = dr_unaligned_supported;
}
+ /* Try using LOAD/STORE_LANES. */
+ else if (slp_node->ldst_lanes
+ && (*lanes_ifn
+ = (vls_type == VLS_LOAD
+ ? vect_load_lanes_supported (vectype, group_size, masked_p)
+ : vect_store_lanes_supported (vectype, group_size,
+ masked_p))) != IFN_LAST)
+ *memory_access_type = VMAT_LOAD_STORE_LANES;
else
*memory_access_type = VMAT_CONTIGUOUS;
@@ -8705,7 +8714,7 @@ vectorizable_store (vec_info *vinfo,
else
{
if (memory_access_type == VMAT_LOAD_STORE_LANES)
- aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
else
aggr_type = vectype;
bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
@@ -8762,11 +8771,12 @@ vectorizable_store (vec_info *vinfo,
if (memory_access_type == VMAT_LOAD_STORE_LANES)
{
- gcc_assert (!slp && grouped_store);
unsigned inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector stores, we'd like to cost with
the total number of them once instead of cost each one by one. */
unsigned int n_adjacent_stores = 0;
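+ /* Each copy emits one store-lanes storing GROUP_SIZE vectors, so
+ with SLP the number of copies is the store node's number of
+ vector stmts divided by the group size.  */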
+ if (slp)
+ ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
for (j = 0; j < ncopies; j++)
{
gimple *new_stmt;
@@ -8784,7 +8794,7 @@ vectorizable_store (vec_info *vinfo,
op = vect_get_store_rhs (next_stmt_info);
if (costing_p)
update_prologue_cost (&prologue_cost, op);
- else
+ else if (!slp)
{
vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
ncopies, op,
@@ -8799,15 +8809,15 @@ vectorizable_store (vec_info *vinfo,
{
if (mask)
{
- vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
- mask, &vec_masks,
- mask_vectype);
+ if (slp_node)
+ vect_get_slp_defs (mask_node, &vec_masks);
+ else
+ vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
+ mask, &vec_masks,
+ mask_vectype);
vec_mask = vec_masks[0];
}
- /* We should have catched mismatched types earlier. */
- gcc_assert (
- useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
dataref_ptr
= vect_create_data_ref_ptr (vinfo, first_stmt_info,
aggr_type, NULL, offset, &dummy,
@@ -8819,10 +8829,16 @@ vectorizable_store (vec_info *vinfo,
gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
/* DR_CHAIN is then used as an input to
vect_permute_store_chain(). */
- for (i = 0; i < group_size; i++)
+ if (!slp)
{
- vec_oprnd = (*gvec_oprnds[i])[j];
- dr_chain[i] = vec_oprnd;
+ /* We should have caught mismatched types earlier. */
+ gcc_assert (
+ useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
+ for (i = 0; i < group_size; i++)
+ {
+ vec_oprnd = (*gvec_oprnds[i])[j];
+ dr_chain[i] = vec_oprnd;
+ }
}
if (mask)
vec_mask = vec_masks[j];
@@ -8832,12 +8848,12 @@ vectorizable_store (vec_info *vinfo,
if (costing_p)
{
- n_adjacent_stores += vec_num;
+ n_adjacent_stores += group_size;
continue;
}
/* Get an array into which we can store the individual vectors. */
- tree vec_array = create_vector_array (vectype, vec_num);
+ tree vec_array = create_vector_array (vectype, group_size);
/* Invalidate the current contents of VEC_ARRAY. This should
become an RTL clobber too, which prevents the vector registers
@@ -8845,9 +8861,13 @@ vectorizable_store (vec_info *vinfo,
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
/* Store the individual vectors into the array. */
- for (i = 0; i < vec_num; i++)
+ for (i = 0; i < group_size; i++)
{
- vec_oprnd = dr_chain[i];
+ if (slp)
+ vec_oprnd
+ = SLP_TREE_VEC_DEFS (SLP_TREE_CHILDREN (slp_node)[i])[j];
+ else
+ vec_oprnd = dr_chain[i];
write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
i);
}
@@ -8917,9 +8937,10 @@ vectorizable_store (vec_info *vinfo,
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- if (j == 0)
+ if (j == 0 && !slp)
*vec_stmt = new_stmt;
- STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ if (!slp)
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
if (costing_p)
@@ -10765,12 +10786,13 @@ vectorizable_load (vec_info *vinfo,
{
gcc_assert (alignment_support_scheme == dr_aligned
|| alignment_support_scheme == dr_unaligned_supported);
- gcc_assert (grouped_load && !slp);
unsigned int inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector loads, we'd like to cost with
the total number of them once instead of cost each one by one. */
unsigned int n_adjacent_loads = 0;
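+ /* Each copy emits one load-lanes from which VEC_NUM vectors are
+ extracted, so with SLP derive the number of copies from the
+ node's total number of vector defs.  */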
+ if (slp_node)
+ ncopies = slp_node->vec_stmts_size / vec_num;
for (j = 0; j < ncopies; j++)
{
if (costing_p)
@@ -10884,24 +10906,31 @@ vectorizable_load (vec_info *vinfo,
gimple_call_set_nothrow (call, true);
vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
- dr_chain.create (vec_num);
+ if (!slp)
+ dr_chain.create (vec_num);
/* Extract each vector into an SSA_NAME. */
for (i = 0; i < vec_num; i++)
{
new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
vec_array, i);
- dr_chain.quick_push (new_temp);
+ if (slp)
+ slp_node->push_vec_def (new_temp);
+ else
+ dr_chain.quick_push (new_temp);
}
- /* Record the mapping between SSA_NAMEs and statements. */
- vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
+ if (!slp)
+ /* Record the mapping between SSA_NAMEs and statements. */
+ vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- dr_chain.release ();
+ if (!slp)
+ dr_chain.release ();
- *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
if (costing_p)
@@ -222,6 +222,9 @@ struct _slp_tree {
unsigned int lanes;
/* The operation of this node. */
enum tree_code code;
+ /* Whether uses of this load or feeders of this store are suitable
+ for load/store-lanes. */
+ bool ldst_lanes;
int vertex;