new file mode 100644
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_int_mult } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+void foo (int * __restrict a, int *b, int *c)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[2*i] = b[i] + 7;
+ a[2*i+1] = c[i] * 3;
+ }
+}
+
+int bar (int *b)
+{
+ int res = 0;
+ for (int i = 0; i < 1024; ++i)
+ {
+ res += b[2*i] + 7;
+ res += b[2*i+1] * 3;
+ }
+ return res;
+}
+
+void baz (int * __restrict a, int *b)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[2*i] = b[2*i] + 7;
+ a[2*i+1] = b[2*i+1] * 3;
+ }
+}
+
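+/* foo should be vectorized using a single store-lanes store, bar using a
+   single load-lanes load and baz using one of each, which accounts for the
+   two LOAD_LANES and two STORE_LANES instances checked for below on targets
+   with load/store-lanes support.  */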
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "LOAD_LANES" 2 "optimized" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "STORE_LANES" 2 "optimized" { target vect_load_lanes } } } */
new file mode 100644
@@ -0,0 +1,51 @@
+#include "tree-vect.h"
+
+/* This is a load-lane / masked-store-lane test that more reliably
+   triggers SLP than SVE's mask_struct_store_*.c.  */
+
+void __attribute__ ((noipa))
+test4 (int *__restrict dest, int *__restrict src,
+ int *__restrict cond, int bias, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ int value0 = src[i * 4] + bias;
+ int value1 = src[i * 4 + 1] * bias;
+ int value2 = src[i * 4 + 2] + bias;
+ int value3 = src[i * 4 + 3] * bias;
+ if (cond[i])
+ {
+ dest[i * 4] = value0;
+ dest[i * 4 + 1] = value1;
+ dest[i * 4 + 2] = value2;
+ dest[i * 4 + 3] = value3;
+ }
+ }
+}
+
+int dest[16*4];
+int src[16*4];
+int cond[16];
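+/* Only groups with odd i are stored: with src[i] == i and bias == 5 the
+   expected values are dest[4*i] = 4*i + 5, dest[4*i+1] = (4*i + 1) * 5,
+   dest[4*i+2] = 4*i + 7 and dest[4*i+3] = (4*i + 3) * 5; all other
+   elements stay zero.  */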
+const int dest_chk[16*4] = {0, 0, 0, 0, 9, 25, 11, 35, 0, 0, 0, 0, 17, 65, 19,
+ 75, 0, 0, 0, 0, 25, 105, 27, 115, 0, 0, 0, 0, 33, 145, 35, 155, 0, 0, 0,
+ 0, 41, 185, 43, 195, 0, 0, 0, 0, 49, 225, 51, 235, 0, 0, 0, 0, 57, 265, 59,
+ 275, 0, 0, 0, 0, 65, 305, 67, 315};
+
+int main()
+{
+ check_vect ();
+#pragma GCC novector
+ for (int i = 0; i < 16; ++i)
+ cond[i] = i & 1;
+#pragma GCC novector
+ for (int i = 0; i < 16 * 4; ++i)
+ src[i] = i;
+ test4 (dest, src, cond, 5, 16);
+#pragma GCC novector
+ for (int i = 0; i < 16 * 4; ++i)
+ if (dest[i] != dest_chk[i])
+ abort ();
+ return 0;
+}
+
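+/* The masked grouped store in test4 should be vectorized via a (masked)
+   store-lanes access; the scan is restricted to variable-length vector
+   targets with load/store-lanes support.  */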
+/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target { vect_variable_length && vect_load_lanes } } } } */
@@ -120,6 +120,7 @@ _slp_tree::_slp_tree ()
SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
SLP_TREE_CODE (this) = ERROR_MARK;
+ this->ldst_lanes = false;
SLP_TREE_VECTYPE (this) = NULL_TREE;
SLP_TREE_REPRESENTATIVE (this) = NULL;
SLP_TREE_REF_COUNT (this) = 1;
@@ -3902,10 +3903,28 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split the RHS into arbitrary pieces of
size >= 1. */
else if (is_a <loop_vec_info> (vinfo)
- && (i > 0 && i < group_size)
- && !vect_slp_prefer_store_lanes_p (vinfo,
- stmt_info, group_size, i))
- {
+ && (i > 0 && i < group_size))
+ {
+      /* There are targets that cannot do even/odd interleaving schemes
+         and thus absolutely need to use load/store-lanes.  For now
+         force single-lane SLP for them - they would be happy with
+         uniform power-of-two sized lanes (depending on the element
+         size), but even if we could use 'i' as an indicator we would
+         need to backtrack when later lanes fail to be discovered with
+         the same granularity.  We cannot turn any strided or scatter
+         store into a store-lanes access.  */
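+      /* For example, a two-lane interleaved store group
+         { a[2*i], a[2*i+1] } is then built as two single-lane RHS
+         nodes feeding a single store-lanes root node below.  */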
+      /* ??? If this is not in sync with what get_load_store_type
+         later decides, the SLP representation is not good for other
+         store vectorization methods.  */
+ bool want_store_lanes
+ = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+ && ! STMT_VINFO_STRIDED_P (stmt_info)
+ && compare_step_with_zero (vinfo, stmt_info) > 0
+ && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
+ group_size, 1));
+ if (want_store_lanes)
+ i = 1;
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
@@ -3939,7 +3958,10 @@ vect_build_slp_instance (vec_info *vinfo,
(max_nunits, end - start));
rhs_nodes.safe_push (node);
start = end;
- end = group_size;
+ if (want_store_lanes)
+ end = start + 1;
+ else
+ end = group_size;
}
else
{
@@ -3973,7 +3995,31 @@ vect_build_slp_instance (vec_info *vinfo,
}
/* Now we assume we can build the root SLP node from all stores. */
- node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
+ if (want_store_lanes)
+ {
+ /* For store-lanes feed the store node with all RHS nodes
+ in order. */
+ node = vect_create_new_slp_node (scalar_stmts,
+ SLP_TREE_CHILDREN
+ (rhs_nodes[0]).length ());
+ SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ node->ldst_lanes = true;
+ SLP_TREE_CHILDREN (node)
+ .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
+ + rhs_nodes.length () - 1);
+ /* First store value and possibly mask. */
+ SLP_TREE_CHILDREN (node)
+ .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
+      /* The rest of the store values.  All mask nodes are the same;
+         this should be guaranteed by dataref group discovery.  */
+ for (unsigned j = 1; j < rhs_nodes.length (); ++j)
+ SLP_TREE_CHILDREN (node)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
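+      /* The children are now { value0, mask (if present), value1,
+         value2, ... }; vectorizable_store relies on this layout when
+         looking up the vector def for each lane.  */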
+ for (slp_tree child : SLP_TREE_CHILDREN (node))
+ child->refcnt++;
+ }
+ else
+ node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
while (!rhs_nodes.is_empty ())
vect_free_slp_tree (rhs_nodes.pop ());
@@ -4189,6 +4235,44 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
return;
+ /* Verify if all load permutations can be implemented with a suitably
+ large element load-lanes operation. */
+ unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
+ if (STMT_VINFO_STRIDED_P (first)
+ || compare_step_with_zero (loop_vinfo, first) <= 0
+ || exact_log2 (ld_lanes_lanes) == -1
+ /* ??? For now only support the single-lane case as there is
+ missing support on the store-lane side and code generation
+ isn't up to the task yet. */
+ || ld_lanes_lanes != 1
+ || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
+ group_lanes / ld_lanes_lanes,
+ false) == IFN_LAST)
+ ld_lanes_lanes = 0;
+ else
+ /* Verify the loads access the same number of lanes aligned to
+ ld_lanes_lanes. */
+ for (slp_tree load : loads)
+ {
+ if (SLP_TREE_LANES (load) != ld_lanes_lanes)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
+ if (first % ld_lanes_lanes != 0)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
+ if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ }
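+  /* If ld_lanes_lanes is still non-zero here each load covers
+     ld_lanes_lanes consecutive lanes starting at a multiple of
+     ld_lanes_lanes, so the whole group can be implemented with a
+     load-lanes access and each individual load becomes a selecting
+     lane permute.  */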
+
for (slp_tree load : loads)
{
/* Leave masked or gather loads alone for now. */
@@ -4203,7 +4287,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
with a non-1:1 load permutation around instead of canonicalizing
those into a load and a permute node. Removing this early
check would do such canonicalization. */
- if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+ if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
+ && ld_lanes_lanes == 0)
continue;
/* First build (and possibly re-use) a load node for the
@@ -4236,10 +4321,20 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
final_perm.quick_push
(std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+ if (ld_lanes_lanes != 0)
+ {
+      /* ??? If this is not in sync with what get_load_store_type
+         later decides, the SLP representation is not good for other
+         load vectorization methods.  */
+ l0->ldst_lanes = true;
+ load->ldst_lanes = true;
+ }
+
while (1)
{
unsigned group_lanes = SLP_TREE_LANES (l0);
- if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+ if (ld_lanes_lanes != 0
+ || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
break;
/* Try to lower by reducing the group to half its size using an
@@ -9874,6 +9969,28 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
gcc_assert (perm.length () == SLP_TREE_LANES (node));
+  /* Load-lanes permute.  This permute only acts as a forwarder to
+     select the correct vector def of the load-lanes load, which has
+     the permuted vectors among its vector defs like
+     { v0, w0, r0, v1, w1, r1, ... } for a ld3.  */
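+  /* For a single-lane SLP node selecting lane 1 of such a ld3 this
+     simply forwards the child defs at indices 1, 1 + 3, 1 + 6, ...  */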
+ if (node->ldst_lanes)
+ {
+ gcc_assert (children.length () == 1);
+ if (!gsi)
+ /* This is a trivial op always supported. */
+ return 1;
+ slp_tree child = children[0];
+ unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
+ / SLP_TREE_LANES (node));
+ unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
+ for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
+ {
+ tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
+ node->push_vec_def (def);
+ }
+ return 1;
+ }
+
/* REPEATING_P is true if every output vector is guaranteed to use the
same permute vector. We can handle that case for both variable-length
and constant-length vectors, but we only handle other cases for
@@ -1508,7 +1508,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
unsigned int nvectors;
if (slp_node)
- nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+    /* ??? Incorrect when each load/store lane is itself multi-lane.  */
+ nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
else
nvectors = vect_get_num_copies (loop_vinfo, vectype);
@@ -1794,7 +1795,7 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
elements with a known constant step. Return -1 if that step
is negative, 0 if it is zero, and 1 if it is greater than zero. */
-static int
+int
compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
{
dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
@@ -2069,6 +2070,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
is irrelevant for them. */
*alignment_support_scheme = dr_unaligned_supported;
}
+  /* Try using LOAD/STORE_LANES if SLP discovery decided on a
+     load/store-lanes scheme for this access.  */
+ else if (slp_node->ldst_lanes
+ && (*lanes_ifn
+ = (vls_type == VLS_LOAD
+ ? vect_load_lanes_supported (vectype, group_size, masked_p)
+ : vect_store_lanes_supported (vectype, group_size,
+ masked_p))) != IFN_LAST)
+ *memory_access_type = VMAT_LOAD_STORE_LANES;
else
*memory_access_type = VMAT_CONTIGUOUS;
@@ -8199,6 +8208,16 @@ vectorizable_store (vec_info *vinfo,
&lanes_ifn))
return false;
+ if (slp_node
+ && slp_node->ldst_lanes
+ && memory_access_type != VMAT_LOAD_STORE_LANES)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "discovered store-lane but cannot use it.\n");
+ return false;
+ }
+
if (mask)
{
if (memory_access_type == VMAT_CONTIGUOUS)
@@ -8715,7 +8734,7 @@ vectorizable_store (vec_info *vinfo,
else
{
if (memory_access_type == VMAT_LOAD_STORE_LANES)
- aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
else
aggr_type = vectype;
bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
@@ -8772,11 +8791,24 @@ vectorizable_store (vec_info *vinfo,
if (memory_access_type == VMAT_LOAD_STORE_LANES)
{
- gcc_assert (!slp && grouped_store);
+ if (costing_p && slp_node)
+        /* Update all incoming store operand nodes; the general handling
+           above only handles the mask and the first store operand node.  */
+ for (slp_tree child : SLP_TREE_CHILDREN (slp_node))
+ if (child != mask_node
+ && !vect_maybe_update_slp_op_vectype (child, vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
unsigned inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector stores, we'd like to cost with
the total number of them once instead of cost each one by one. */
unsigned int n_adjacent_stores = 0;
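+      /* The store SLP node counts one vector stmt per lane and copy,
+         so the number of store-lanes instances is its number of vector
+         stmts divided by the group size.  */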
+ if (slp)
+ ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
for (j = 0; j < ncopies; j++)
{
gimple *new_stmt;
@@ -8794,7 +8826,7 @@ vectorizable_store (vec_info *vinfo,
op = vect_get_store_rhs (next_stmt_info);
if (costing_p)
update_prologue_cost (&prologue_cost, op);
- else
+ else if (!slp)
{
vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
ncopies, op,
@@ -8809,15 +8841,15 @@ vectorizable_store (vec_info *vinfo,
{
if (mask)
{
- vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
- mask, &vec_masks,
- mask_vectype);
+ if (slp_node)
+ vect_get_slp_defs (mask_node, &vec_masks);
+ else
+ vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
+ mask, &vec_masks,
+ mask_vectype);
vec_mask = vec_masks[0];
}
- /* We should have catched mismatched types earlier. */
- gcc_assert (
- useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
dataref_ptr
= vect_create_data_ref_ptr (vinfo, first_stmt_info,
aggr_type, NULL, offset, &dummy,
@@ -8829,10 +8861,16 @@ vectorizable_store (vec_info *vinfo,
gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
/* DR_CHAIN is then used as an input to
vect_permute_store_chain(). */
- for (i = 0; i < group_size; i++)
+ if (!slp)
{
- vec_oprnd = (*gvec_oprnds[i])[j];
- dr_chain[i] = vec_oprnd;
+ /* We should have caught mismatched types earlier. */
+ gcc_assert (
+ useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
+ for (i = 0; i < group_size; i++)
+ {
+ vec_oprnd = (*gvec_oprnds[i])[j];
+ dr_chain[i] = vec_oprnd;
+ }
}
if (mask)
vec_mask = vec_masks[j];
@@ -8842,12 +8880,12 @@ vectorizable_store (vec_info *vinfo,
if (costing_p)
{
- n_adjacent_stores += vec_num;
+ n_adjacent_stores += group_size;
continue;
}
/* Get an array into which we can store the individual vectors. */
- tree vec_array = create_vector_array (vectype, vec_num);
+ tree vec_array = create_vector_array (vectype, group_size);
/* Invalidate the current contents of VEC_ARRAY. This should
become an RTL clobber too, which prevents the vector registers
@@ -8855,9 +8893,19 @@ vectorizable_store (vec_info *vinfo,
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
/* Store the individual vectors into the array. */
- for (i = 0; i < vec_num; i++)
+ for (i = 0; i < group_size; i++)
{
- vec_oprnd = dr_chain[i];
+ if (slp)
+ {
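+                  /* The store value for lane I is SLP child I, or child
+                     I + 1 when a mask node is present at index 1.  */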
+ slp_tree child;
+ if (i == 0 || !mask_node)
+ child = SLP_TREE_CHILDREN (slp_node)[i];
+ else
+ child = SLP_TREE_CHILDREN (slp_node)[i + 1];
+ vec_oprnd = SLP_TREE_VEC_DEFS (child)[j];
+ }
+ else
+ vec_oprnd = dr_chain[i];
write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
i);
}
@@ -8927,9 +8975,10 @@ vectorizable_store (vec_info *vinfo,
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- if (j == 0)
+ if (j == 0 && !slp)
*vec_stmt = new_stmt;
- STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ if (!slp)
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
if (costing_p)
@@ -10033,6 +10082,16 @@ vectorizable_load (vec_info *vinfo,
&lanes_ifn))
return false;
+ if (slp_node
+ && slp_node->ldst_lanes
+ && memory_access_type != VMAT_LOAD_STORE_LANES)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "discovered load-lane but cannot use it.\n");
+ return false;
+ }
+
if (mask)
{
if (memory_access_type == VMAT_CONTIGUOUS)
@@ -10775,12 +10834,13 @@ vectorizable_load (vec_info *vinfo,
{
gcc_assert (alignment_support_scheme == dr_aligned
|| alignment_support_scheme == dr_unaligned_supported);
- gcc_assert (grouped_load && !slp);
unsigned int inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector loads, we'd like to cost with
the total number of them once instead of cost each one by one. */
unsigned int n_adjacent_loads = 0;
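+      /* With SLP each load-lanes instance fills group_size vector defs
+         of the load SLP node, so the number of instances is its total
+         number of vector stmts divided by the group size.  */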
+ if (slp_node)
+ ncopies = slp_node->vec_stmts_size / group_size;
for (j = 0; j < ncopies; j++)
{
if (costing_p)
@@ -10831,7 +10891,7 @@ vectorizable_load (vec_info *vinfo,
if (mask)
vec_mask = vec_masks[j];
- tree vec_array = create_vector_array (vectype, vec_num);
+ tree vec_array = create_vector_array (vectype, group_size);
tree final_mask = NULL_TREE;
tree final_len = NULL_TREE;
@@ -10894,24 +10954,31 @@ vectorizable_load (vec_info *vinfo,
gimple_call_set_nothrow (call, true);
vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
- dr_chain.create (vec_num);
+ if (!slp)
+ dr_chain.create (group_size);
/* Extract each vector into an SSA_NAME. */
- for (i = 0; i < vec_num; i++)
+ for (unsigned i = 0; i < group_size; i++)
{
new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
vec_array, i);
- dr_chain.quick_push (new_temp);
+ if (slp)
+ slp_node->push_vec_def (new_temp);
+ else
+ dr_chain.quick_push (new_temp);
}
- /* Record the mapping between SSA_NAMEs and statements. */
- vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
+ if (!slp)
+ /* Record the mapping between SSA_NAMEs and statements. */
+ vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- dr_chain.release ();
+ if (!slp)
+ dr_chain.release ();
- *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
if (costing_p)
@@ -222,6 +222,9 @@ struct _slp_tree {
unsigned int lanes;
/* The operation of this node. */
enum tree_code code;
+ /* Whether uses of this load or feeders of this store are suitable
+ for load/store-lanes. */
+ bool ldst_lanes;
int vertex;
@@ -2313,6 +2316,7 @@ extern bool supportable_indirect_convert_operation (code_helper,
tree, tree,
vec<std::pair<tree, tree_code> > *,
tree = NULL_TREE);
+extern int compare_step_with_zero (vec_info *, stmt_vec_info);
extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
enum vect_cost_for_stmt, stmt_vec_info,