new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+
+/* PR82255: Ensure we don't require a vec_construct cost when we aren't
+ going to generate a strided load. */
+
+extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__))
+__attribute__ ((__const__));
+
+static int
+foo (unsigned char *w, int i, unsigned char *x, int j)
+{
+ int tot = 0;
+ for (int a = 0; a < 16; a++)
+ {
+#pragma GCC unroll 16
+ for (int b = 0; b < 16; b++)
+ tot += abs (w[b] - x[b]);
+ w += i;
+ x += j;
+ }
+ return tot;
+}
+
+void
+bar (unsigned char *w, unsigned char *x, int i, int *result)
+{
+ *result = foo (w, 16, x, i);
+}
+
+/* { dg-final { scan-tree-dump-times "vec_construct" 0 "vect" } } */
@@ -897,6 +897,146 @@ cfun_returns (tree decl)
return false;
}
+/* Function VECTOR_VECTOR_COMPOSITION_TYPE
+
+ This function returns a vector type which can be composed with NELTS pieces,
+ whose type is recorded in PTYPE.  VTYPE should be a vector type with the
+ same vector size as the returned vector.  It first checks whether the
+ target supports a pieces-size vector mode for construction; if not, it
+ then checks a pieces-size scalar mode for construction.  It returns
+ NULL_TREE if no usable composition can be found.
+
+ For example, for (vtype=V16QI, nelts=4), we can probably get:
+ - V16QI with PTYPE V4QI.
+ - V4SI with PTYPE SI.
+ - NULL_TREE. */
+
+static tree
+vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
+{
+ gcc_assert (VECTOR_TYPE_P (vtype));
+ gcc_assert (known_gt (nelts, 0U));
+
+ machine_mode vmode = TYPE_MODE (vtype);
+ if (!VECTOR_MODE_P (vmode))
+ return NULL_TREE;
+
+ poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
+ unsigned int pbsize;
+ if (constant_multiple_p (vbsize, nelts, &pbsize))
+ {
+ /* First check if vec_init optab supports construction from
+ vector pieces directly. */
+ scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
+ poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
+ machine_mode rmode;
+ if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
+ && (convert_optab_handler (vec_init_optab, vmode, rmode)
+ != CODE_FOR_nothing))
+ {
+ *ptype = build_vector_type (TREE_TYPE (vtype), inelts);
+ return vtype;
+ }
+
+ /* Otherwise check if exists an integer type of the same piece size and
+ if vec_init optab supports construction from it directly. */
+ if (int_mode_for_size (pbsize, 0).exists (&elmode)
+ && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
+ && (convert_optab_handler (vec_init_optab, rmode, elmode)
+ != CODE_FOR_nothing))
+ {
+ *ptype = build_nonstandard_integer_type (pbsize, 1);
+ return build_vector_type (*ptype, nelts);
+ }
+ }
+
+ return NULL_TREE;
+}
+
+/* Hold information for VMAT_ELEMENTWISE or VMAT_STRIDED_SLP strided
+ loads in function vectorizable_load. */
+struct strided_load_info {
+ /* Number of loads required. */
+ int nloads;
+ /* Number of vector unit advanced for each load. */
+ int lnel;
+ /* Access type for each load. */
+ tree ltype;
+ /* Vector type used for possible vector construction. */
+ tree lvectype;
+};
+
+/* Determine how we perform VMAT_ELEMENTWISE or VMAT_STRIDED_SLP loads by
+   considering the number of vector units, the group size and the target's
+   support for vector construction; return the strided_load_info information.
+   ACCESS_TYPE indicates the memory access type, VECTYPE indicates the vector
+   type, and GROUP_SIZE indicates the group size for a grouped access. */
+
+static strided_load_info
+vect_get_strided_load_info (vect_memory_access_type access_type, tree vectype,
+ int group_size)
+{
+ int nloads, lnel;
+ tree ltype, lvectype;
+
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ /* Checked by get_load_store_type. */
+ unsigned int const_nunits = nunits.to_constant ();
+
+ if (access_type == VMAT_STRIDED_SLP)
+ {
+ if ((unsigned int) group_size < const_nunits)
+ {
+ /* First check if vec_init optab supports construction from vector
+ elts directly. Otherwise avoid emitting a constructor of
+ vector elements by performing the loads using an integer type
+ of the same size, constructing a vector of those and then
+ re-interpreting it as the original vector type. This avoids a
+ huge runtime penalty due to the general inability to perform
+ store forwarding from smaller stores to a larger load. */
+ tree ptype;
+ unsigned int nelts = const_nunits / group_size;
+ tree vtype = vector_vector_composition_type (vectype, nelts, &ptype);
+ if (vtype != NULL_TREE)
+ {
+ nloads = nelts;
+ lnel = group_size;
+ ltype = ptype;
+ lvectype = vtype;
+ }
+ else
+ {
+ nloads = const_nunits;
+ lnel = 1;
+ ltype = TREE_TYPE (vectype);
+ lvectype = vectype;
+ }
+ }
+ else
+ {
+ nloads = 1;
+ lnel = const_nunits;
+ ltype = vectype;
+ lvectype = vectype;
+ }
+ ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
+ }
+ else
+ {
+ gcc_assert (access_type == VMAT_ELEMENTWISE);
+ nloads = const_nunits;
+ lnel = 1;
+ /* Load vector(1) scalar_type if it's 1 element-wise vectype. */
+ if (nloads == 1)
+ ltype = vectype;
+ else
+ ltype = TREE_TYPE (vectype);
+ lvectype = vectype;
+ }
+
+ return {nloads, lnel, ltype, lvectype};
+}
+
/* Function vect_model_store_cost
Models cost for stores. In the case of grouped accesses, one access
@@ -1157,8 +1297,22 @@ vect_model_load_cost (vec_info *vinfo,
cost_vec, cost_vec, true);
if (memory_access_type == VMAT_ELEMENTWISE
|| memory_access_type == VMAT_STRIDED_SLP)
- inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
- stmt_info, 0, vect_body);
+ {
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ int group_size = 1;
+ if (slp_node && grouped_access_p)
+ {
+ first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ group_size = DR_GROUP_SIZE (first_stmt_info);
+ }
+ strided_load_info
+ info = vect_get_strided_load_info (memory_access_type,
+ vectype, group_size);
+ /* A vec_construct cost is only required when more than one load is
+	 needed, i.e. when the pieces must be assembled into a vector.  */
+ if (info.nloads > 1)
+ inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
+ stmt_info, 0, vect_body);
+ }
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@@ -1996,62 +2150,6 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
gcc_unreachable ();
}
-/* Function VECTOR_VECTOR_COMPOSITION_TYPE
-
- This function returns a vector type which can be composed with NETLS pieces,
- whose type is recorded in PTYPE. VTYPE should be a vector type, and has the
- same vector size as the return vector. It checks target whether supports
- pieces-size vector mode for construction firstly, if target fails to, check
- pieces-size scalar mode for construction further. It returns NULL_TREE if
- fails to find the available composition.
-
- For example, for (vtype=V16QI, nelts=4), we can probably get:
- - V16QI with PTYPE V4QI.
- - V4SI with PTYPE SI.
- - NULL_TREE. */
-
-static tree
-vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
-{
- gcc_assert (VECTOR_TYPE_P (vtype));
- gcc_assert (known_gt (nelts, 0U));
-
- machine_mode vmode = TYPE_MODE (vtype);
- if (!VECTOR_MODE_P (vmode))
- return NULL_TREE;
-
- poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
- unsigned int pbsize;
- if (constant_multiple_p (vbsize, nelts, &pbsize))
- {
- /* First check if vec_init optab supports construction from
- vector pieces directly. */
- scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
- poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
- machine_mode rmode;
- if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
- && (convert_optab_handler (vec_init_optab, vmode, rmode)
- != CODE_FOR_nothing))
- {
- *ptype = build_vector_type (TREE_TYPE (vtype), inelts);
- return vtype;
- }
-
- /* Otherwise check if exists an integer type of the same piece size and
- if vec_init optab supports construction from it directly. */
- if (int_mode_for_size (pbsize, 0).exists (&elmode)
- && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
- && (convert_optab_handler (vec_init_optab, rmode, elmode)
- != CODE_FOR_nothing))
- {
- *ptype = build_nonstandard_integer_type (pbsize, 1);
- return build_vector_type (*ptype, nelts);
- }
- }
-
- return NULL_TREE;
-}
-
/* A subroutine of get_load_store_type, with a subset of the same
arguments. Handle the case where STMT_INFO is part of a grouped load
or store.
@@ -8745,49 +8843,7 @@ vectorizable_load (vec_info *vinfo,
stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
- running_off = offvar;
- alias_off = build_int_cst (ref_type, 0);
- int nloads = const_nunits;
- int lnel = 1;
- tree ltype = TREE_TYPE (vectype);
- tree lvectype = vectype;
auto_vec<tree> dr_chain;
- if (memory_access_type == VMAT_STRIDED_SLP)
- {
- if (group_size < const_nunits)
- {
- /* First check if vec_init optab supports construction from vector
- elts directly. Otherwise avoid emitting a constructor of
- vector elements by performing the loads using an integer type
- of the same size, constructing a vector of those and then
- re-interpreting it as the original vector type. This avoids a
- huge runtime penalty due to the general inability to perform
- store forwarding from smaller stores to a larger load. */
- tree ptype;
- tree vtype
- = vector_vector_composition_type (vectype,
- const_nunits / group_size,
- &ptype);
- if (vtype != NULL_TREE)
- {
- nloads = const_nunits / group_size;
- lnel = group_size;
- lvectype = vtype;
- ltype = ptype;
- }
- }
- else
- {
- nloads = 1;
- lnel = const_nunits;
- ltype = vectype;
- }
- ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
- }
- /* Load vector(1) scalar_type if it's 1 element-wise vectype. */
- else if (nloads == 1)
- ltype = vectype;
-
if (slp)
{
/* For SLP permutation support we need to load the whole group,
@@ -8804,6 +8860,13 @@ vectorizable_load (vec_info *vinfo,
else
ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
}
+
+ running_off = offvar;
+ alias_off = build_int_cst (ref_type, 0);
+ strided_load_info
+ sload_info = vect_get_strided_load_info (memory_access_type, vectype, group_size);
+ int nloads = sload_info.nloads;
+ tree ltype = sload_info.ltype;
unsigned int group_el = 0;
unsigned HOST_WIDE_INT
elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
@@ -8824,7 +8887,7 @@ vectorizable_load (vec_info *vinfo,
CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
gimple_assign_lhs (new_stmt));
- group_el += lnel;
+ group_el += sload_info.lnel;
if (! slp
|| group_el == group_size)
{
@@ -8839,11 +8902,11 @@ vectorizable_load (vec_info *vinfo,
}
if (nloads > 1)
{
- tree vec_inv = build_constructor (lvectype, v);
- new_temp = vect_init_vector (vinfo, stmt_info,
- vec_inv, lvectype, gsi);
+ tree vec_inv = build_constructor (sload_info.lvectype, v);
+ new_temp = vect_init_vector (vinfo, stmt_info, vec_inv,
+ sload_info.lvectype, gsi);
new_stmt = SSA_NAME_DEF_STMT (new_temp);
- if (lvectype != vectype)
+ if (sload_info.lvectype != vectype)
{
new_stmt = gimple_build_assign (make_ssa_name (vectype),
VIEW_CONVERT_EXPR,