new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[8*i+0] = b[s*i+0];
+ a[8*i+1] = b[s*i+1];
+ a[8*i+2] = b[s*i+2];
+ a[8*i+3] = b[s*i+3];
+ a[8*i+4] = b[s*i+4];
+ a[8*i+5] = b[s*i+5];
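+      /* The last two lanes reuse elements 4 and 5.  */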
+ a[8*i+6] = b[s*i+4];
+ a[8*i+7] = b[s*i+5];
+ }
+}
+
+/* Three two-element loads, two four-element stores. On ia32 we elide
+ a permute and perform a redundant load. */
+/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "movhps" 2 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movhps" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movups" 2 } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[4*i+0] = b[s*i+0];
+ a[4*i+1] = b[s*i+1];
+ a[4*i+2] = b[s*i+0];
+ a[4*i+3] = b[s*i+1];
+ }
+}
+
+/* One two-element load, one four-element store. */
+/* { dg-final { scan-assembler-times "movq" 1 } } */
+/* { dg-final { scan-assembler-times "movups" 1 } } */
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx -fno-tree-slp-vectorize" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+ if (s >= 6)
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[s*i+0] = b[4*i+0];
+ a[s*i+1] = b[4*i+1];
+ a[s*i+2] = b[4*i+2];
+ a[s*i+3] = b[4*i+3];
+ a[s*i+4] = b[4*i+0];
+ a[s*i+5] = b[4*i+1];
+ }
+}
+
+/* The vectorizer generates 6 uint64 stores per vector iteration,
+   emitted as 4 movq and 2 movhps.  */
+/* { dg-final { scan-assembler-times "movq" 4 } } */
+/* { dg-final { scan-assembler-times "movhps" 2 } } */
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.2 -mno-avx -fno-tree-slp-vectorize" } */
+
+void foo (int * __restrict a, int * __restrict b, int *c, int s)
+{
+ if (s >= 2)
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[s*i+0] = c[4*i+0];
+ a[s*i+1] = c[4*i+1];
+ b[s*i+0] = c[4*i+2];
+ b[s*i+1] = c[4*i+3];
+ }
+}
+
+/* With a vectorization factor of two we get two two-element stores
+   to a using movq and two two-element stores to b extracting the
+   high part via pextrq (movhps on ia32).  */
+/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */
@@ -2036,15 +2036,10 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
first_dr_info
= STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
if (STMT_VINFO_STRIDED_P (first_stmt_info))
- {
- /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
- separated by the stride, until we have a complete vector.
- Fall back to scalar accesses if that isn't possible. */
- if (multiple_p (nunits, group_size))
- *memory_access_type = VMAT_STRIDED_SLP;
- else
- *memory_access_type = VMAT_ELEMENTWISE;
- }
+ /* Try to use consecutive accesses of as many elements as possible,
+ separated by the stride, until we have a complete vector.
+ Fall back to scalar accesses if that isn't possible. */
+ *memory_access_type = VMAT_STRIDED_SLP;
else
{
int cmp = compare_step_with_zero (vinfo, stmt_info);
@@ -8512,12 +8507,29 @@ vectorizable_store (vec_info *vinfo,
tree lvectype = vectype;
if (slp)
{
- if (group_size < const_nunits
- && const_nunits % group_size == 0)
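+ /* Split the group into the largest equal pieces a single
+    contiguous access can cover; e.g. a group of 6 int elements
+    with V4SI gives pieces of 2.  */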
+ HOST_WIDE_INT n = gcd (group_size, const_nunits);
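+ /* If the group size is a multiple of the vector size use the
+    full vector type, provided the target supports the (mis)aligned
+    access.  */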
+ if (n == const_nunits)
{
- nstores = const_nunits / group_size;
- lnel = group_size;
- ltype = build_vector_type (elem_type, group_size);
+ int mis_align = dr_misalignment (first_dr_info, vectype);
+ dr_alignment_support dr_align
+ = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
+ mis_align);
+ if (dr_align == dr_aligned
+ || dr_align == dr_unaligned_supported)
+ {
+ nstores = 1;
+ lnel = const_nunits;
+ ltype = vectype;
+ lvectype = vectype;
+ alignment_support_scheme = dr_align;
+ misalignment = mis_align;
+ }
+ }
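+ /* Else store the group in pieces of N elements so no access
+    touches excess elements.  */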
+ else if (n > 1)
+ {
+ nstores = const_nunits / n;
+ lnel = n;
+ ltype = build_vector_type (elem_type, n);
lvectype = vectype;
/* First check if vec_extract optab doesn't support extraction
@@ -8526,7 +8538,7 @@ vectorizable_store (vec_info *vinfo,
machine_mode vmode;
if (!VECTOR_MODE_P (TYPE_MODE (vectype))
|| !related_vector_mode (TYPE_MODE (vectype), elmode,
- group_size).exists (&vmode)
+ n).exists (&vmode)
|| (convert_optab_handler (vec_extract_optab,
TYPE_MODE (vectype), vmode)
== CODE_FOR_nothing))
@@ -8537,8 +8549,8 @@ vectorizable_store (vec_info *vinfo,
re-interpreting it as the original vector type if
supported. */
unsigned lsize
- = group_size * GET_MODE_BITSIZE (elmode);
- unsigned int lnunits = const_nunits / group_size;
+ = n * GET_MODE_BITSIZE (elmode);
+ unsigned int lnunits = const_nunits / n;
/* If we can't construct such a vector fall back to
element extracts from the original vector type and
element size stores. */
@@ -8551,7 +8563,7 @@ vectorizable_store (vec_info *vinfo,
!= CODE_FOR_nothing))
{
nstores = lnunits;
- lnel = group_size;
+ lnel = n;
ltype = build_nonstandard_integer_type (lsize, 1);
lvectype = build_vector_type (ltype, nstores);
}
@@ -8562,24 +8574,6 @@ vectorizable_store (vec_info *vinfo,
issue exists here for reasonable archs. */
}
}
- else if (group_size >= const_nunits
- && group_size % const_nunits == 0)
- {
- int mis_align = dr_misalignment (first_dr_info, vectype);
- dr_alignment_support dr_align
- = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
- mis_align);
- if (dr_align == dr_aligned
- || dr_align == dr_unaligned_supported)
- {
- nstores = 1;
- lnel = const_nunits;
- ltype = vectype;
- lvectype = vectype;
- alignment_support_scheme = dr_align;
- misalignment = mis_align;
- }
- }
ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
}
@@ -10364,34 +10358,32 @@ vectorizable_load (vec_info *vinfo,
auto_vec<tree> dr_chain;
if (memory_access_type == VMAT_STRIDED_SLP)
{
- if (group_size < const_nunits)
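+ /* N consecutive elements can be accessed with a single load.  */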
+ HOST_WIDE_INT n = gcd (group_size, const_nunits);
+ /* Use the target vector type if the group size is a multiple
+    of its number of elements.  */
+ if (n == const_nunits)
+ {
+ nloads = 1;
+ lnel = const_nunits;
+ ltype = vectype;
+ }
+ /* Else use the biggest vector with which we can load the group
+    without accessing excess elements.  */
+ else if (n > 1)
{
- /* First check if vec_init optab supports construction from vector
- elts directly. Otherwise avoid emitting a constructor of
- vector elements by performing the loads using an integer type
- of the same size, constructing a vector of those and then
- re-interpreting it as the original vector type. This avoids a
- huge runtime penalty due to the general inability to perform
- store forwarding from smaller stores to a larger load. */
tree ptype;
tree vtype
- = vector_vector_composition_type (vectype,
- const_nunits / group_size,
+ = vector_vector_composition_type (vectype, const_nunits / n,
&ptype);
if (vtype != NULL_TREE)
{
- nloads = const_nunits / group_size;
- lnel = group_size;
+ nloads = const_nunits / n;
+ lnel = n;
lvectype = vtype;
ltype = ptype;
}
}
- else
- {
- nloads = 1;
- lnel = const_nunits;
- ltype = vectype;
- }
+ /* Else fall back to the default element-wise access. */
ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
}
/* Load vector(1) scalar_type if it's 1 element-wise vectype. */