new file mode 100644
@@ -0,0 +1,88 @@
+/* { dg-require-effective-target mmap } */
+
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define COUNT 511
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+#define TYPE unsigned char
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+void __attribute__((noipa)) foo(TYPE * __restrict x,
+                                TYPE *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[3*i+0];
+      x[16*i+1] = y[3*i+1];
+      x[16*i+2] = y[3*i+2];
+      x[16*i+3] = y[3*i+0];
+      x[16*i+4] = y[3*i+1];
+      x[16*i+5] = y[3*i+2];
+      x[16*i+6] = y[3*i+0];
+      x[16*i+7] = y[3*i+1];
+      x[16*i+8] = y[3*i+2];
+      x[16*i+9] = y[3*i+0];
+      x[16*i+10] = y[3*i+1];
+      x[16*i+11] = y[3*i+2];
+      x[16*i+12] = y[3*i+0];
+      x[16*i+13] = y[3*i+1];
+      x[16*i+14] = y[3*i+2];
+      x[16*i+15] = y[3*i+0];
+    }
+}
+
+void __attribute__((noipa)) bar(TYPE * __restrict x,
+                                TYPE *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[5*i+0];
+      x[16*i+1] = y[5*i+1];
+      x[16*i+2] = y[5*i+2];
+      x[16*i+3] = y[5*i+3];
+      x[16*i+4] = y[5*i+4];
+      x[16*i+5] = y[5*i+0];
+      x[16*i+6] = y[5*i+1];
+      x[16*i+7] = y[5*i+2];
+      x[16*i+8] = y[5*i+3];
+      x[16*i+9] = y[5*i+4];
+      x[16*i+10] = y[5*i+0];
+      x[16*i+11] = y[5*i+1];
+      x[16*i+12] = y[5*i+2];
+      x[16*i+13] = y[5*i+3];
+      x[16*i+14] = y[5*i+4];
+      x[16*i+15] = y[5*i+0];
+    }
+}
+
+TYPE x[COUNT * 16];
+
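+/* Map an anonymous region and point the loops' input at its tail so that
+   the last element read (y[3*COUNT-1] resp. y[5*COUNT-1]) is the final
+   byte of the mapping; a vectorized load that reads past the last input
+   element is then likely to fault.  */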
+int
+main (void)
+{
+  void *y;
+  TYPE *end_y;
+
+  y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (y == MAP_FAILED)
+    {
+      perror ("mmap");
+      return 1;
+    }
+
+  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
+
+  foo (x, end_y - COUNT * 3, COUNT);
+  bar (x, end_y - COUNT * 5, COUNT);
+
+  return 0;
+}
+
+/* We always require a scalar epilogue here, but we don't know which
+   targets support vector composition this way.  */
new file mode 100644
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse4.1 -mno-avx -fdump-tree-vect-details" } */
+
+void __attribute__((noipa)) foo(unsigned char * __restrict x,
+                                unsigned char *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[3*i+0];
+      x[16*i+1] = y[3*i+1];
+      x[16*i+2] = y[3*i+2];
+      x[16*i+3] = y[3*i+0];
+      x[16*i+4] = y[3*i+1];
+      x[16*i+5] = y[3*i+2];
+      x[16*i+6] = y[3*i+0];
+      x[16*i+7] = y[3*i+1];
+      x[16*i+8] = y[3*i+2];
+      x[16*i+9] = y[3*i+0];
+      x[16*i+10] = y[3*i+1];
+      x[16*i+11] = y[3*i+2];
+      x[16*i+12] = y[3*i+0];
+      x[16*i+13] = y[3*i+1];
+      x[16*i+14] = y[3*i+2];
+      x[16*i+15] = y[3*i+0];
+    }
+}
+
+void __attribute__((noipa)) bar(unsigned char * __restrict x,
+                                unsigned char *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[5*i+0];
+      x[16*i+1] = y[5*i+1];
+      x[16*i+2] = y[5*i+2];
+      x[16*i+3] = y[5*i+3];
+      x[16*i+4] = y[5*i+4];
+      x[16*i+5] = y[5*i+0];
+      x[16*i+6] = y[5*i+1];
+      x[16*i+7] = y[5*i+2];
+      x[16*i+8] = y[5*i+3];
+      x[16*i+9] = y[5*i+4];
+      x[16*i+10] = y[5*i+0];
+      x[16*i+11] = y[5*i+1];
+      x[16*i+12] = y[5*i+2];
+      x[16*i+13] = y[5*i+3];
+      x[16*i+14] = y[5*i+4];
+      x[16*i+15] = y[5*i+0];
+    }
+}
+
+/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect"} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect"} } */
@@ -2151,11 +2151,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                                     nunits, &tem, &remain)
                   || maybe_lt (remain + group_size, nunits)))
             {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "peeling for gaps insufficient for "
-                                 "access\n");
-              return false;
+              /* But peeling a single scalar iteration is enough if
+                 we can use the next power-of-two sized partial
+                 access.  */
+              unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
+              if (!nunits.is_constant (&cnunits)
+                  || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
+                  || ((cremain = remain.to_constant (), true)
+                      && ((cpart_size = (1 << ceil_log2 (cremain))) != cnunits)
+                      && vector_vector_composition_type
+                           (vectype, cnunits / cpart_size,
+                            &half_vtype) == NULL_TREE))
+                {
+                  if (dump_enabled_p ())
+                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                     "peeling for gaps insufficient for "
+                                     "access\n");
+                  return false;
+                }
             }
           /* If this is single-element interleaving with an element
@@ -11597,6 +11610,27 @@ vectorizable_load (vec_info *vinfo,
                          gcc_assert (new_vtype
                                      || LOOP_VINFO_PEELING_FOR_GAPS
                                           (loop_vinfo));
+                          /* But still reduce the access size to the next
+                             required power-of-two so peeling a single
+                             scalar iteration is sufficient.  */
+                          unsigned HOST_WIDE_INT cremain;
+                          if (remain.is_constant (&cremain))
+                            {
+                              unsigned HOST_WIDE_INT cpart_size
+                                = 1 << ceil_log2 (cremain);
+                              if (known_gt (nunits, cpart_size)
+                                  && constant_multiple_p (nunits, cpart_size,
+                                                          &num))
+                                {
+                                  tree ptype;
+                                  new_vtype
+                                    = vector_vector_composition_type (vectype,
+                                                                      num,
+                                                                      &ptype);
+                                  if (new_vtype)
+                                    ltype = ptype;
+                                }
+                            }
                        }
                    }
                  tree offset
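
Illustration (not part of the patch): the sketch below models, on plain
constants, the size-reduction arithmetic both hunks perform: round the number
of elements still needed by the last vector access up to the next power of
two and, when that is smaller than the vector element count and divides it
evenly, compose the access from that many smaller pieces.  The hand-rolled
ceil_log2_u stands in for GCC's ceil_log2 and plain unsigned integers stand
in for poly_uint64 values; the inputs merely mirror the V16QI cases exercised
by the testcases above.

/* Standalone sketch; not GCC code.  */
#include <stdio.h>

/* Stand-in for GCC's ceil_log2: smallest e with (1u << e) >= x, for x > 0.  */
static unsigned
ceil_log2_u (unsigned x)
{
  unsigned e = 0;
  while ((1u << e) < x)
    ++e;
  return e;
}

int
main (void)
{
  unsigned nunits = 16;                  /* vector element count, e.g. V16QI  */
  unsigned remains[] = { 3, 5, 6, 16 };  /* elements needed by the last access  */

  for (unsigned i = 0; i < sizeof remains / sizeof remains[0]; ++i)
    {
      unsigned remain = remains[i];
      unsigned part_size = 1u << ceil_log2_u (remain);
      if (part_size != nunits && nunits % part_size == 0)
        printf ("remain=%2u -> piece size %2u, %u pieces composed\n",
                remain, part_size, nunits / part_size);
      else
        printf ("remain=%2u -> keep the full-vector access\n", remain);
    }
  return 0;
}

In the real code the is_constant, constant_multiple_p and
vector_vector_composition_type checks decide whether the target can actually
compose a full vector from pieces of that size; the sketch only shows the
arithmetic.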