new file mode 100644
@@ -0,0 +1,88 @@
+/* { dg-require-effective-target mmap } */
+
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define COUNT 511
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+#define TYPE unsigned char
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+void __attribute__((noipa)) foo(TYPE * __restrict x,
+ TYPE *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[16*i+0] = y[3*i+0];
+ x[16*i+1] = y[3*i+1];
+ x[16*i+2] = y[3*i+2];
+ x[16*i+3] = y[3*i+0];
+ x[16*i+4] = y[3*i+1];
+ x[16*i+5] = y[3*i+2];
+ x[16*i+6] = y[3*i+0];
+ x[16*i+7] = y[3*i+1];
+ x[16*i+8] = y[3*i+2];
+ x[16*i+9] = y[3*i+0];
+ x[16*i+10] = y[3*i+1];
+ x[16*i+11] = y[3*i+2];
+ x[16*i+12] = y[3*i+0];
+ x[16*i+13] = y[3*i+1];
+ x[16*i+14] = y[3*i+2];
+ x[16*i+15] = y[3*i+0];
+ }
+}
+
+void __attribute__((noipa)) bar(TYPE * __restrict x,
+ TYPE *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[16*i+0] = y[5*i+0];
+ x[16*i+1] = y[5*i+1];
+ x[16*i+2] = y[5*i+2];
+ x[16*i+3] = y[5*i+3];
+ x[16*i+4] = y[5*i+4];
+ x[16*i+5] = y[5*i+0];
+ x[16*i+6] = y[5*i+1];
+ x[16*i+7] = y[5*i+2];
+ x[16*i+8] = y[5*i+3];
+ x[16*i+9] = y[5*i+4];
+ x[16*i+10] = y[5*i+0];
+ x[16*i+11] = y[5*i+1];
+ x[16*i+12] = y[5*i+2];
+ x[16*i+13] = y[5*i+3];
+ x[16*i+14] = y[5*i+4];
+ x[16*i+15] = y[5*i+0];
+ }
+}
+
+TYPE x[COUNT * 16];
+
+int
+main (void)
+{
+ void *y;
+ TYPE *end_y;
+
+ y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (y == MAP_FAILED)
+ {
+ perror ("mmap");
+ return 1;
+ }
+
+ end_y = (TYPE *) ((char *) y + MMAP_SIZE);
+
+ foo (x, end_y - COUNT * 3, COUNT);
+ bar (x, end_y - COUNT * 5, COUNT);
+
+ return 0;
+}
+
+/* We always require a scalar epilogue here, but we don't know which targets
+   support vector composition this way, so the vectorizer dump is not scanned.  */
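For reference, here is the bounds reasoning behind placing y at the very end of the mapping; the helper below is hypothetical and not part of the test. With y = end_y - COUNT * 3, the last byte foo reads is end_y - 1, the final mapped byte, so any vectorized load that reads past its group would touch unmapped memory and fault (bar is placed analogously with stride 5).

#include <assert.h>

/* Hypothetical helper, illustration only: the last element foo reads from
   y is &y[3 * (n - 1) + 2], which equals end_y - 1 when y == end_y - n * 3.
   Reading even one byte past the group would access unmapped memory.  */
void check_foo_bounds (unsigned char *y, unsigned char *end_y, int n)
{
  unsigned char *last = &y[3 * (n - 1) + 2];
  assert (last == end_y - 1);
}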
new file mode 100644
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse4.1 -mno-avx -fdump-tree-vect-details" } */
+
+void __attribute__((noipa)) foo(unsigned char * __restrict x,
+ unsigned char *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[16*i+0] = y[3*i+0];
+ x[16*i+1] = y[3*i+1];
+ x[16*i+2] = y[3*i+2];
+ x[16*i+3] = y[3*i+0];
+ x[16*i+4] = y[3*i+1];
+ x[16*i+5] = y[3*i+2];
+ x[16*i+6] = y[3*i+0];
+ x[16*i+7] = y[3*i+1];
+ x[16*i+8] = y[3*i+2];
+ x[16*i+9] = y[3*i+0];
+ x[16*i+10] = y[3*i+1];
+ x[16*i+11] = y[3*i+2];
+ x[16*i+12] = y[3*i+0];
+ x[16*i+13] = y[3*i+1];
+ x[16*i+14] = y[3*i+2];
+ x[16*i+15] = y[3*i+0];
+ }
+}
+
+void __attribute__((noipa)) bar(unsigned char * __restrict x,
+ unsigned char *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[16*i+0] = y[5*i+0];
+ x[16*i+1] = y[5*i+1];
+ x[16*i+2] = y[5*i+2];
+ x[16*i+3] = y[5*i+3];
+ x[16*i+4] = y[5*i+4];
+ x[16*i+5] = y[5*i+0];
+ x[16*i+6] = y[5*i+1];
+ x[16*i+7] = y[5*i+2];
+ x[16*i+8] = y[5*i+3];
+ x[16*i+9] = y[5*i+4];
+ x[16*i+10] = y[5*i+0];
+ x[16*i+11] = y[5*i+1];
+ x[16*i+12] = y[5*i+2];
+ x[16*i+13] = y[5*i+3];
+ x[16*i+14] = y[5*i+4];
+ x[16*i+15] = y[5*i+0];
+ }
+}
+
+/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
@@ -2142,7 +2142,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
"Peeling for outer loop is not supported\n");
return false;
}
- unsigned HOST_WIDE_INT cnunits, cvf;
+ unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
if (overrun_p
&& (!nunits.is_constant (&cnunits)
|| !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
@@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
access excess elements.
??? Enhancements include peeling multiple iterations
or using masked loads with a static mask. */
- || (group_size * cvf) % cnunits + group_size - gap < cnunits))
+ || ((group_size * cvf) % cnunits + group_size - gap < cnunits
+ /* But peeling a single scalar iteration is enough if
+ we can use the next power-of-two sized partial
+ access. */
+ && ((cremain = (group_size * cvf - gap) % cnunits), true
+ && ((cpart_size = (1 << ceil_log2 (cremain)))
+ != cnunits)
+ && vector_vector_composition_type
+ (vectype, cnunits / cpart_size,
+ &half_vtype) == NULL_TREE))))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
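To make the new condition concrete, the following standalone sketch reproduces the size computation; the variable names mirror the patch, but the concrete values (group_size = 3, gap = 0, cvf = 1, cnunits = 16) are hypothetical and chosen only to illustrate the arithmetic.

#include <stdio.h>

/* Stands in for ceil_log2: smallest l such that (1u << l) >= x.  */
static unsigned ceil_log2_u (unsigned x)
{
  unsigned l = 0;
  while ((1u << l) < x)
    l++;
  return l;
}

int main (void)
{
  unsigned group_size = 3, gap = 0, cvf = 1, cnunits = 16;  /* hypothetical */
  unsigned cremain = (group_size * cvf - gap) % cnunits;    /* 3 */
  unsigned cpart_size = 1u << ceil_log2_u (cremain);        /* 4 */
  /* cpart_size != cnunits, so if the target can compose a 16-lane vector
     from cnunits / cpart_size = 4 pieces, peeling a single scalar
     iteration is sufficient instead of giving up on the group.  */
  printf ("remain %u -> partial access of %u lanes (%u pieces)\n",
          cremain, cpart_size, cnunits / cpart_size);
  return 0;
}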
@@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo,
gcc_assert (new_vtype
|| LOOP_VINFO_PEELING_FOR_GAPS
(loop_vinfo));
+ /* But still reduce the access size to the next
+ required power-of-two so peeling a single
+ scalar iteration is sufficient. */
+ unsigned HOST_WIDE_INT cremain;
+ if (remain.is_constant (&cremain))
+ {
+ unsigned HOST_WIDE_INT cpart_size
+ = 1 << ceil_log2 (cremain);
+ if (known_gt (nunits, cpart_size)
+ && constant_multiple_p (nunits, cpart_size,
+ &num))
+ {
+ tree ptype;
+ new_vtype
+ = vector_vector_composition_type (vectype,
+ num,
+ &ptype);
+ if (new_vtype)
+ ltype = ptype;
+ }
+ }
}
}
tree offset
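For intuition about what the reduced ltype buys at code-generation time, here is a conceptual sketch using GNU C vector extensions; the types and the function name are made up for illustration and do not correspond to the vectorizer's internal representation.

#include <string.h>

typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef unsigned char v4qi  __attribute__ ((vector_size (4)));

/* Illustration only: read just four bytes near the end of the buffer and
   place them in the low lanes of a 16-lane vector, instead of issuing a
   full 16-byte load that could run past the end of the mapping.  */
v16qi load_tail_piece (const unsigned char *p)
{
  v4qi piece;
  memcpy (&piece, p, sizeof piece);       /* 4-byte access only */
  v16qi out = { 0 };
  memcpy (&out, &piece, sizeof piece);    /* remaining lanes stay zero */
  return out;
}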