@@ -1,54 +1,55 @@
-#include "tree-vect.h"
-
-#if VECTOR_BITS > 256
-#define NINTS (VECTOR_BITS / 32)
-#else
-#define NINTS 8
-#endif
-
-#define N (NINTS * 2)
-#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
-
-extern void abort (void);
-
-typedef struct giga
-{
- unsigned int g[N];
-} giga;
-
-unsigned long __attribute__((noinline,noclone))
-addfst(giga const *gptr, int num)
-{
- unsigned int retval = 0;
- int i;
- for (i = 0; i < num; i++)
- retval += gptr[i].g[0];
- return retval;
-}
-
-int main ()
-{
- struct giga g[NINTS];
- unsigned int n = 1;
- int i, j;
- check_vect ();
- for (i = 0; i < NINTS; ++i)
- for (j = 0; j < N; ++j)
- {
- g[i].g[j] = n++;
- __asm__ volatile ("");
- }
- if (addfst (g, NINTS) != RESULT)
- abort ();
- return 0;
-}
-
-/* We don't want to vectorize the single-element interleaving in the way
- we currently do that (without ignoring not needed vectors in the
- gap between gptr[0].g[0] and gptr[1].g[0]), because that's very
- sub-optimal and causes memory explosion (even though the cost model
- should reject that in the end). */
-
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
-/* We end up using gathers for the strided load on RISC-V which would be OK. */
-/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
+#include "tree-vect.h"
+
+#if VECTOR_BITS > 256
+#define NINTS (VECTOR_BITS / 32)
+#else
+#define NINTS 8
+#endif
+
+#define N (NINTS * 2)
+#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
+
+extern void abort (void);
+
+typedef struct giga
+{
+ unsigned int g[N];
+} giga;
+
+unsigned long __attribute__((noinline,noclone))
+addfst(giga const *gptr, int num)
+{
+ unsigned int retval = 0;
+ int i;
+ for (i = 0; i < num; i++)
+ retval += gptr[i].g[0];
+ return retval;
+}
+
+int main ()
+{
+ struct giga g[NINTS];
+ unsigned int n = 1;
+ int i, j;
+ check_vect ();
+ for (i = 0; i < NINTS; ++i)
+ for (j = 0; j < N; ++j)
+ {
+ g[i].g[j] = n++;
+ __asm__ volatile ("");
+ }
+ if (addfst (g, NINTS) != RESULT)
+ abort ();
+ return 0;
+}
+
+/* We don't want to vectorize the single-element interleaving in the way
+   we currently do that (without ignoring unneeded vectors in the
+ gap between gptr[0].g[0] and gptr[1].g[0]), because that's very
+ sub-optimal and causes memory explosion (even though the cost model
+ should reject that in the end). */
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
+/* We should end up using gathers for the strided load on RISC-V. */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" { target { riscv*-*-* } } } } */
+/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
@@ -2081,6 +2081,35 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
else
*memory_access_type = VMAT_CONTIGUOUS;
+      /* If this is single-element interleaving with an element
+	 distance that leaves unused vector loads around, punt - we
+	 at least create very sub-optimal code in that case (and
+	 blow up memory, see PR65518).  */
+ if (loop_vinfo
+ && *memory_access_type == VMAT_CONTIGUOUS
+ && single_element_p
+ && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+ {
+ if (SLP_TREE_LANES (slp_node) == 1)
+ {
+ *memory_access_type = VMAT_ELEMENTWISE;
+ overrun_p = false;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads, using "
+ "elementwise access\n");
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads\n");
+ return false;
+ }
+ }
+
overrun_p = loop_vinfo && gap != 0;
if (overrun_p && vls_type != VLS_LOAD)
{
@@ -2149,6 +2178,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
"Peeling for outer loop is not supported\n");
return false;
}
+
/* Peeling for gaps assumes that a single scalar iteration
is enough to make sure the last vector iteration doesn't
access excess elements. */
@@ -2179,34 +2209,6 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
return false;
}
}
-
- /* If this is single-element interleaving with an element
- distance that leaves unused vector loads around punt - we
- at least create very sub-optimal code in that case (and
- blow up memory, see PR65518). */
- if (loop_vinfo
- && *memory_access_type == VMAT_CONTIGUOUS
- && single_element_p
- && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
- {
- if (SLP_TREE_LANES (slp_node) == 1)
- {
- *memory_access_type = VMAT_ELEMENTWISE;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving not supported "
- "for not adjacent vector loads, using "
- "elementwise access\n");
- }
- else
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving not supported "
- "for not adjacent vector loads\n");
- return false;
- }
- }
}
}
else