@@ -22988,6 +22988,46 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
return default_noce_conversion_profitable_p (seq, if_info);
}
+/* Return true if DR may cause a store-to-load-forwarding (STLF) stall,
+ otherwise false. Any unaligned load from a PARM_DECL that is passed
+ on the stack is considered to have an STLF stall issue. */
+static bool
+ix86_load_maybe_stfs_p (data_reference *dr)
+{
+ tree addr = DR_BASE_ADDRESS (dr);
+ if (TREE_CODE (addr) != ADDR_EXPR)
+ return false;
+ addr = get_base_address (TREE_OPERAND (addr, 0));
+
+ if (TREE_CODE (addr) != PARM_DECL)
+ return false;
+ tree type = TREE_TYPE (addr);
+ if (!type)
+ return false;
+
+ machine_mode mode = TYPE_MODE (type);
+
+ /* Determining whether a parameter is passed on the stack is not
+ exact, i.e. a parameter classified here as register-passed may
+ still end up on the stack because the argument registers ran out. */
+ if (TARGET_64BIT)
+ {
+ /* From function_arg_64. */
+ enum x86_64_reg_class regclass[MAX_CLASSES];
+ int zero_width_bitfields = 0;
+ return !classify_argument (mode, type, regclass, 0, zero_width_bitfields);
+ }
+ else
+ {
+ /* From function_arg_32. */
+ return (mode == E_BLKmode
+ || (AGGREGATE_TYPE_P (type)
+ && (VECTOR_MODE_P (mode) || mode == TImode)));
+ }
+}
+
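Illustration (not part of the patch): a minimal sketch of the pattern
ix86_load_maybe_stfs_p is meant to flag, mirroring the testcases below.
struct X is 32 bytes, so on x86-64 classify_argument assigns it to
memory; the caller fills the stack slot with narrower scalar stores,
and a single 16-byte vector load in the callee then overlaps several of
those stores, which store forwarding cannot satisfy.

struct X { double x[4]; };
typedef double v2df __attribute__ ((vector_size (16)));

v2df
foo (struct X x)
{
  /* SLP would emit one 16-byte load from the by-value parameter here;
     the caller wrote that stack slot with scalar stores, so the wide
     load stalls waiting for the store buffer to drain.  */
  return (v2df) { x.x[1], x.x[0] };
}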
/* x86-specific vector costs. */
class ix86_vector_costs : public vector_costs
{
@@ -23218,6 +23258,17 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
if (stmt_cost == -1)
stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ /* Penalize loads from PARM_DECLs at -O2 to avoid STLF stalls.
+ Performance may be lost when there is no STLF stall
+ (1 vector_load vs. n scalar_loads + CTOR).
+ TODO: both the extra cost and ix86_load_maybe_stfs_p need to be
+ fine-tuned. */
+ if (kind == unaligned_load && stmt_info
+ && stmt_info->slp_type == pure_slp
+ && STMT_VINFO_DATA_REF (stmt_info)
+ && ix86_load_maybe_stfs_p (STMT_VINFO_DATA_REF (stmt_info)))
+ stmt_cost += COSTS_N_INSNS (ix86_cost->stfs / 2);
+
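As a worked example (not part of the patch), with the generic tuning
added below (stfs = 54), each flagged unaligned load gets an extra

  COSTS_N_INSNS (54 / 2) == 27 * 4 == 108

cost units, which is normally enough for the SLP cost model to prefer
the n scalar loads + CTOR form at -O2.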
/* Penalize DFmode vector operations for Bonnell. */
if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
&& vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
@@ -168,6 +168,7 @@ struct processor_costs {
in 32bit, 64bit, 128bit, 256bit and 512bit */
const int sse_unaligned_load[5];/* cost of unaligned load. */
const int sse_unaligned_store[5];/* cost of unaligned store. */
+ const int stfs; /* cost of store forward stall. */
const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */
zmm_move;
const int sse_to_integer; /* cost of moving SSE register to integer. */
@@ -100,6 +100,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
in 128bit, 256bit and 512bit */
{3, 3, 3, 3, 3}, /* cost of unaligned SSE store
in 128bit, 256bit and 512bit */
+ 6, /* cost of store forward stall. */
3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
5, 0, /* Gather load static, per_elt. */
@@ -209,6 +210,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
+ 8, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
@@ -317,6 +319,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
+ 8, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
@@ -427,6 +430,7 @@ struct processor_costs pentium_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
+ 8, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
@@ -528,6 +532,7 @@ struct processor_costs lakemont_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
+ 8, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
@@ -644,6 +649,7 @@ struct processor_costs pentiumpro_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
+ 24, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
@@ -751,6 +757,7 @@ struct processor_costs geode_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{2, 2, 8, 16, 32}, /* cost of unaligned loads. */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
+ 14, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
2, 2, /* Gather load static, per_elt. */
@@ -858,6 +865,7 @@ struct processor_costs k6_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{2, 2, 8, 16, 32}, /* cost of unaligned loads. */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
+ 24, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
2, 2, /* Gather load static, per_elt. */
@@ -971,6 +979,7 @@ struct processor_costs athlon_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 4, 12, 12, 24}, /* cost of unaligned loads. */
{4, 4, 10, 10, 20}, /* cost of unaligned stores. */
+ 14, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
5, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
@@ -1086,6 +1095,7 @@ struct processor_costs k8_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 3, 12, 12, 24}, /* cost of unaligned loads. */
{4, 4, 10, 10, 20}, /* cost of unaligned stores. */
+ 14, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
5, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
@@ -1214,6 +1224,7 @@ struct processor_costs amdfam10_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 4, 3, 7, 12}, /* cost of unaligned loads. */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
+ 21, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
@@ -1334,6 +1345,7 @@ const struct processor_costs bdver_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{12, 12, 10, 40, 60}, /* cost of unaligned loads. */
{10, 10, 10, 40, 60}, /* cost of unaligned stores. */
+ 54, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
16, /* cost of moving SSE register to integer. */
12, 12, /* Gather load static, per_elt. */
@@ -1475,6 +1487,7 @@ struct processor_costs znver1_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 12, 24}, /* cost of unaligned loads. */
{8, 8, 8, 16, 32}, /* cost of unaligned stores. */
+ 42, /* cost of store forward stall. */
2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
6, /* cost of moving SSE register to integer. */
/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
@@ -1630,6 +1643,7 @@ struct processor_costs znver2_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 6, 12}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
+ 42, /* cost of store forward stall. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
@@ -1762,6 +1776,7 @@ struct processor_costs znver3_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 6, 12}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
+ 42, /* cost of store forward stall. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
@@ -1907,6 +1922,7 @@ struct processor_costs skylake_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 10, 20}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
+ 26, /* cost of store forward stall. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
20, 8, /* Gather load static, per_elt. */
@@ -2033,6 +2049,7 @@ struct processor_costs icelake_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 10, 20}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
+ 26, /* cost of store forward stall. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
20, 8, /* Gather load static, per_elt. */
@@ -2153,6 +2170,7 @@ struct processor_costs alderlake_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 10, 15}, /* cost of unaligned loads. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
+ 90, /* cost of store forward stall. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
18, 6, /* Gather load static, per_elt. */
@@ -2266,6 +2284,7 @@ const struct processor_costs btver1_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{10, 10, 12, 48, 96}, /* cost of unaligned loads. */
{10, 10, 12, 48, 96}, /* cost of unaligned stores. */
+ 36, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
14, /* cost of moving SSE register to integer. */
10, 10, /* Gather load static, per_elt. */
@@ -2376,6 +2395,7 @@ const struct processor_costs btver2_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{10, 10, 12, 48, 96}, /* cost of unaligned loads. */
{10, 10, 12, 48, 96}, /* cost of unaligned stores. */
+ 36, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
14, /* cost of moving SSE register to integer. */
10, 10, /* Gather load static, per_elt. */
@@ -2485,6 +2505,7 @@ struct processor_costs pentium4_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{32, 32, 32, 64, 128}, /* cost of unaligned loads. */
{32, 32, 32, 64, 128}, /* cost of unaligned stores. */
+ 10, /* cost of store forward stall. */
12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
20, /* cost of moving SSE register to integer. */
16, 16, /* Gather load static, per_elt. */
@@ -2597,6 +2618,7 @@ struct processor_costs nocona_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{24, 24, 24, 48, 96}, /* cost of unaligned loads. */
{24, 24, 24, 48, 96}, /* cost of unaligned stores. */
+ 8, /* cost of store forward stall. */
6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
20, /* cost of moving SSE register to integer. */
12, 12, /* Gather load static, per_elt. */
@@ -2707,6 +2729,7 @@ struct processor_costs atom_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{16, 16, 16, 32, 64}, /* cost of unaligned loads. */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
+ 32, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
8, /* cost of moving SSE register to integer. */
8, 8, /* Gather load static, per_elt. */
@@ -2817,6 +2840,7 @@ struct processor_costs slm_cost = {
in SImode, DImode and TImode. */
{16, 16, 16, 32, 64}, /* cost of unaligned loads. */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
+ 48, /* cost of store forward stall. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
8, /* cost of moving SSE register to integer. */
8, 8, /* Gather load static, per_elt. */
@@ -2939,6 +2963,7 @@ struct processor_costs tremont_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 10, 15}, /* cost of unaligned loads. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
+ 42, /* cost of store forward stall. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
18, 6, /* Gather load static, per_elt. */
@@ -3051,6 +3076,7 @@ struct processor_costs intel_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{10, 10, 10, 10, 10}, /* cost of unaligned loads. */
{10, 10, 10, 10, 10}, /* cost of unaligned loads. */
+ 22, /* cost of store forward stall. */
2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
4, /* cost of moving SSE register to integer. */
6, 6, /* Gather load static, per_elt. */
@@ -3168,6 +3194,7 @@ struct processor_costs generic_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 10, 15}, /* cost of unaligned loads. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
+ 54, /* cost of store forward stall. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
18, 6, /* Gather load static, per_elt. */
@@ -3291,6 +3318,7 @@ struct processor_costs core_cost = {
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 6, 12}, /* cost of unaligned loads. */
{6, 6, 6, 6, 12}, /* cost of unaligned stores. */
+ 26, /* cost of store forward stall. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2, /* cost of moving SSE register to integer. */
/* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+ return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+ return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
new file mode 100644
@@ -0,0 +1,90 @@
+/* PR target/101908. */
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -O2 -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>.*ray \+ 24B} "slp2" } } */
+/* Verify that loads from the by-value RAY parameter are not vectorized, avoiding an STLF stall. */
+
+#define sqrt __builtin_sqrt
+#define SQ(x) ((x) * (x))
+struct vec3 {
+ double x, y, z;
+};
+
+struct ray {
+ struct vec3 orig, dir;
+};
+
+struct material {
+ struct vec3 col; /* color */
+ double spow; /* specular power */
+ double refl; /* reflection intensity */
+};
+
+struct sphere {
+ struct vec3 pos;
+ double rad;
+ struct material mat;
+ struct sphere *next;
+};
+
+struct spoint {
+ struct vec3 pos, normal, vref; /* position, normal and view reflection */
+ double dist; /* parametric distance of intersection along the ray */
+};
+
+#define ERR_MARGIN 1e-6
+
+#define DOT(a, b) ((a).x * (b).x + (a).y * (b).y + (a).z * (b).z)
+#define NORMALIZE(a) do { \
+ double len = sqrt(DOT(a, a)); \
+ (a).x /= len; (a).y /= len; (a).z /= len; \
+ } while(0);
+
+static struct vec3
+reflect(struct vec3 v, struct vec3 n) {
+ struct vec3 res;
+ double dot = v.x * n.x + v.y * n.y + v.z * n.z;
+ res.x = -(2.0 * dot * n.x - v.x);
+ res.y = -(2.0 * dot * n.y - v.y);
+ res.z = -(2.0 * dot * n.z - v.z);
+ return res;
+}
+
+int ray_sphere(const struct sphere *sph,
+ struct ray ray, struct spoint *sp) {
+ double a, b, c, d, sqrt_d, t1, t2;
+
+ a = SQ(ray.dir.x) + SQ(ray.dir.y) + SQ(ray.dir.z);
+ b = 2.0 * ray.dir.x * (ray.orig.x - sph->pos.x) +
+ 2.0 * ray.dir.y * (ray.orig.y - sph->pos.y) +
+ 2.0 * ray.dir.z * (ray.orig.z - sph->pos.z);
+ c = SQ(sph->pos.x) + SQ(sph->pos.y) + SQ(sph->pos.z) +
+ SQ(ray.orig.x) + SQ(ray.orig.y) + SQ(ray.orig.z) +
+ 2.0 * (-sph->pos.x * ray.orig.x - sph->pos.y * ray.orig.y - sph->pos.z * ray.orig.z) - SQ(sph->rad);
+
+ if((d = SQ(b) - 4.0 * a * c) < 0.0) return 0;
+
+ sqrt_d = sqrt(d);
+ t1 = (-b + sqrt_d) / (2.0 * a);
+ t2 = (-b - sqrt_d) / (2.0 * a);
+
+ if((t1 < ERR_MARGIN && t2 < ERR_MARGIN) || (t1 > 1.0 && t2 > 1.0)) return 0;
+
+ if(sp) {
+ if(t1 < ERR_MARGIN) t1 = t2;
+ if(t2 < ERR_MARGIN) t2 = t1;
+ sp->dist = t1 < t2 ? t1 : t2;
+
+ sp->pos.x = ray.orig.x + ray.dir.x * sp->dist;
+ sp->pos.y = ray.orig.y + ray.dir.y * sp->dist;
+ sp->pos.z = ray.orig.z + ray.dir.z * sp->dist;
+
+ sp->normal.x = (sp->pos.x - sph->pos.x) / sph->rad;
+ sp->normal.y = (sp->pos.y - sph->pos.y) / sph->rad;
+ sp->normal.z = (sp->pos.z - sph->pos.z) / sph->rad;
+
+ sp->vref = reflect(ray.dir, sp->normal);
+ NORMALIZE(sp->vref);
+ }
+ return 1;
+}
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v16qi.c"
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+ p[0] = x.a[1] + y.a[1];
+ p[1] = x.a[2] + y.a[2];
+ p[2] = x.a[3] + y.a[3];
+ p[3] = x.a[4] + y.a[4];
+ p[4] = x.a[5] + y.a[5];
+ p[5] = x.a[6] + y.a[6];
+ p[6] = x.a[7] + y.a[7];
+ p[7] = x.a[8] + y.a[8];
+ p[8] = x.a[9] + y.a[9];
+ p[9] = x.a[10] + y.a[10];
+ p[10] = x.a[11] + y.a[11];
+ p[11] = x.a[12] + y.a[12];
+ p[12] = x.a[13] + y.a[13];
+ p[13] = x.a[14] + y.a[14];
+ p[14] = x.a[15] + y.a[15];
+ p[15] = x.a[16] + y.a[16];
+}
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v16qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v16qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v2qi.c"
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v2qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v2qi.c"
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+ p[14] = x.a[15] + y.a[15];
+ p[15] = x.a[16] + y.a[16];
+}
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v2qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v2qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v4qi.c"
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v4qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v4qi.c"
new file mode 100644
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+ p[12] = x.a[13] + y.a[13];
+ p[13] = x.a[14] + y.a[14];
+ p[14] = x.a[15] + y.a[15];
+ p[15] = x.a[16] + y.a[16];
+}
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v4qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v4qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v8qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v8qi.c"
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v8qi.c"
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v8qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v8qi-adl.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v8qi.c"
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+ p[8] = x.a[9] + y.a[9];
+ p[9] = x.a[10] + y.a[10];
+ p[10] = x.a[11] + y.a[11];
+ p[11] = x.a[12] + y.a[12];
+ p[12] = x.a[13] + y.a[13];
+ p[13] = x.a[14] + y.a[14];
+ p[14] = x.a[15] + y.a[15];
+ p[15] = x.a[16] + y.a[16];
+}
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+ p[8] = x.a[9] + y.a[9];
+ p[9] = x.a[10] + y.a[10];
+ p[10] = x.a[11] + y.a[11];
+ p[11] = x.a[12] + y.a[12];
+ p[12] = x.a[13] + y.a[13];
+ p[13] = x.a[14] + y.a[14];
+ p[14] = x.a[15] + y.a[15];
+ p[15] = x.a[16] + y.a[16];
+}
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v8qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v8qi.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v8qi-adl.c"
new file mode 100644
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v8qi.c"