| Message ID | mptczbv4udm.fsf@arm.com |
|---|---|
| State | New |
| Series | vect: Fix SLP layout handling of masked loads [PR106794] |
On Fri, 16 Sep 2022, Richard Sandiford wrote:

> PR106794 shows that I'd forgotten about masked loads when
> doing the SLP layout changes.  These loads can't currently
> be permuted independently of their mask input, so during
> construction they never get a load permutation.
>
> (If we did support permuting masked loads in future, the mask
> would need to be in the right order for the load, rather than in
> the order implied by the result of the permutation.  Since masked
> loads can't be partly or fully scalarised in the way that normal
> permuted loads can be, there's probably no benefit to fusing the
> permutation and the load.  Permutation after the fact is probably
> good enough.)
>
> Tested on aarch64-linux-gnu & x86_64-linux-gnu.  OK to install?

OK.

Thanks,
Richard.

> Richard
>
>
> gcc/
>         PR tree-optimization/106794
>         PR tree-optimization/106914
>         * tree-vect-slp.cc (vect_optimize_slp_pass::internal_node_cost):
>         Only consider loads that already have a permutation.
>         (vect_optimize_slp_pass::start_choosing_layouts): Assert that
>         loads with permutations are leaf nodes.  Prevent any kind of grouped
>         access from changing layout if it doesn't have a load permutation.
>
> gcc/testsuite/
>         * gcc.dg/vect/pr106914.c: New test.
>         * g++.dg/vect/pr106794.cc: Likewise.
> ---
>  gcc/testsuite/g++.dg/vect/pr106794.cc | 40 +++++++++++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/pr106914.c  | 15 ++++++++++
>  gcc/tree-vect-slp.cc                  | 30 ++++++++++++++------
>  3 files changed, 77 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/vect/pr106794.cc
>  create mode 100644 gcc/testsuite/gcc.dg/vect/pr106914.c
>
> diff --git a/gcc/testsuite/g++.dg/vect/pr106794.cc b/gcc/testsuite/g++.dg/vect/pr106794.cc
> new file mode 100644
> index 00000000000..f056563c4e1
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/vect/pr106794.cc
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-Ofast" } */
> +/* { dg-additional-options "-march=bdver2" { target x86_64-*-* i?86-*-* } } */
> +
> +template <class T> struct Vector3 {
> +  Vector3();
> +  Vector3(T, T, T);
> +  T length() const;
> +  T x, y, z;
> +};
> +template <class T>
> +Vector3<T>::Vector3(T _x, T _y, T _z) : x(_x), y(_y), z(_z) {}
> +Vector3<float> cross(Vector3<float> a, Vector3<float> b) {
> +  return Vector3<float>(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z,
> +                        a.x * b.y - a.y * b.x);
> +}
> +template <class T> T Vector3<T>::length() const { return z; }
> +int generateNormals_i;
> +float generateNormals_p2_0, generateNormals_p0_0;
> +struct SphereMesh {
> +  void generateNormals();
> +  float vertices;
> +};
> +void SphereMesh::generateNormals() {
> +  Vector3<float> *faceNormals = new Vector3<float>;
> +  for (int j; j; j++) {
> +    float *p0 = &vertices + 3, *p1 = &vertices + j * 3, *p2 = &vertices + 3,
> +          *p3 = &vertices + generateNormals_i + j * 3;
> +    Vector3<float> v0(p1[0] - generateNormals_p0_0, p1[1] - 1, p1[2] - 2),
> +        v1(0, 1, 2);
> +    if (v0.length())
> +      v1 = Vector3<float>(p3[0] - generateNormals_p2_0, p3[1] - p2[1],
> +                          p3[2] - p2[2]);
> +    else
> +      v1 = Vector3<float>(generateNormals_p0_0 - p3[0], p0[1] - p3[1],
> +                          p0[2] - p3[2]);
> +    Vector3<float> faceNormal = cross(v0, v1);
> +    faceNormals[j] = faceNormal;
> +  }
> +}
> diff --git a/gcc/testsuite/gcc.dg/vect/pr106914.c b/gcc/testsuite/gcc.dg/vect/pr106914.c
> new file mode 100644
> index 00000000000..9d9b3e30081
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr106914.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-fprofile-generate" } */
> +/* { dg-additional-options "-mavx512vl" { target x86_64-*-* i?86-*-* } } */
> +
> +int *mask_slp_int64_t_8_2_x, *mask_slp_int64_t_8_2_y, *mask_slp_int64_t_8_2_z;
> +
> +void
> +__attribute__mask_slp_int64_t_8_2() {
> +  for (int i; i; i += 8) {
> +    mask_slp_int64_t_8_2_x[i + 6] =
> +        mask_slp_int64_t_8_2_y[i + 6] ? mask_slp_int64_t_8_2_z[i] : 1;
> +    mask_slp_int64_t_8_2_x[i + 7] =
> +        mask_slp_int64_t_8_2_y[i + 7] ? mask_slp_int64_t_8_2_z[i + 7] : 2;
> +  }
> +}
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index ca3422c2a1e..229f2663ebc 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -4494,7 +4494,8 @@ vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
>    stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
>    if (rep
>        && STMT_VINFO_DATA_REF (rep)
> -      && DR_IS_READ (STMT_VINFO_DATA_REF (rep)))
> +      && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
> +      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
>      {
>        auto_load_permutation_t tmp_perm;
>        tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
> @@ -4569,8 +4570,12 @@ vect_optimize_slp_pass::start_choosing_layouts ()
>          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
>            {
>              /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
> -               unpermuted, record a layout that reverses this permutation.  */
> -            gcc_assert (partition.layout == 0);
> +               unpermuted, record a layout that reverses this permutation.
> +
> +               We would need more work to cope with loads that are internally
> +               permuted and also have inputs (such as masks for
> +               IFN_MASK_LOADs).  */
> +            gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
>              if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
>                continue;
>              dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
> @@ -4684,12 +4689,21 @@ vect_optimize_slp_pass::start_choosing_layouts ()
>        vertex.weight = vect_slp_node_weight (node);
>
>        /* We do not handle stores with a permutation, so all
> -         incoming permutations must have been materialized.  */
> +         incoming permutations must have been materialized.
> +
> +         We also don't handle masked grouped loads, which lack a
> +         permutation vector.  In this case the memory locations
> +         form an implicit second input to the loads, on top of the
> +         explicit mask input, and the memory input's layout cannot
> +         be changed.
> +
> +         On the other hand, we do support permuting gather loads and
> +         masked gather loads, where each scalar load is independent
> +         of the others.  This can be useful if the address/index input
> +         benefits from permutation.  */
>        if (STMT_VINFO_DATA_REF (rep)
> -          && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
> -        /* ??? We're forcing materialization in place
> -           of the child here, we'd need special handling
> -           in materialization to leave layout -1 here.  */
> +          && STMT_VINFO_GROUPED_ACCESS (rep)
> +          && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
>          partition.layout = 0;
>
>        /* We cannot change the layout of an operation that is
> -- 
> 2.25.1
>