| Field | Value |
|---|---|
| Message ID | 1292010331-20221-1-git-send-email-sebpop@gmail.com |
| State | New |
| Headers | show |
On Fri, Dec 10, 2010 at 13:45, Sebastian Pop <sebpop@gmail.com> wrote: > Hi, > > here is the backport of the same patch for the gcc4.5 branch. Please > let me know when you want to commit this patch to the branch. For now > I sent this out for test on amd64-linux. This patch passed regression test. Sebastian > > Sebastian > > 2010-12-10 Sebastian Pop <sebastian.pop@amd.com> > > PR tree-optimization/43023 > * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): > Removed. > (stores_zero_from_loop): Call stmt_stores_zero. > (stmt_with_adjacent_zero_store_dr_p): New. > * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. > (stride_of_unit_type_p): New. > * tree-loop-distribution.c (generate_memset_zero): Do not return a > boolean. Call gcc_assert on stride_of_unit_type_p. > (generate_builtin): Call stmt_stores_zero. > (rdg_flag_all_uses): Removed. > (rdg_flag_similar_memory_accesses): Removed. > (build_rdg_partition_for_component): Removed parameter > other_stores. Removed call to rdg_flag_similar_memory_accesses. > (can_generate_builtin): New. > (similar_memory_accesses): New. > (fuse_partitions_with_similar_memory_accesses): New. > (rdg_build_partitions): Call > fuse_partitions_with_similar_memory_accesses. > > * gfortran.dg/ldist-1.f90: Adjust pattern. > * gfortran.dg/ldist-pr43023.f90: New. 
> --- > gcc/ChangeLog | 22 +++ > gcc/testsuite/ChangeLog | 6 + > gcc/testsuite/gfortran.dg/ldist-1.f90 | 5 +- > gcc/testsuite/gfortran.dg/ldist-pr43023.f90 | 31 ++++ > gcc/tree-data-ref.c | 34 ++++- > gcc/tree-data-ref.h | 12 ++ > gcc/tree-loop-distribution.c | 223 +++++++++++---------------- > 7 files changed, 201 insertions(+), 132 deletions(-) > create mode 100644 gcc/testsuite/gfortran.dg/ldist-pr43023.f90 > > diff --git a/gcc/ChangeLog b/gcc/ChangeLog > index abecb83..dfe45ed 100644 > --- a/gcc/ChangeLog > +++ b/gcc/ChangeLog > @@ -1,3 +1,25 @@ > +2010-12-10 Sebastian Pop <sebastian.pop@amd.com> > + > + PR tree-optimization/43023 > + * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): > + Removed. > + (stores_zero_from_loop): Call stmt_stores_zero. > + (stmt_with_adjacent_zero_store_dr_p): New. > + * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. > + (stride_of_unit_type_p): New. > + * tree-loop-distribution.c (generate_memset_zero): Do not return a > + boolean. Call gcc_assert on stride_of_unit_type_p. > + (generate_builtin): Call stmt_stores_zero. > + (rdg_flag_all_uses): Removed. > + (rdg_flag_similar_memory_accesses): Removed. > + (build_rdg_partition_for_component): Removed parameter > + other_stores. Removed call to rdg_flag_similar_memory_accesses. > + (can_generate_builtin): New. > + (similar_memory_accesses): New. > + (fuse_partitions_with_similar_memory_accesses): New. > + (rdg_build_partitions): Call > + fuse_partitions_with_similar_memory_accesses. > + > 2010-12-07 Jakub Jelinek <jakub@redhat.com> > > Backport from mainline > diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog > index e3a9d24..36fd59d 100644 > --- a/gcc/testsuite/ChangeLog > +++ b/gcc/testsuite/ChangeLog > @@ -1,3 +1,9 @@ > +2010-12-10 Sebastian Pop <sebastian.pop@amd.com> > + > + PR tree-optimization/43023 > + * gfortran.dg/ldist-1.f90: Adjust pattern. > + * gfortran.dg/ldist-pr43023.f90: New. 
> + > 2010-12-07 Sebastian Pop <sebastian.pop@amd.com> > > PR tree-optimization/44676 > diff --git a/gcc/testsuite/gfortran.dg/ldist-1.f90 b/gcc/testsuite/gfortran.dg/ldist-1.f90 > index dd1f02a..bbce2f3 100644 > --- a/gcc/testsuite/gfortran.dg/ldist-1.f90 > +++ b/gcc/testsuite/gfortran.dg/ldist-1.f90 > @@ -29,5 +29,8 @@ Subroutine PADEC(DKS,DKDS,HVAR,WM,WG,FN,NS,AN,BN,CN,IT) > return > end Subroutine PADEC > > -! { dg-final { scan-tree-dump-times "distributed: split to 4 loops" 1 "ldist" } } > +! There are 5 legal partitions in this code. Based on the data > +! locality heuristic, this loop should not be split. > + > +! { dg-final { scan-tree-dump-not "distributed: split to" "ldist" } } > ! { dg-final { cleanup-tree-dump "ldist" } } > diff --git a/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 > new file mode 100644 > index 0000000..3e2d04c > --- /dev/null > +++ b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 > @@ -0,0 +1,31 @@ > +! { dg-do compile } > +! 
{ dg-options "-O2 -ftree-loop-distribution" } > + > +MODULE NFT_mod > + > +implicit none > +integer :: Nangle > +real:: Z0 > +real, dimension(:,:), allocatable :: Angle > +real, dimension(:), allocatable :: exth, ezth, hxth, hyth, hyphi > + > +CONTAINS > + > +SUBROUTINE NFT_Init() > + > +real :: th, fi > +integer :: n > + > +do n = 1,Nangle > + th = Angle(n,1) > + fi = Angle(n,2) > + > + exth(n) = cos(fi)*cos(th) > + ezth(n) = -sin(th) > + hxth(n) = -sin(fi) > + hyth(n) = cos(fi) > + hyphi(n) = -sin(fi) > +end do > +END SUBROUTINE NFT_Init > + > +END MODULE NFT_mod > diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c > index a89d151..dab0177 100644 > --- a/gcc/tree-data-ref.c > +++ b/gcc/tree-data-ref.c > @@ -4594,7 +4594,7 @@ dump_rdg_vertex (FILE *file, struct graph *rdg, int i) > for (e = v->succ; e; e = e->succ_next) > fprintf (file, " %d", e->dest); > > - fprintf (file, ") \n"); > + fprintf (file, ")\n"); > print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); > fprintf (file, ")\n"); > } > @@ -4991,6 +4991,38 @@ stores_from_loop (struct loop *loop, VEC (gimple, heap) **stmts) > free (bbs); > } > > +/* Returns true when the statement at STMT is of the form "A[i] = 0" > + that contains a data reference on its LHS with a stride of the same > + size as its unit type. 
*/ > + > +bool > +stmt_with_adjacent_zero_store_dr_p (gimple stmt) > +{ > + tree op0, op1; > + bool res; > + struct data_reference *dr; > + > + if (!stmt > + || !gimple_vdef (stmt) > + || !is_gimple_assign (stmt) > + || !gimple_assign_single_p (stmt) > + || !(op1 = gimple_assign_rhs1 (stmt)) > + || !(integer_zerop (op1) || real_zerop (op1))) > + return false; > + > + dr = XCNEW (struct data_reference); > + op0 = gimple_assign_lhs (stmt); > + > + DR_STMT (dr) = stmt; > + DR_REF (dr) = op0; > + > + res = dr_analyze_innermost (dr) > + && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)); > + > + free_data_ref (dr); > + return res; > +} > + > /* For a data reference REF, return the declaration of its base > address or NULL_TREE if the base is not determined. */ > > diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h > index 678eb10..90cbca1 100644 > --- a/gcc/tree-data-ref.h > +++ b/gcc/tree-data-ref.h > @@ -567,6 +567,18 @@ void stores_from_loop (struct loop *, VEC (gimple, heap) **); > void remove_similar_memory_refs (VEC (gimple, heap) **); > bool rdg_defs_used_in_other_loops_p (struct graph *, int); > bool have_similar_memory_accesses (gimple, gimple); > +bool stmt_with_adjacent_zero_store_dr_p (gimple); > + > +/* Returns true when STRIDE is equal in absolute value to the size of > + the unit type of TYPE. */ > + > +static inline bool > +stride_of_unit_type_p (tree stride, tree type) > +{ > + return tree_int_cst_equal (fold_unary (ABS_EXPR, TREE_TYPE (stride), > + stride), > + TYPE_SIZE_UNIT (type)); > +} > > /* Determines whether RDG vertices V1 and V2 access to similar memory > locations, in which case they have to be in the same partition. */ > diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c > index a328860..2e420ea 100644 > --- a/gcc/tree-loop-distribution.c > +++ b/gcc/tree-loop-distribution.c > @@ -251,7 +251,7 @@ build_size_arg_loc (location_t loc, tree nb_iter, tree op, > > /* Generate a call to memset. 
Return true when the operation succeeded. */ > > -static bool > +static void > generate_memset_zero (gimple stmt, tree op0, tree nb_iter, > gimple_stmt_iterator bsi) > { > @@ -265,45 +265,27 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, > > DR_STMT (dr) = stmt; > DR_REF (dr) = op0; > - if (!dr_analyze_innermost (dr)) > - goto end; > + res = dr_analyze_innermost (dr); > + gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0))); > > - /* Test for a positive stride, iterating over every element. */ > - if (integer_zerop (size_binop (MINUS_EXPR, > - fold_convert (sizetype, DR_STEP (dr)), > - TYPE_SIZE_UNIT (TREE_TYPE (op0))))) > - { > - addr_base = fold_convert_loc (loc, sizetype, > - size_binop_loc (loc, PLUS_EXPR, > - DR_OFFSET (dr), > - DR_INIT (dr))); > - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, > - TREE_TYPE (DR_BASE_ADDRESS (dr)), > - DR_BASE_ADDRESS (dr), addr_base); > - > - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); > - } > + nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); > + addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); > + addr_base = fold_convert_loc (loc, sizetype, addr_base); > > /* Test for a negative stride, iterating over every element. 
*/ > - else if (integer_zerop (size_binop (PLUS_EXPR, > - TYPE_SIZE_UNIT (TREE_TYPE (op0)), > - fold_convert (sizetype, DR_STEP (dr))))) > + if (integer_zerop (size_binop (PLUS_EXPR, > + TYPE_SIZE_UNIT (TREE_TYPE (op0)), > + fold_convert (sizetype, DR_STEP (dr))))) > { > - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); > - > - addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); > - addr_base = fold_convert_loc (loc, sizetype, addr_base); > addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base, > fold_convert_loc (loc, sizetype, nb_bytes)); > addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base, > TYPE_SIZE_UNIT (TREE_TYPE (op0))); > - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, > - TREE_TYPE (DR_BASE_ADDRESS (dr)), > - DR_BASE_ADDRESS (dr), addr_base); > } > - else > - goto end; > > + addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, > + TREE_TYPE (DR_BASE_ADDRESS (dr)), > + DR_BASE_ADDRESS (dr), addr_base); > mem = force_gimple_operand (addr_base, &stmts, true, NULL); > gimple_seq_add_seq (&stmt_list, stmts); > > @@ -311,14 +293,11 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, > fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes); > gimple_seq_add_stmt (&stmt_list, fn_call); > gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING); > - res = true; > > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "generated memset zero\n"); > > - end: > free_data_ref (dr); > - return res; > } > > /* Tries to generate a builtin function for the instructions of LOOP > @@ -332,7 +311,6 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) > unsigned i, x = 0; > basic_block *bbs; > gimple write = NULL; > - tree op0, op1; > gimple_stmt_iterator bsi; > tree nb_iter = number_of_exit_cond_executions (loop); > > @@ -368,26 +346,17 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) > } > } > > - if (!write) > - goto end; > - > - op0 = 
gimple_assign_lhs (write); > - op1 = gimple_assign_rhs1 (write); > - > - if (!(TREE_CODE (op0) == ARRAY_REF > - || TREE_CODE (op0) == INDIRECT_REF)) > + if (!stmt_with_adjacent_zero_store_dr_p (write)) > goto end; > > /* The new statements will be placed before LOOP. */ > bsi = gsi_last_bb (loop_preheader_edge (loop)->src); > - > - if (gimple_assign_rhs_code (write) == INTEGER_CST > - && (integer_zerop (op1) || real_zerop (op1))) > - res = generate_memset_zero (write, op0, nb_iter, bsi); > + generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi); > + res = true; > > /* If this is the last partition for which we generate code, we have > to destroy the loop. */ > - if (res && !copy_p) > + if (!copy_p) > { > unsigned nbbs = loop->num_nodes; > edge exit = single_exit (loop); > @@ -531,24 +500,6 @@ has_upstream_mem_writes (int u) > static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap, > bitmap, bool *); > > -/* Flag all the uses of U. */ > - > -static void > -rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops, > - bitmap processed, bool *part_has_writes) > -{ > - struct graph_edge *e; > - > - for (e = rdg->vertices[u].succ; e; e = e->succ_next) > - if (!bitmap_bit_p (processed, e->dest)) > - { > - rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops, > - processed, part_has_writes); > - rdg_flag_all_uses (rdg, e->dest, partition, loops, processed, > - part_has_writes); > - } > -} > - > /* Flag the uses of U stopping following the information from > upstream_mem_writes. */ > > @@ -720,68 +671,13 @@ rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition, > } > } > > -/* Flag all the nodes of RDG containing memory accesses that could > - potentially belong to arrays already accessed in the current > - PARTITION. 
*/ > - > -static void > -rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition, > - bitmap loops, bitmap processed, > - VEC (int, heap) **other_stores) > -{ > - bool foo; > - unsigned i, n; > - int j, k, kk; > - bitmap_iterator ii; > - struct graph_edge *e; > - > - EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii) > - if (RDG_MEM_WRITE_STMT (rdg, i) > - || RDG_MEM_READS_STMT (rdg, i)) > - { > - for (j = 0; j < rdg->n_vertices; j++) > - if (!bitmap_bit_p (processed, j) > - && (RDG_MEM_WRITE_STMT (rdg, j) > - || RDG_MEM_READS_STMT (rdg, j)) > - && rdg_has_similar_memory_accesses (rdg, i, j)) > - { > - /* Flag first the node J itself, and all the nodes that > - are needed to compute J. */ > - rdg_flag_vertex_and_dependent (rdg, j, partition, loops, > - processed, &foo); > - > - /* When J is a read, we want to coalesce in the same > - PARTITION all the nodes that are using J: this is > - needed for better cache locality. */ > - rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo); > - > - /* Remove from OTHER_STORES the vertex that we flagged. */ > - if (RDG_MEM_WRITE_STMT (rdg, j)) > - for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++) > - if (kk == j) > - { > - VEC_unordered_remove (int, *other_stores, k); > - break; > - } > - } > - > - /* If the node I has two uses, then keep these together in the > - same PARTITION. */ > - for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++); > - > - if (n > 1) > - rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo); > - } > -} > - > /* Returns a bitmap in which all the statements needed for computing > the strongly connected component C of the RDG are flagged, also > including the loop exit conditions. 
*/ > > static bitmap > build_rdg_partition_for_component (struct graph *rdg, rdgc c, > - bool *part_has_writes, > - VEC (int, heap) **other_stores) > + bool *part_has_writes) > { > int i, v; > bitmap partition = BITMAP_ALLOC (NULL); > @@ -793,13 +689,6 @@ build_rdg_partition_for_component (struct graph *rdg, rdgc c, > rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed, > part_has_writes); > > - /* Also iterate on the array of stores not in the starting vertices, > - and determine those vertices that have some memory affinity with > - the current nodes in the component: these are stores to the same > - arrays, i.e. we're taking care of cache locality. */ > - rdg_flag_similar_memory_accesses (rdg, partition, loops, processed, > - other_stores); > - > rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes); > > BITMAP_FREE (processed); > @@ -863,6 +752,79 @@ rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices, > BITMAP_FREE (saved_components); > } > > +/* Returns true when it is possible to generate a builtin pattern for > + the PARTITION of RDG. For the moment we detect only the memset > + zero pattern. */ > + > +static bool > +can_generate_builtin (struct graph *rdg, bitmap partition) > +{ > + unsigned i; > + bitmap_iterator bi; > + int nb_reads = 0; > + int nb_writes = 0; > + int stores_zero = 0; > + > + EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi) > + if (RDG_MEM_READS_STMT (rdg, i)) > + nb_reads++; > + else if (RDG_MEM_WRITE_STMT (rdg, i)) > + { > + nb_writes++; > + if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i))) > + stores_zero++; > + } > + > + return stores_zero == 1 && nb_writes == 1 && nb_reads == 0; > +} > + > +/* Returns true when PARTITION1 and PARTITION2 have similar memory > + accesses in RDG. 
*/ > + > +static bool > +similar_memory_accesses (struct graph *rdg, bitmap partition1, > + bitmap partition2) > +{ > + unsigned i, j; > + bitmap_iterator bi, bj; > + > + EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi) > + if (RDG_MEM_WRITE_STMT (rdg, i) > + || RDG_MEM_READS_STMT (rdg, i)) > + EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj) > + if (RDG_MEM_WRITE_STMT (rdg, j) > + || RDG_MEM_READS_STMT (rdg, j)) > + if (rdg_has_similar_memory_accesses (rdg, i, j)) > + return true; > + > + return false; > +} > + > +/* Fuse all the partitions from PARTITIONS that contain similar memory > + references, i.e., we're taking care of cache locality. This > + function does not fuse those partitions that contain patterns that > + can be code generated with builtins. */ > + > +static void > +fuse_partitions_with_similar_memory_accesses (struct graph *rdg, > + VEC (bitmap, heap) **partitions) > +{ > + int p1, p2; > + bitmap partition1, partition2; > + > + for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++) > + if (!can_generate_builtin (rdg, partition1)) > + for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++) > + if (p1 != p2 > + && !can_generate_builtin (rdg, partition2) > + && similar_memory_accesses (rdg, partition1, partition2)) > + { > + bitmap_ior_into (partition1, partition2); > + VEC_ordered_remove (bitmap, *partitions, p2); > + p2--; > + } > +} > + > /* Aggregate several components into a useful partition that is > registered in the PARTITIONS vector. Partitions will be > distributed in different loops. 
*/ > @@ -885,8 +847,7 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, > if (bitmap_bit_p (processed, v)) > continue; > > - np = build_rdg_partition_for_component (rdg, x, &part_has_writes, > - other_stores); > + np = build_rdg_partition_for_component (rdg, x, &part_has_writes); > bitmap_ior_into (partition, np); > bitmap_ior_into (processed, np); > BITMAP_FREE (np); > @@ -932,6 +893,8 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, > VEC_safe_push (bitmap, heap, *partitions, partition); > else > BITMAP_FREE (partition); > + > + fuse_partitions_with_similar_memory_accesses (rdg, partitions); > } > > /* Dump to FILE the PARTITIONS. */ > -- > 1.7.1 > >
On Mon, Dec 13, 2010 at 13:37, Sebastian Pop <sebpop@gmail.com> wrote: >> Hi, >> >> here is the backport of the same patch for the gcc4.5 branch. Please >> let me know when you want to commit this patch to the branch. For now >> I sent this out for test on amd64-linux. > > This patch passed regression test. > Ping. Ok for the 4.5 branch now that it is open? Thanks. > Sebastian > >> >> Sebastian >> >> 2010-12-10 Sebastian Pop <sebastian.pop@amd.com> >> >> PR tree-optimization/43023 >> * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): >> Removed. >> (stores_zero_from_loop): Call stmt_stores_zero. >> (stmt_with_adjacent_zero_store_dr_p): New. >> * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. >> (stride_of_unit_type_p): New. >> * tree-loop-distribution.c (generate_memset_zero): Do not return a >> boolean. Call gcc_assert on stride_of_unit_type_p. >> (generate_builtin): Call stmt_stores_zero. >> (rdg_flag_all_uses): Removed. >> (rdg_flag_similar_memory_accesses): Removed. >> (build_rdg_partition_for_component): Removed parameter >> other_stores. Removed call to rdg_flag_similar_memory_accesses. >> (can_generate_builtin): New. >> (similar_memory_accesses): New. >> (fuse_partitions_with_similar_memory_accesses): New. >> (rdg_build_partitions): Call >> fuse_partitions_with_similar_memory_accesses. >> >> * gfortran.dg/ldist-1.f90: Adjust pattern. >> * gfortran.dg/ldist-pr43023.f90: New. 
>> --- >> gcc/ChangeLog | 22 +++ >> gcc/testsuite/ChangeLog | 6 + >> gcc/testsuite/gfortran.dg/ldist-1.f90 | 5 +- >> gcc/testsuite/gfortran.dg/ldist-pr43023.f90 | 31 ++++ >> gcc/tree-data-ref.c | 34 ++++- >> gcc/tree-data-ref.h | 12 ++ >> gcc/tree-loop-distribution.c | 223 +++++++++++---------------- >> 7 files changed, 201 insertions(+), 132 deletions(-) >> create mode 100644 gcc/testsuite/gfortran.dg/ldist-pr43023.f90 >> >> diff --git a/gcc/ChangeLog b/gcc/ChangeLog >> index abecb83..dfe45ed 100644 >> --- a/gcc/ChangeLog >> +++ b/gcc/ChangeLog >> @@ -1,3 +1,25 @@ >> +2010-12-10 Sebastian Pop <sebastian.pop@amd.com> >> + >> + PR tree-optimization/43023 >> + * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): >> + Removed. >> + (stores_zero_from_loop): Call stmt_stores_zero. >> + (stmt_with_adjacent_zero_store_dr_p): New. >> + * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. >> + (stride_of_unit_type_p): New. >> + * tree-loop-distribution.c (generate_memset_zero): Do not return a >> + boolean. Call gcc_assert on stride_of_unit_type_p. >> + (generate_builtin): Call stmt_stores_zero. >> + (rdg_flag_all_uses): Removed. >> + (rdg_flag_similar_memory_accesses): Removed. >> + (build_rdg_partition_for_component): Removed parameter >> + other_stores. Removed call to rdg_flag_similar_memory_accesses. >> + (can_generate_builtin): New. >> + (similar_memory_accesses): New. >> + (fuse_partitions_with_similar_memory_accesses): New. >> + (rdg_build_partitions): Call >> + fuse_partitions_with_similar_memory_accesses. >> + >> 2010-12-07 Jakub Jelinek <jakub@redhat.com> >> >> Backport from mainline >> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog >> index e3a9d24..36fd59d 100644 >> --- a/gcc/testsuite/ChangeLog >> +++ b/gcc/testsuite/ChangeLog >> @@ -1,3 +1,9 @@ >> +2010-12-10 Sebastian Pop <sebastian.pop@amd.com> >> + >> + PR tree-optimization/43023 >> + * gfortran.dg/ldist-1.f90: Adjust pattern. 
>> + * gfortran.dg/ldist-pr43023.f90: New. >> + >> 2010-12-07 Sebastian Pop <sebastian.pop@amd.com> >> >> PR tree-optimization/44676 >> diff --git a/gcc/testsuite/gfortran.dg/ldist-1.f90 b/gcc/testsuite/gfortran.dg/ldist-1.f90 >> index dd1f02a..bbce2f3 100644 >> --- a/gcc/testsuite/gfortran.dg/ldist-1.f90 >> +++ b/gcc/testsuite/gfortran.dg/ldist-1.f90 >> @@ -29,5 +29,8 @@ Subroutine PADEC(DKS,DKDS,HVAR,WM,WG,FN,NS,AN,BN,CN,IT) >> return >> end Subroutine PADEC >> >> -! { dg-final { scan-tree-dump-times "distributed: split to 4 loops" 1 "ldist" } } >> +! There are 5 legal partitions in this code. Based on the data >> +! locality heuristic, this loop should not be split. >> + >> +! { dg-final { scan-tree-dump-not "distributed: split to" "ldist" } } >> ! { dg-final { cleanup-tree-dump "ldist" } } >> diff --git a/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 >> new file mode 100644 >> index 0000000..3e2d04c >> --- /dev/null >> +++ b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 >> @@ -0,0 +1,31 @@ >> +! { dg-do compile } >> +! 
{ dg-options "-O2 -ftree-loop-distribution" } >> + >> +MODULE NFT_mod >> + >> +implicit none >> +integer :: Nangle >> +real:: Z0 >> +real, dimension(:,:), allocatable :: Angle >> +real, dimension(:), allocatable :: exth, ezth, hxth, hyth, hyphi >> + >> +CONTAINS >> + >> +SUBROUTINE NFT_Init() >> + >> +real :: th, fi >> +integer :: n >> + >> +do n = 1,Nangle >> + th = Angle(n,1) >> + fi = Angle(n,2) >> + >> + exth(n) = cos(fi)*cos(th) >> + ezth(n) = -sin(th) >> + hxth(n) = -sin(fi) >> + hyth(n) = cos(fi) >> + hyphi(n) = -sin(fi) >> +end do >> +END SUBROUTINE NFT_Init >> + >> +END MODULE NFT_mod >> diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c >> index a89d151..dab0177 100644 >> --- a/gcc/tree-data-ref.c >> +++ b/gcc/tree-data-ref.c >> @@ -4594,7 +4594,7 @@ dump_rdg_vertex (FILE *file, struct graph *rdg, int i) >> for (e = v->succ; e; e = e->succ_next) >> fprintf (file, " %d", e->dest); >> >> - fprintf (file, ") \n"); >> + fprintf (file, ")\n"); >> print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); >> fprintf (file, ")\n"); >> } >> @@ -4991,6 +4991,38 @@ stores_from_loop (struct loop *loop, VEC (gimple, heap) **stmts) >> free (bbs); >> } >> >> +/* Returns true when the statement at STMT is of the form "A[i] = 0" >> + that contains a data reference on its LHS with a stride of the same >> + size as its unit type. 
*/ >> + >> +bool >> +stmt_with_adjacent_zero_store_dr_p (gimple stmt) >> +{ >> + tree op0, op1; >> + bool res; >> + struct data_reference *dr; >> + >> + if (!stmt >> + || !gimple_vdef (stmt) >> + || !is_gimple_assign (stmt) >> + || !gimple_assign_single_p (stmt) >> + || !(op1 = gimple_assign_rhs1 (stmt)) >> + || !(integer_zerop (op1) || real_zerop (op1))) >> + return false; >> + >> + dr = XCNEW (struct data_reference); >> + op0 = gimple_assign_lhs (stmt); >> + >> + DR_STMT (dr) = stmt; >> + DR_REF (dr) = op0; >> + >> + res = dr_analyze_innermost (dr) >> + && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)); >> + >> + free_data_ref (dr); >> + return res; >> +} >> + >> /* For a data reference REF, return the declaration of its base >> address or NULL_TREE if the base is not determined. */ >> >> diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h >> index 678eb10..90cbca1 100644 >> --- a/gcc/tree-data-ref.h >> +++ b/gcc/tree-data-ref.h >> @@ -567,6 +567,18 @@ void stores_from_loop (struct loop *, VEC (gimple, heap) **); >> void remove_similar_memory_refs (VEC (gimple, heap) **); >> bool rdg_defs_used_in_other_loops_p (struct graph *, int); >> bool have_similar_memory_accesses (gimple, gimple); >> +bool stmt_with_adjacent_zero_store_dr_p (gimple); >> + >> +/* Returns true when STRIDE is equal in absolute value to the size of >> + the unit type of TYPE. */ >> + >> +static inline bool >> +stride_of_unit_type_p (tree stride, tree type) >> +{ >> + return tree_int_cst_equal (fold_unary (ABS_EXPR, TREE_TYPE (stride), >> + stride), >> + TYPE_SIZE_UNIT (type)); >> +} >> >> /* Determines whether RDG vertices V1 and V2 access to similar memory >> locations, in which case they have to be in the same partition. 
*/ >> diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c >> index a328860..2e420ea 100644 >> --- a/gcc/tree-loop-distribution.c >> +++ b/gcc/tree-loop-distribution.c >> @@ -251,7 +251,7 @@ build_size_arg_loc (location_t loc, tree nb_iter, tree op, >> >> /* Generate a call to memset. Return true when the operation succeeded. */ >> >> -static bool >> +static void >> generate_memset_zero (gimple stmt, tree op0, tree nb_iter, >> gimple_stmt_iterator bsi) >> { >> @@ -265,45 +265,27 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, >> >> DR_STMT (dr) = stmt; >> DR_REF (dr) = op0; >> - if (!dr_analyze_innermost (dr)) >> - goto end; >> + res = dr_analyze_innermost (dr); >> + gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0))); >> >> - /* Test for a positive stride, iterating over every element. */ >> - if (integer_zerop (size_binop (MINUS_EXPR, >> - fold_convert (sizetype, DR_STEP (dr)), >> - TYPE_SIZE_UNIT (TREE_TYPE (op0))))) >> - { >> - addr_base = fold_convert_loc (loc, sizetype, >> - size_binop_loc (loc, PLUS_EXPR, >> - DR_OFFSET (dr), >> - DR_INIT (dr))); >> - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, >> - TREE_TYPE (DR_BASE_ADDRESS (dr)), >> - DR_BASE_ADDRESS (dr), addr_base); >> - >> - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); >> - } >> + nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); >> + addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); >> + addr_base = fold_convert_loc (loc, sizetype, addr_base); >> >> /* Test for a negative stride, iterating over every element. 
*/ >> - else if (integer_zerop (size_binop (PLUS_EXPR, >> - TYPE_SIZE_UNIT (TREE_TYPE (op0)), >> - fold_convert (sizetype, DR_STEP (dr))))) >> + if (integer_zerop (size_binop (PLUS_EXPR, >> + TYPE_SIZE_UNIT (TREE_TYPE (op0)), >> + fold_convert (sizetype, DR_STEP (dr))))) >> { >> - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); >> - >> - addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); >> - addr_base = fold_convert_loc (loc, sizetype, addr_base); >> addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base, >> fold_convert_loc (loc, sizetype, nb_bytes)); >> addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base, >> TYPE_SIZE_UNIT (TREE_TYPE (op0))); >> - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, >> - TREE_TYPE (DR_BASE_ADDRESS (dr)), >> - DR_BASE_ADDRESS (dr), addr_base); >> } >> - else >> - goto end; >> >> + addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, >> + TREE_TYPE (DR_BASE_ADDRESS (dr)), >> + DR_BASE_ADDRESS (dr), addr_base); >> mem = force_gimple_operand (addr_base, &stmts, true, NULL); >> gimple_seq_add_seq (&stmt_list, stmts); >> >> @@ -311,14 +293,11 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, >> fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes); >> gimple_seq_add_stmt (&stmt_list, fn_call); >> gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING); >> - res = true; >> >> if (dump_file && (dump_flags & TDF_DETAILS)) >> fprintf (dump_file, "generated memset zero\n"); >> >> - end: >> free_data_ref (dr); >> - return res; >> } >> >> /* Tries to generate a builtin function for the instructions of LOOP >> @@ -332,7 +311,6 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) >> unsigned i, x = 0; >> basic_block *bbs; >> gimple write = NULL; >> - tree op0, op1; >> gimple_stmt_iterator bsi; >> tree nb_iter = number_of_exit_cond_executions (loop); >> >> @@ -368,26 +346,17 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) >> } >> } 
>> >> - if (!write) >> - goto end; >> - >> - op0 = gimple_assign_lhs (write); >> - op1 = gimple_assign_rhs1 (write); >> - >> - if (!(TREE_CODE (op0) == ARRAY_REF >> - || TREE_CODE (op0) == INDIRECT_REF)) >> + if (!stmt_with_adjacent_zero_store_dr_p (write)) >> goto end; >> >> /* The new statements will be placed before LOOP. */ >> bsi = gsi_last_bb (loop_preheader_edge (loop)->src); >> - >> - if (gimple_assign_rhs_code (write) == INTEGER_CST >> - && (integer_zerop (op1) || real_zerop (op1))) >> - res = generate_memset_zero (write, op0, nb_iter, bsi); >> + generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi); >> + res = true; >> >> /* If this is the last partition for which we generate code, we have >> to destroy the loop. */ >> - if (res && !copy_p) >> + if (!copy_p) >> { >> unsigned nbbs = loop->num_nodes; >> edge exit = single_exit (loop); >> @@ -531,24 +500,6 @@ has_upstream_mem_writes (int u) >> static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap, >> bitmap, bool *); >> >> -/* Flag all the uses of U. */ >> - >> -static void >> -rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops, >> - bitmap processed, bool *part_has_writes) >> -{ >> - struct graph_edge *e; >> - >> - for (e = rdg->vertices[u].succ; e; e = e->succ_next) >> - if (!bitmap_bit_p (processed, e->dest)) >> - { >> - rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops, >> - processed, part_has_writes); >> - rdg_flag_all_uses (rdg, e->dest, partition, loops, processed, >> - part_has_writes); >> - } >> -} >> - >> /* Flag the uses of U stopping following the information from >> upstream_mem_writes. */ >> >> @@ -720,68 +671,13 @@ rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition, >> } >> } >> >> -/* Flag all the nodes of RDG containing memory accesses that could >> - potentially belong to arrays already accessed in the current >> - PARTITION. 
*/ >> - >> -static void >> -rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition, >> - bitmap loops, bitmap processed, >> - VEC (int, heap) **other_stores) >> -{ >> - bool foo; >> - unsigned i, n; >> - int j, k, kk; >> - bitmap_iterator ii; >> - struct graph_edge *e; >> - >> - EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii) >> - if (RDG_MEM_WRITE_STMT (rdg, i) >> - || RDG_MEM_READS_STMT (rdg, i)) >> - { >> - for (j = 0; j < rdg->n_vertices; j++) >> - if (!bitmap_bit_p (processed, j) >> - && (RDG_MEM_WRITE_STMT (rdg, j) >> - || RDG_MEM_READS_STMT (rdg, j)) >> - && rdg_has_similar_memory_accesses (rdg, i, j)) >> - { >> - /* Flag first the node J itself, and all the nodes that >> - are needed to compute J. */ >> - rdg_flag_vertex_and_dependent (rdg, j, partition, loops, >> - processed, &foo); >> - >> - /* When J is a read, we want to coalesce in the same >> - PARTITION all the nodes that are using J: this is >> - needed for better cache locality. */ >> - rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo); >> - >> - /* Remove from OTHER_STORES the vertex that we flagged. */ >> - if (RDG_MEM_WRITE_STMT (rdg, j)) >> - for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++) >> - if (kk == j) >> - { >> - VEC_unordered_remove (int, *other_stores, k); >> - break; >> - } >> - } >> - >> - /* If the node I has two uses, then keep these together in the >> - same PARTITION. */ >> - for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++); >> - >> - if (n > 1) >> - rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo); >> - } >> -} >> - >> /* Returns a bitmap in which all the statements needed for computing >> the strongly connected component C of the RDG are flagged, also >> including the loop exit conditions. 
*/ >> >> static bitmap >> build_rdg_partition_for_component (struct graph *rdg, rdgc c, >> - bool *part_has_writes, >> - VEC (int, heap) **other_stores) >> + bool *part_has_writes) >> { >> int i, v; >> bitmap partition = BITMAP_ALLOC (NULL); >> @@ -793,13 +689,6 @@ build_rdg_partition_for_component (struct graph *rdg, rdgc c, >> rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed, >> part_has_writes); >> >> - /* Also iterate on the array of stores not in the starting vertices, >> - and determine those vertices that have some memory affinity with >> - the current nodes in the component: these are stores to the same >> - arrays, i.e. we're taking care of cache locality. */ >> - rdg_flag_similar_memory_accesses (rdg, partition, loops, processed, >> - other_stores); >> - >> rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes); >> >> BITMAP_FREE (processed); >> @@ -863,6 +752,79 @@ rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices, >> BITMAP_FREE (saved_components); >> } >> >> +/* Returns true when it is possible to generate a builtin pattern for >> + the PARTITION of RDG. For the moment we detect only the memset >> + zero pattern. */ >> + >> +static bool >> +can_generate_builtin (struct graph *rdg, bitmap partition) >> +{ >> + unsigned i; >> + bitmap_iterator bi; >> + int nb_reads = 0; >> + int nb_writes = 0; >> + int stores_zero = 0; >> + >> + EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi) >> + if (RDG_MEM_READS_STMT (rdg, i)) >> + nb_reads++; >> + else if (RDG_MEM_WRITE_STMT (rdg, i)) >> + { >> + nb_writes++; >> + if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i))) >> + stores_zero++; >> + } >> + >> + return stores_zero == 1 && nb_writes == 1 && nb_reads == 0; >> +} >> + >> +/* Returns true when PARTITION1 and PARTITION2 have similar memory >> + accesses in RDG. 
*/ >> + >> +static bool >> +similar_memory_accesses (struct graph *rdg, bitmap partition1, >> + bitmap partition2) >> +{ >> + unsigned i, j; >> + bitmap_iterator bi, bj; >> + >> + EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi) >> + if (RDG_MEM_WRITE_STMT (rdg, i) >> + || RDG_MEM_READS_STMT (rdg, i)) >> + EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj) >> + if (RDG_MEM_WRITE_STMT (rdg, j) >> + || RDG_MEM_READS_STMT (rdg, j)) >> + if (rdg_has_similar_memory_accesses (rdg, i, j)) >> + return true; >> + >> + return false; >> +} >> + >> +/* Fuse all the partitions from PARTITIONS that contain similar memory >> + references, i.e., we're taking care of cache locality. This >> + function does not fuse those partitions that contain patterns that >> + can be code generated with builtins. */ >> + >> +static void >> +fuse_partitions_with_similar_memory_accesses (struct graph *rdg, >> + VEC (bitmap, heap) **partitions) >> +{ >> + int p1, p2; >> + bitmap partition1, partition2; >> + >> + for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++) >> + if (!can_generate_builtin (rdg, partition1)) >> + for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++) >> + if (p1 != p2 >> + && !can_generate_builtin (rdg, partition2) >> + && similar_memory_accesses (rdg, partition1, partition2)) >> + { >> + bitmap_ior_into (partition1, partition2); >> + VEC_ordered_remove (bitmap, *partitions, p2); >> + p2--; >> + } >> +} >> + >> /* Aggregate several components into a useful partition that is >> registered in the PARTITIONS vector. Partitions will be >> distributed in different loops. 
*/ >> @@ -885,8 +847,7 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, >> if (bitmap_bit_p (processed, v)) >> continue; >> >> - np = build_rdg_partition_for_component (rdg, x, &part_has_writes, >> - other_stores); >> + np = build_rdg_partition_for_component (rdg, x, &part_has_writes); >> bitmap_ior_into (partition, np); >> bitmap_ior_into (processed, np); >> BITMAP_FREE (np); >> @@ -932,6 +893,8 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, >> VEC_safe_push (bitmap, heap, *partitions, partition); >> else >> BITMAP_FREE (partition); >> + >> + fuse_partitions_with_similar_memory_accesses (rdg, partitions); >> } >> >> /* Dump to FILE the PARTITIONS. */ >> -- >> 1.7.1 >> >> >
On Thu, Dec 16, 2010 at 9:37 PM, Sebastian Pop <sebpop@gmail.com> wrote: > On Mon, Dec 13, 2010 at 13:37, Sebastian Pop <sebpop@gmail.com> wrote: >>> Hi, >>> >>> here is the backport of the same patch for the gcc4.5 branch. Please >>> let me know when you want to commit this patch to the branch. For now >>> I sent this out for test on amd64-linux. >> >> This patch passed regression test. >> > > Ping. Ok for the 4.5 branch now that it is open? The patch is a little bit large, so please wait another week to see if HJ blames it for some new regression. Ok after that. Thanks, Richard. > Thanks. > >> Sebastian >> >>> >>> Sebastian >>> >>> 2010-12-10 Sebastian Pop <sebastian.pop@amd.com> >>> >>> PR tree-optimization/43023 >>> * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): >>> Removed. >>> (stores_zero_from_loop): Call stmt_stores_zero. >>> (stmt_with_adjacent_zero_store_dr_p): New. >>> * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. >>> (stride_of_unit_type_p): New. >>> * tree-loop-distribution.c (generate_memset_zero): Do not return a >>> boolean. Call gcc_assert on stride_of_unit_type_p. >>> (generate_builtin): Call stmt_stores_zero. >>> (rdg_flag_all_uses): Removed. >>> (rdg_flag_similar_memory_accesses): Removed. >>> (build_rdg_partition_for_component): Removed parameter >>> other_stores. Removed call to rdg_flag_similar_memory_accesses. >>> (can_generate_builtin): New. >>> (similar_memory_accesses): New. >>> (fuse_partitions_with_similar_memory_accesses): New. >>> (rdg_build_partitions): Call >>> fuse_partitions_with_similar_memory_accesses. >>> >>> * gfortran.dg/ldist-1.f90: Adjust pattern. >>> * gfortran.dg/ldist-pr43023.f90: New. 
>>> --- >>> gcc/ChangeLog | 22 +++ >>> gcc/testsuite/ChangeLog | 6 + >>> gcc/testsuite/gfortran.dg/ldist-1.f90 | 5 +- >>> gcc/testsuite/gfortran.dg/ldist-pr43023.f90 | 31 ++++ >>> gcc/tree-data-ref.c | 34 ++++- >>> gcc/tree-data-ref.h | 12 ++ >>> gcc/tree-loop-distribution.c | 223 +++++++++++---------------- >>> 7 files changed, 201 insertions(+), 132 deletions(-) >>> create mode 100644 gcc/testsuite/gfortran.dg/ldist-pr43023.f90 >>> >>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog >>> index abecb83..dfe45ed 100644 >>> --- a/gcc/ChangeLog >>> +++ b/gcc/ChangeLog >>> @@ -1,3 +1,25 @@ >>> +2010-12-10 Sebastian Pop <sebastian.pop@amd.com> >>> + >>> + PR tree-optimization/43023 >>> + * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): >>> + Removed. >>> + (stores_zero_from_loop): Call stmt_stores_zero. >>> + (stmt_with_adjacent_zero_store_dr_p): New. >>> + * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. >>> + (stride_of_unit_type_p): New. >>> + * tree-loop-distribution.c (generate_memset_zero): Do not return a >>> + boolean. Call gcc_assert on stride_of_unit_type_p. >>> + (generate_builtin): Call stmt_stores_zero. >>> + (rdg_flag_all_uses): Removed. >>> + (rdg_flag_similar_memory_accesses): Removed. >>> + (build_rdg_partition_for_component): Removed parameter >>> + other_stores. Removed call to rdg_flag_similar_memory_accesses. >>> + (can_generate_builtin): New. >>> + (similar_memory_accesses): New. >>> + (fuse_partitions_with_similar_memory_accesses): New. >>> + (rdg_build_partitions): Call >>> + fuse_partitions_with_similar_memory_accesses. 
>>> + >>> 2010-12-07 Jakub Jelinek <jakub@redhat.com> >>> >>> Backport from mainline >>> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog >>> index e3a9d24..36fd59d 100644 >>> --- a/gcc/testsuite/ChangeLog >>> +++ b/gcc/testsuite/ChangeLog >>> @@ -1,3 +1,9 @@ >>> +2010-12-10 Sebastian Pop <sebastian.pop@amd.com> >>> + >>> + PR tree-optimization/43023 >>> + * gfortran.dg/ldist-1.f90: Adjust pattern. >>> + * gfortran.dg/ldist-pr43023.f90: New. >>> + >>> 2010-12-07 Sebastian Pop <sebastian.pop@amd.com> >>> >>> PR tree-optimization/44676 >>> diff --git a/gcc/testsuite/gfortran.dg/ldist-1.f90 b/gcc/testsuite/gfortran.dg/ldist-1.f90 >>> index dd1f02a..bbce2f3 100644 >>> --- a/gcc/testsuite/gfortran.dg/ldist-1.f90 >>> +++ b/gcc/testsuite/gfortran.dg/ldist-1.f90 >>> @@ -29,5 +29,8 @@ Subroutine PADEC(DKS,DKDS,HVAR,WM,WG,FN,NS,AN,BN,CN,IT) >>> return >>> end Subroutine PADEC >>> >>> -! { dg-final { scan-tree-dump-times "distributed: split to 4 loops" 1 "ldist" } } >>> +! There are 5 legal partitions in this code. Based on the data >>> +! locality heuristic, this loop should not be split. >>> + >>> +! { dg-final { scan-tree-dump-not "distributed: split to" "ldist" } } >>> ! { dg-final { cleanup-tree-dump "ldist" } } >>> diff --git a/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 >>> new file mode 100644 >>> index 0000000..3e2d04c >>> --- /dev/null >>> +++ b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 >>> @@ -0,0 +1,31 @@ >>> +! { dg-do compile } >>> +! 
{ dg-options "-O2 -ftree-loop-distribution" } >>> + >>> +MODULE NFT_mod >>> + >>> +implicit none >>> +integer :: Nangle >>> +real:: Z0 >>> +real, dimension(:,:), allocatable :: Angle >>> +real, dimension(:), allocatable :: exth, ezth, hxth, hyth, hyphi >>> + >>> +CONTAINS >>> + >>> +SUBROUTINE NFT_Init() >>> + >>> +real :: th, fi >>> +integer :: n >>> + >>> +do n = 1,Nangle >>> + th = Angle(n,1) >>> + fi = Angle(n,2) >>> + >>> + exth(n) = cos(fi)*cos(th) >>> + ezth(n) = -sin(th) >>> + hxth(n) = -sin(fi) >>> + hyth(n) = cos(fi) >>> + hyphi(n) = -sin(fi) >>> +end do >>> +END SUBROUTINE NFT_Init >>> + >>> +END MODULE NFT_mod >>> diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c >>> index a89d151..dab0177 100644 >>> --- a/gcc/tree-data-ref.c >>> +++ b/gcc/tree-data-ref.c >>> @@ -4594,7 +4594,7 @@ dump_rdg_vertex (FILE *file, struct graph *rdg, int i) >>> for (e = v->succ; e; e = e->succ_next) >>> fprintf (file, " %d", e->dest); >>> >>> - fprintf (file, ") \n"); >>> + fprintf (file, ")\n"); >>> print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); >>> fprintf (file, ")\n"); >>> } >>> @@ -4991,6 +4991,38 @@ stores_from_loop (struct loop *loop, VEC (gimple, heap) **stmts) >>> free (bbs); >>> } >>> >>> +/* Returns true when the statement at STMT is of the form "A[i] = 0" >>> + that contains a data reference on its LHS with a stride of the same >>> + size as its unit type. 
*/ >>> + >>> +bool >>> +stmt_with_adjacent_zero_store_dr_p (gimple stmt) >>> +{ >>> + tree op0, op1; >>> + bool res; >>> + struct data_reference *dr; >>> + >>> + if (!stmt >>> + || !gimple_vdef (stmt) >>> + || !is_gimple_assign (stmt) >>> + || !gimple_assign_single_p (stmt) >>> + || !(op1 = gimple_assign_rhs1 (stmt)) >>> + || !(integer_zerop (op1) || real_zerop (op1))) >>> + return false; >>> + >>> + dr = XCNEW (struct data_reference); >>> + op0 = gimple_assign_lhs (stmt); >>> + >>> + DR_STMT (dr) = stmt; >>> + DR_REF (dr) = op0; >>> + >>> + res = dr_analyze_innermost (dr) >>> + && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)); >>> + >>> + free_data_ref (dr); >>> + return res; >>> +} >>> + >>> /* For a data reference REF, return the declaration of its base >>> address or NULL_TREE if the base is not determined. */ >>> >>> diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h >>> index 678eb10..90cbca1 100644 >>> --- a/gcc/tree-data-ref.h >>> +++ b/gcc/tree-data-ref.h >>> @@ -567,6 +567,18 @@ void stores_from_loop (struct loop *, VEC (gimple, heap) **); >>> void remove_similar_memory_refs (VEC (gimple, heap) **); >>> bool rdg_defs_used_in_other_loops_p (struct graph *, int); >>> bool have_similar_memory_accesses (gimple, gimple); >>> +bool stmt_with_adjacent_zero_store_dr_p (gimple); >>> + >>> +/* Returns true when STRIDE is equal in absolute value to the size of >>> + the unit type of TYPE. */ >>> + >>> +static inline bool >>> +stride_of_unit_type_p (tree stride, tree type) >>> +{ >>> + return tree_int_cst_equal (fold_unary (ABS_EXPR, TREE_TYPE (stride), >>> + stride), >>> + TYPE_SIZE_UNIT (type)); >>> +} >>> >>> /* Determines whether RDG vertices V1 and V2 access to similar memory >>> locations, in which case they have to be in the same partition. 
*/ >>> diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c >>> index a328860..2e420ea 100644 >>> --- a/gcc/tree-loop-distribution.c >>> +++ b/gcc/tree-loop-distribution.c >>> @@ -251,7 +251,7 @@ build_size_arg_loc (location_t loc, tree nb_iter, tree op, >>> >>> /* Generate a call to memset. Return true when the operation succeeded. */ >>> >>> -static bool >>> +static void >>> generate_memset_zero (gimple stmt, tree op0, tree nb_iter, >>> gimple_stmt_iterator bsi) >>> { >>> @@ -265,45 +265,27 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, >>> >>> DR_STMT (dr) = stmt; >>> DR_REF (dr) = op0; >>> - if (!dr_analyze_innermost (dr)) >>> - goto end; >>> + res = dr_analyze_innermost (dr); >>> + gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0))); >>> >>> - /* Test for a positive stride, iterating over every element. */ >>> - if (integer_zerop (size_binop (MINUS_EXPR, >>> - fold_convert (sizetype, DR_STEP (dr)), >>> - TYPE_SIZE_UNIT (TREE_TYPE (op0))))) >>> - { >>> - addr_base = fold_convert_loc (loc, sizetype, >>> - size_binop_loc (loc, PLUS_EXPR, >>> - DR_OFFSET (dr), >>> - DR_INIT (dr))); >>> - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, >>> - TREE_TYPE (DR_BASE_ADDRESS (dr)), >>> - DR_BASE_ADDRESS (dr), addr_base); >>> - >>> - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); >>> - } >>> + nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); >>> + addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); >>> + addr_base = fold_convert_loc (loc, sizetype, addr_base); >>> >>> /* Test for a negative stride, iterating over every element. 
*/ >>> - else if (integer_zerop (size_binop (PLUS_EXPR, >>> - TYPE_SIZE_UNIT (TREE_TYPE (op0)), >>> - fold_convert (sizetype, DR_STEP (dr))))) >>> + if (integer_zerop (size_binop (PLUS_EXPR, >>> + TYPE_SIZE_UNIT (TREE_TYPE (op0)), >>> + fold_convert (sizetype, DR_STEP (dr))))) >>> { >>> - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); >>> - >>> - addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); >>> - addr_base = fold_convert_loc (loc, sizetype, addr_base); >>> addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base, >>> fold_convert_loc (loc, sizetype, nb_bytes)); >>> addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base, >>> TYPE_SIZE_UNIT (TREE_TYPE (op0))); >>> - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, >>> - TREE_TYPE (DR_BASE_ADDRESS (dr)), >>> - DR_BASE_ADDRESS (dr), addr_base); >>> } >>> - else >>> - goto end; >>> >>> + addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, >>> + TREE_TYPE (DR_BASE_ADDRESS (dr)), >>> + DR_BASE_ADDRESS (dr), addr_base); >>> mem = force_gimple_operand (addr_base, &stmts, true, NULL); >>> gimple_seq_add_seq (&stmt_list, stmts); >>> >>> @@ -311,14 +293,11 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, >>> fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes); >>> gimple_seq_add_stmt (&stmt_list, fn_call); >>> gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING); >>> - res = true; >>> >>> if (dump_file && (dump_flags & TDF_DETAILS)) >>> fprintf (dump_file, "generated memset zero\n"); >>> >>> - end: >>> free_data_ref (dr); >>> - return res; >>> } >>> >>> /* Tries to generate a builtin function for the instructions of LOOP >>> @@ -332,7 +311,6 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) >>> unsigned i, x = 0; >>> basic_block *bbs; >>> gimple write = NULL; >>> - tree op0, op1; >>> gimple_stmt_iterator bsi; >>> tree nb_iter = number_of_exit_cond_executions (loop); >>> >>> @@ -368,26 +346,17 @@ generate_builtin (struct 
loop *loop, bitmap partition, bool copy_p) >>> } >>> } >>> >>> - if (!write) >>> - goto end; >>> - >>> - op0 = gimple_assign_lhs (write); >>> - op1 = gimple_assign_rhs1 (write); >>> - >>> - if (!(TREE_CODE (op0) == ARRAY_REF >>> - || TREE_CODE (op0) == INDIRECT_REF)) >>> + if (!stmt_with_adjacent_zero_store_dr_p (write)) >>> goto end; >>> >>> /* The new statements will be placed before LOOP. */ >>> bsi = gsi_last_bb (loop_preheader_edge (loop)->src); >>> - >>> - if (gimple_assign_rhs_code (write) == INTEGER_CST >>> - && (integer_zerop (op1) || real_zerop (op1))) >>> - res = generate_memset_zero (write, op0, nb_iter, bsi); >>> + generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi); >>> + res = true; >>> >>> /* If this is the last partition for which we generate code, we have >>> to destroy the loop. */ >>> - if (res && !copy_p) >>> + if (!copy_p) >>> { >>> unsigned nbbs = loop->num_nodes; >>> edge exit = single_exit (loop); >>> @@ -531,24 +500,6 @@ has_upstream_mem_writes (int u) >>> static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap, >>> bitmap, bool *); >>> >>> -/* Flag all the uses of U. */ >>> - >>> -static void >>> -rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops, >>> - bitmap processed, bool *part_has_writes) >>> -{ >>> - struct graph_edge *e; >>> - >>> - for (e = rdg->vertices[u].succ; e; e = e->succ_next) >>> - if (!bitmap_bit_p (processed, e->dest)) >>> - { >>> - rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops, >>> - processed, part_has_writes); >>> - rdg_flag_all_uses (rdg, e->dest, partition, loops, processed, >>> - part_has_writes); >>> - } >>> -} >>> - >>> /* Flag the uses of U stopping following the information from >>> upstream_mem_writes. 
*/ >>> >>> @@ -720,68 +671,13 @@ rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition, >>> } >>> } >>> >>> -/* Flag all the nodes of RDG containing memory accesses that could >>> - potentially belong to arrays already accessed in the current >>> - PARTITION. */ >>> - >>> -static void >>> -rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition, >>> - bitmap loops, bitmap processed, >>> - VEC (int, heap) **other_stores) >>> -{ >>> - bool foo; >>> - unsigned i, n; >>> - int j, k, kk; >>> - bitmap_iterator ii; >>> - struct graph_edge *e; >>> - >>> - EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii) >>> - if (RDG_MEM_WRITE_STMT (rdg, i) >>> - || RDG_MEM_READS_STMT (rdg, i)) >>> - { >>> - for (j = 0; j < rdg->n_vertices; j++) >>> - if (!bitmap_bit_p (processed, j) >>> - && (RDG_MEM_WRITE_STMT (rdg, j) >>> - || RDG_MEM_READS_STMT (rdg, j)) >>> - && rdg_has_similar_memory_accesses (rdg, i, j)) >>> - { >>> - /* Flag first the node J itself, and all the nodes that >>> - are needed to compute J. */ >>> - rdg_flag_vertex_and_dependent (rdg, j, partition, loops, >>> - processed, &foo); >>> - >>> - /* When J is a read, we want to coalesce in the same >>> - PARTITION all the nodes that are using J: this is >>> - needed for better cache locality. */ >>> - rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo); >>> - >>> - /* Remove from OTHER_STORES the vertex that we flagged. */ >>> - if (RDG_MEM_WRITE_STMT (rdg, j)) >>> - for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++) >>> - if (kk == j) >>> - { >>> - VEC_unordered_remove (int, *other_stores, k); >>> - break; >>> - } >>> - } >>> - >>> - /* If the node I has two uses, then keep these together in the >>> - same PARTITION. 
*/ >>> - for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++); >>> - >>> - if (n > 1) >>> - rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo); >>> - } >>> -} >>> - >>> /* Returns a bitmap in which all the statements needed for computing >>> the strongly connected component C of the RDG are flagged, also >>> including the loop exit conditions. */ >>> >>> static bitmap >>> build_rdg_partition_for_component (struct graph *rdg, rdgc c, >>> - bool *part_has_writes, >>> - VEC (int, heap) **other_stores) >>> + bool *part_has_writes) >>> { >>> int i, v; >>> bitmap partition = BITMAP_ALLOC (NULL); >>> @@ -793,13 +689,6 @@ build_rdg_partition_for_component (struct graph *rdg, rdgc c, >>> rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed, >>> part_has_writes); >>> >>> - /* Also iterate on the array of stores not in the starting vertices, >>> - and determine those vertices that have some memory affinity with >>> - the current nodes in the component: these are stores to the same >>> - arrays, i.e. we're taking care of cache locality. */ >>> - rdg_flag_similar_memory_accesses (rdg, partition, loops, processed, >>> - other_stores); >>> - >>> rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes); >>> >>> BITMAP_FREE (processed); >>> @@ -863,6 +752,79 @@ rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices, >>> BITMAP_FREE (saved_components); >>> } >>> >>> +/* Returns true when it is possible to generate a builtin pattern for >>> + the PARTITION of RDG. For the moment we detect only the memset >>> + zero pattern. 
*/ >>> + >>> +static bool >>> +can_generate_builtin (struct graph *rdg, bitmap partition) >>> +{ >>> + unsigned i; >>> + bitmap_iterator bi; >>> + int nb_reads = 0; >>> + int nb_writes = 0; >>> + int stores_zero = 0; >>> + >>> + EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi) >>> + if (RDG_MEM_READS_STMT (rdg, i)) >>> + nb_reads++; >>> + else if (RDG_MEM_WRITE_STMT (rdg, i)) >>> + { >>> + nb_writes++; >>> + if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i))) >>> + stores_zero++; >>> + } >>> + >>> + return stores_zero == 1 && nb_writes == 1 && nb_reads == 0; >>> +} >>> + >>> +/* Returns true when PARTITION1 and PARTITION2 have similar memory >>> + accesses in RDG. */ >>> + >>> +static bool >>> +similar_memory_accesses (struct graph *rdg, bitmap partition1, >>> + bitmap partition2) >>> +{ >>> + unsigned i, j; >>> + bitmap_iterator bi, bj; >>> + >>> + EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi) >>> + if (RDG_MEM_WRITE_STMT (rdg, i) >>> + || RDG_MEM_READS_STMT (rdg, i)) >>> + EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj) >>> + if (RDG_MEM_WRITE_STMT (rdg, j) >>> + || RDG_MEM_READS_STMT (rdg, j)) >>> + if (rdg_has_similar_memory_accesses (rdg, i, j)) >>> + return true; >>> + >>> + return false; >>> +} >>> + >>> +/* Fuse all the partitions from PARTITIONS that contain similar memory >>> + references, i.e., we're taking care of cache locality. This >>> + function does not fuse those partitions that contain patterns that >>> + can be code generated with builtins. 
*/ >>> + >>> +static void >>> +fuse_partitions_with_similar_memory_accesses (struct graph *rdg, >>> + VEC (bitmap, heap) **partitions) >>> +{ >>> + int p1, p2; >>> + bitmap partition1, partition2; >>> + >>> + for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++) >>> + if (!can_generate_builtin (rdg, partition1)) >>> + for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++) >>> + if (p1 != p2 >>> + && !can_generate_builtin (rdg, partition2) >>> + && similar_memory_accesses (rdg, partition1, partition2)) >>> + { >>> + bitmap_ior_into (partition1, partition2); >>> + VEC_ordered_remove (bitmap, *partitions, p2); >>> + p2--; >>> + } >>> +} >>> + >>> /* Aggregate several components into a useful partition that is >>> registered in the PARTITIONS vector. Partitions will be >>> distributed in different loops. */ >>> @@ -885,8 +847,7 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, >>> if (bitmap_bit_p (processed, v)) >>> continue; >>> >>> - np = build_rdg_partition_for_component (rdg, x, &part_has_writes, >>> - other_stores); >>> + np = build_rdg_partition_for_component (rdg, x, &part_has_writes); >>> bitmap_ior_into (partition, np); >>> bitmap_ior_into (processed, np); >>> BITMAP_FREE (np); >>> @@ -932,6 +893,8 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, >>> VEC_safe_push (bitmap, heap, *partitions, partition); >>> else >>> BITMAP_FREE (partition); >>> + >>> + fuse_partitions_with_similar_memory_accesses (rdg, partitions); >>> } >>> >>> /* Dump to FILE the PARTITIONS. */ >>> -- >>> 1.7.1 >>> >>> >> >
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index abecb83..dfe45ed 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,25 @@ +2010-12-10 Sebastian Pop <sebastian.pop@amd.com> + + PR tree-optimization/43023 + * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): + Removed. + (stores_zero_from_loop): Call stmt_stores_zero. + (stmt_with_adjacent_zero_store_dr_p): New. + * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. + (stride_of_unit_type_p): New. + * tree-loop-distribution.c (generate_memset_zero): Do not return a + boolean. Call gcc_assert on stride_of_unit_type_p. + (generate_builtin): Call stmt_stores_zero. + (rdg_flag_all_uses): Removed. + (rdg_flag_similar_memory_accesses): Removed. + (build_rdg_partition_for_component): Removed parameter + other_stores. Removed call to rdg_flag_similar_memory_accesses. + (can_generate_builtin): New. + (similar_memory_accesses): New. + (fuse_partitions_with_similar_memory_accesses): New. + (rdg_build_partitions): Call + fuse_partitions_with_similar_memory_accesses. + 2010-12-07 Jakub Jelinek <jakub@redhat.com> Backport from mainline diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e3a9d24..36fd59d 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2010-12-10 Sebastian Pop <sebastian.pop@amd.com> + + PR tree-optimization/43023 + * gfortran.dg/ldist-1.f90: Adjust pattern. + * gfortran.dg/ldist-pr43023.f90: New. + 2010-12-07 Sebastian Pop <sebastian.pop@amd.com> PR tree-optimization/44676 diff --git a/gcc/testsuite/gfortran.dg/ldist-1.f90 b/gcc/testsuite/gfortran.dg/ldist-1.f90 index dd1f02a..bbce2f3 100644 --- a/gcc/testsuite/gfortran.dg/ldist-1.f90 +++ b/gcc/testsuite/gfortran.dg/ldist-1.f90 @@ -29,5 +29,8 @@ Subroutine PADEC(DKS,DKDS,HVAR,WM,WG,FN,NS,AN,BN,CN,IT) return end Subroutine PADEC -! { dg-final { scan-tree-dump-times "distributed: split to 4 loops" 1 "ldist" } } +! There are 5 legal partitions in this code. Based on the data +! 
locality heuristic, this loop should not be split. + +! { dg-final { scan-tree-dump-not "distributed: split to" "ldist" } } ! { dg-final { cleanup-tree-dump "ldist" } } diff --git a/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 new file mode 100644 index 0000000..3e2d04c --- /dev/null +++ b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 @@ -0,0 +1,31 @@ +! { dg-do compile } +! { dg-options "-O2 -ftree-loop-distribution" } + +MODULE NFT_mod + +implicit none +integer :: Nangle +real:: Z0 +real, dimension(:,:), allocatable :: Angle +real, dimension(:), allocatable :: exth, ezth, hxth, hyth, hyphi + +CONTAINS + +SUBROUTINE NFT_Init() + +real :: th, fi +integer :: n + +do n = 1,Nangle + th = Angle(n,1) + fi = Angle(n,2) + + exth(n) = cos(fi)*cos(th) + ezth(n) = -sin(th) + hxth(n) = -sin(fi) + hyth(n) = cos(fi) + hyphi(n) = -sin(fi) +end do +END SUBROUTINE NFT_Init + +END MODULE NFT_mod diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index a89d151..dab0177 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -4594,7 +4594,7 @@ dump_rdg_vertex (FILE *file, struct graph *rdg, int i) for (e = v->succ; e; e = e->succ_next) fprintf (file, " %d", e->dest); - fprintf (file, ") \n"); + fprintf (file, ")\n"); print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); fprintf (file, ")\n"); } @@ -4991,6 +4991,38 @@ stores_from_loop (struct loop *loop, VEC (gimple, heap) **stmts) free (bbs); } +/* Returns true when the statement at STMT is of the form "A[i] = 0" + that contains a data reference on its LHS with a stride of the same + size as its unit type. 
*/ + +bool +stmt_with_adjacent_zero_store_dr_p (gimple stmt) +{ + tree op0, op1; + bool res; + struct data_reference *dr; + + if (!stmt + || !gimple_vdef (stmt) + || !is_gimple_assign (stmt) + || !gimple_assign_single_p (stmt) + || !(op1 = gimple_assign_rhs1 (stmt)) + || !(integer_zerop (op1) || real_zerop (op1))) + return false; + + dr = XCNEW (struct data_reference); + op0 = gimple_assign_lhs (stmt); + + DR_STMT (dr) = stmt; + DR_REF (dr) = op0; + + res = dr_analyze_innermost (dr) + && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)); + + free_data_ref (dr); + return res; +} + /* For a data reference REF, return the declaration of its base address or NULL_TREE if the base is not determined. */ diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h index 678eb10..90cbca1 100644 --- a/gcc/tree-data-ref.h +++ b/gcc/tree-data-ref.h @@ -567,6 +567,18 @@ void stores_from_loop (struct loop *, VEC (gimple, heap) **); void remove_similar_memory_refs (VEC (gimple, heap) **); bool rdg_defs_used_in_other_loops_p (struct graph *, int); bool have_similar_memory_accesses (gimple, gimple); +bool stmt_with_adjacent_zero_store_dr_p (gimple); + +/* Returns true when STRIDE is equal in absolute value to the size of + the unit type of TYPE. */ + +static inline bool +stride_of_unit_type_p (tree stride, tree type) +{ + return tree_int_cst_equal (fold_unary (ABS_EXPR, TREE_TYPE (stride), + stride), + TYPE_SIZE_UNIT (type)); +} /* Determines whether RDG vertices V1 and V2 access to similar memory locations, in which case they have to be in the same partition. */ diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index a328860..2e420ea 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -251,7 +251,7 @@ build_size_arg_loc (location_t loc, tree nb_iter, tree op, /* Generate a call to memset. Return true when the operation succeeded. 
*/ -static bool +static void generate_memset_zero (gimple stmt, tree op0, tree nb_iter, gimple_stmt_iterator bsi) { @@ -265,45 +265,27 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, DR_STMT (dr) = stmt; DR_REF (dr) = op0; - if (!dr_analyze_innermost (dr)) - goto end; + res = dr_analyze_innermost (dr); + gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0))); - /* Test for a positive stride, iterating over every element. */ - if (integer_zerop (size_binop (MINUS_EXPR, - fold_convert (sizetype, DR_STEP (dr)), - TYPE_SIZE_UNIT (TREE_TYPE (op0))))) - { - addr_base = fold_convert_loc (loc, sizetype, - size_binop_loc (loc, PLUS_EXPR, - DR_OFFSET (dr), - DR_INIT (dr))); - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, - TREE_TYPE (DR_BASE_ADDRESS (dr)), - DR_BASE_ADDRESS (dr), addr_base); - - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); - } + nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); + addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); + addr_base = fold_convert_loc (loc, sizetype, addr_base); /* Test for a negative stride, iterating over every element. 
*/ - else if (integer_zerop (size_binop (PLUS_EXPR, - TYPE_SIZE_UNIT (TREE_TYPE (op0)), - fold_convert (sizetype, DR_STEP (dr))))) + if (integer_zerop (size_binop (PLUS_EXPR, + TYPE_SIZE_UNIT (TREE_TYPE (op0)), + fold_convert (sizetype, DR_STEP (dr))))) { - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); - - addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); - addr_base = fold_convert_loc (loc, sizetype, addr_base); addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base, fold_convert_loc (loc, sizetype, nb_bytes)); addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base, TYPE_SIZE_UNIT (TREE_TYPE (op0))); - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, - TREE_TYPE (DR_BASE_ADDRESS (dr)), - DR_BASE_ADDRESS (dr), addr_base); } - else - goto end; + addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, + TREE_TYPE (DR_BASE_ADDRESS (dr)), + DR_BASE_ADDRESS (dr), addr_base); mem = force_gimple_operand (addr_base, &stmts, true, NULL); gimple_seq_add_seq (&stmt_list, stmts); @@ -311,14 +293,11 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes); gimple_seq_add_stmt (&stmt_list, fn_call); gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING); - res = true; if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "generated memset zero\n"); - end: free_data_ref (dr); - return res; } /* Tries to generate a builtin function for the instructions of LOOP @@ -332,7 +311,6 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) unsigned i, x = 0; basic_block *bbs; gimple write = NULL; - tree op0, op1; gimple_stmt_iterator bsi; tree nb_iter = number_of_exit_cond_executions (loop); @@ -368,26 +346,17 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) } } - if (!write) - goto end; - - op0 = gimple_assign_lhs (write); - op1 = gimple_assign_rhs1 (write); - - if (!(TREE_CODE (op0) == ARRAY_REF - || TREE_CODE (op0) == 
INDIRECT_REF)) + if (!stmt_with_adjacent_zero_store_dr_p (write)) goto end; /* The new statements will be placed before LOOP. */ bsi = gsi_last_bb (loop_preheader_edge (loop)->src); - - if (gimple_assign_rhs_code (write) == INTEGER_CST - && (integer_zerop (op1) || real_zerop (op1))) - res = generate_memset_zero (write, op0, nb_iter, bsi); + generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi); + res = true; /* If this is the last partition for which we generate code, we have to destroy the loop. */ - if (res && !copy_p) + if (!copy_p) { unsigned nbbs = loop->num_nodes; edge exit = single_exit (loop); @@ -531,24 +500,6 @@ has_upstream_mem_writes (int u) static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap, bitmap, bool *); -/* Flag all the uses of U. */ - -static void -rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops, - bitmap processed, bool *part_has_writes) -{ - struct graph_edge *e; - - for (e = rdg->vertices[u].succ; e; e = e->succ_next) - if (!bitmap_bit_p (processed, e->dest)) - { - rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops, - processed, part_has_writes); - rdg_flag_all_uses (rdg, e->dest, partition, loops, processed, - part_has_writes); - } -} - /* Flag the uses of U stopping following the information from upstream_mem_writes. */ @@ -720,68 +671,13 @@ rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition, } } -/* Flag all the nodes of RDG containing memory accesses that could - potentially belong to arrays already accessed in the current - PARTITION. 
*/ - -static void -rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition, - bitmap loops, bitmap processed, - VEC (int, heap) **other_stores) -{ - bool foo; - unsigned i, n; - int j, k, kk; - bitmap_iterator ii; - struct graph_edge *e; - - EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii) - if (RDG_MEM_WRITE_STMT (rdg, i) - || RDG_MEM_READS_STMT (rdg, i)) - { - for (j = 0; j < rdg->n_vertices; j++) - if (!bitmap_bit_p (processed, j) - && (RDG_MEM_WRITE_STMT (rdg, j) - || RDG_MEM_READS_STMT (rdg, j)) - && rdg_has_similar_memory_accesses (rdg, i, j)) - { - /* Flag first the node J itself, and all the nodes that - are needed to compute J. */ - rdg_flag_vertex_and_dependent (rdg, j, partition, loops, - processed, &foo); - - /* When J is a read, we want to coalesce in the same - PARTITION all the nodes that are using J: this is - needed for better cache locality. */ - rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo); - - /* Remove from OTHER_STORES the vertex that we flagged. */ - if (RDG_MEM_WRITE_STMT (rdg, j)) - for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++) - if (kk == j) - { - VEC_unordered_remove (int, *other_stores, k); - break; - } - } - - /* If the node I has two uses, then keep these together in the - same PARTITION. */ - for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++); - - if (n > 1) - rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo); - } -} - /* Returns a bitmap in which all the statements needed for computing the strongly connected component C of the RDG are flagged, also including the loop exit conditions. 
*/ static bitmap build_rdg_partition_for_component (struct graph *rdg, rdgc c, - bool *part_has_writes, - VEC (int, heap) **other_stores) + bool *part_has_writes) { int i, v; bitmap partition = BITMAP_ALLOC (NULL); @@ -793,13 +689,6 @@ build_rdg_partition_for_component (struct graph *rdg, rdgc c, rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed, part_has_writes); - /* Also iterate on the array of stores not in the starting vertices, - and determine those vertices that have some memory affinity with - the current nodes in the component: these are stores to the same - arrays, i.e. we're taking care of cache locality. */ - rdg_flag_similar_memory_accesses (rdg, partition, loops, processed, - other_stores); - rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes); BITMAP_FREE (processed); @@ -863,6 +752,79 @@ rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices, BITMAP_FREE (saved_components); } +/* Returns true when it is possible to generate a builtin pattern for + the PARTITION of RDG. For the moment we detect only the memset + zero pattern. */ + +static bool +can_generate_builtin (struct graph *rdg, bitmap partition) +{ + unsigned i; + bitmap_iterator bi; + int nb_reads = 0; + int nb_writes = 0; + int stores_zero = 0; + + EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi) + if (RDG_MEM_READS_STMT (rdg, i)) + nb_reads++; + else if (RDG_MEM_WRITE_STMT (rdg, i)) + { + nb_writes++; + if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i))) + stores_zero++; + } + + return stores_zero == 1 && nb_writes == 1 && nb_reads == 0; +} + +/* Returns true when PARTITION1 and PARTITION2 have similar memory + accesses in RDG. 
*/ + +static bool +similar_memory_accesses (struct graph *rdg, bitmap partition1, + bitmap partition2) +{ + unsigned i, j; + bitmap_iterator bi, bj; + + EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi) + if (RDG_MEM_WRITE_STMT (rdg, i) + || RDG_MEM_READS_STMT (rdg, i)) + EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj) + if (RDG_MEM_WRITE_STMT (rdg, j) + || RDG_MEM_READS_STMT (rdg, j)) + if (rdg_has_similar_memory_accesses (rdg, i, j)) + return true; + + return false; +} + +/* Fuse all the partitions from PARTITIONS that contain similar memory + references, i.e., we're taking care of cache locality. This + function does not fuse those partitions that contain patterns that + can be code generated with builtins. */ + +static void +fuse_partitions_with_similar_memory_accesses (struct graph *rdg, + VEC (bitmap, heap) **partitions) +{ + int p1, p2; + bitmap partition1, partition2; + + for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++) + if (!can_generate_builtin (rdg, partition1)) + for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++) + if (p1 != p2 + && !can_generate_builtin (rdg, partition2) + && similar_memory_accesses (rdg, partition1, partition2)) + { + bitmap_ior_into (partition1, partition2); + VEC_ordered_remove (bitmap, *partitions, p2); + p2--; + } +} + /* Aggregate several components into a useful partition that is registered in the PARTITIONS vector. Partitions will be distributed in different loops. 
*/ @@ -885,8 +847,7 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, if (bitmap_bit_p (processed, v)) continue; - np = build_rdg_partition_for_component (rdg, x, &part_has_writes, - other_stores); + np = build_rdg_partition_for_component (rdg, x, &part_has_writes); bitmap_ior_into (partition, np); bitmap_ior_into (processed, np); BITMAP_FREE (np); @@ -932,6 +893,8 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, VEC_safe_push (bitmap, heap, *partitions, partition); else BITMAP_FREE (partition); + + fuse_partitions_with_similar_memory_accesses (rdg, partitions); } /* Dump to FILE the PARTITIONS. */