From patchwork Fri Dec 10 19:45:31 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Sebastian Pop X-Patchwork-Id: 75129 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) by ozlabs.org (Postfix) with SMTP id 11D93B70A9 for ; Sat, 11 Dec 2010 06:45:51 +1100 (EST) Received: (qmail 10703 invoked by alias); 10 Dec 2010 19:45:49 -0000 Received: (qmail 10683 invoked by uid 22791); 10 Dec 2010 19:45:46 -0000 X-SWARE-Spam-Status: No, hits=-2.2 required=5.0 tests=AWL, BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, FREEMAIL_FROM, RCVD_IN_DNSWL_LOW, TW_TM, T_TO_NO_BRKTS_FREEMAIL X-Spam-Check-By: sourceware.org Received: from mail-iw0-f178.google.com (HELO mail-iw0-f178.google.com) (209.85.214.178) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Fri, 10 Dec 2010 19:45:38 +0000 Received: by iwn1 with SMTP id 1so6139668iwn.37 for ; Fri, 10 Dec 2010 11:45:37 -0800 (PST) Received: by 10.231.205.132 with SMTP id fq4mr859714ibb.17.1292010336976; Fri, 10 Dec 2010 11:45:36 -0800 (PST) Received: from napoca ([163.181.251.115]) by mx.google.com with ESMTPS id z4sm2903816ibg.13.2010.12.10.11.45.34 (version=TLSv1/SSLv3 cipher=RC4-MD5); Fri, 10 Dec 2010 11:45:35 -0800 (PST) Received: by napoca (sSMTP sendmail emulation); Fri, 10 Dec 2010 13:45:32 -0600 From: Sebastian Pop To: gcc-patches@gcc.gnu.org Cc: rguenther@suse.de, Sebastian Pop Subject: [PATCH] Fix PR43023: fuse_partitions_with_similar_memory_accesses. Date: Fri, 10 Dec 2010 13:45:31 -0600 Message-Id: <1292010331-20221-1-git-send-email-sebpop@gmail.com> In-Reply-To: References: X-IsSubscribed: yes Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Hi, here is the backport of the same patch for the gcc4.5 branch. Please let me know when you want to commit this patch to the branch. For now I sent this out for test on amd64-linux. Sebastian 2010-12-10 Sebastian Pop PR tree-optimization/43023 * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): Removed. (stores_zero_from_loop): Call stmt_stores_zero. (stmt_with_adjacent_zero_store_dr_p): New. * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. (stride_of_unit_type_p): New. * tree-loop-distribution.c (generate_memset_zero): Do not return a boolean. Call gcc_assert on stride_of_unit_type_p. (generate_builtin): Call stmt_stores_zero. (rdg_flag_all_uses): Removed. (rdg_flag_similar_memory_accesses): Removed. (build_rdg_partition_for_component): Removed parameter other_stores. Removed call to rdg_flag_similar_memory_accesses. (can_generate_builtin): New. (similar_memory_accesses): New. (fuse_partitions_with_similar_memory_accesses): New. (rdg_build_partitions): Call fuse_partitions_with_similar_memory_accesses. * gfortran.dg/ldist-1.f90: Adjust pattern. * gfortran.dg/ldist-pr43023.f90: New. --- gcc/ChangeLog | 22 +++ gcc/testsuite/ChangeLog | 6 + gcc/testsuite/gfortran.dg/ldist-1.f90 | 5 +- gcc/testsuite/gfortran.dg/ldist-pr43023.f90 | 31 ++++ gcc/tree-data-ref.c | 34 ++++- gcc/tree-data-ref.h | 12 ++ gcc/tree-loop-distribution.c | 223 +++++++++++---------------- 7 files changed, 201 insertions(+), 132 deletions(-) create mode 100644 gcc/testsuite/gfortran.dg/ldist-pr43023.f90 diff --git a/gcc/ChangeLog b/gcc/ChangeLog index abecb83..dfe45ed 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,25 @@ +2010-12-10 Sebastian Pop + + PR tree-optimization/43023 + * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p): + Removed. + (stores_zero_from_loop): Call stmt_stores_zero. + (stmt_with_adjacent_zero_store_dr_p): New. + * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared. + (stride_of_unit_type_p): New. + * tree-loop-distribution.c (generate_memset_zero): Do not return a + boolean. Call gcc_assert on stride_of_unit_type_p. + (generate_builtin): Call stmt_stores_zero. + (rdg_flag_all_uses): Removed. + (rdg_flag_similar_memory_accesses): Removed. + (build_rdg_partition_for_component): Removed parameter + other_stores. Removed call to rdg_flag_similar_memory_accesses. + (can_generate_builtin): New. + (similar_memory_accesses): New. + (fuse_partitions_with_similar_memory_accesses): New. + (rdg_build_partitions): Call + fuse_partitions_with_similar_memory_accesses. + 2010-12-07 Jakub Jelinek Backport from mainline diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e3a9d24..36fd59d 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2010-12-10 Sebastian Pop + + PR tree-optimization/43023 + * gfortran.dg/ldist-1.f90: Adjust pattern. + * gfortran.dg/ldist-pr43023.f90: New. + 2010-12-07 Sebastian Pop PR tree-optimization/44676 diff --git a/gcc/testsuite/gfortran.dg/ldist-1.f90 b/gcc/testsuite/gfortran.dg/ldist-1.f90 index dd1f02a..bbce2f3 100644 --- a/gcc/testsuite/gfortran.dg/ldist-1.f90 +++ b/gcc/testsuite/gfortran.dg/ldist-1.f90 @@ -29,5 +29,8 @@ Subroutine PADEC(DKS,DKDS,HVAR,WM,WG,FN,NS,AN,BN,CN,IT) return end Subroutine PADEC -! { dg-final { scan-tree-dump-times "distributed: split to 4 loops" 1 "ldist" } } +! There are 5 legal partitions in this code. Based on the data +! locality heuristic, this loop should not be split. + +! { dg-final { scan-tree-dump-not "distributed: split to" "ldist" } } ! { dg-final { cleanup-tree-dump "ldist" } } diff --git a/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 new file mode 100644 index 0000000..3e2d04c --- /dev/null +++ b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 @@ -0,0 +1,31 @@ +! { dg-do compile } +! { dg-options "-O2 -ftree-loop-distribution" } + +MODULE NFT_mod + +implicit none +integer :: Nangle +real:: Z0 +real, dimension(:,:), allocatable :: Angle +real, dimension(:), allocatable :: exth, ezth, hxth, hyth, hyphi + +CONTAINS + +SUBROUTINE NFT_Init() + +real :: th, fi +integer :: n + +do n = 1,Nangle + th = Angle(n,1) + fi = Angle(n,2) + + exth(n) = cos(fi)*cos(th) + ezth(n) = -sin(th) + hxth(n) = -sin(fi) + hyth(n) = cos(fi) + hyphi(n) = -sin(fi) +end do +END SUBROUTINE NFT_Init + +END MODULE NFT_mod diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index a89d151..dab0177 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -4594,7 +4594,7 @@ dump_rdg_vertex (FILE *file, struct graph *rdg, int i) for (e = v->succ; e; e = e->succ_next) fprintf (file, " %d", e->dest); - fprintf (file, ") \n"); + fprintf (file, ")\n"); print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); fprintf (file, ")\n"); } @@ -4991,6 +4991,38 @@ stores_from_loop (struct loop *loop, VEC (gimple, heap) **stmts) free (bbs); } +/* Returns true when the statement at STMT is of the form "A[i] = 0" + that contains a data reference on its LHS with a stride of the same + size as its unit type. */ + +bool +stmt_with_adjacent_zero_store_dr_p (gimple stmt) +{ + tree op0, op1; + bool res; + struct data_reference *dr; + + if (!stmt + || !gimple_vdef (stmt) + || !is_gimple_assign (stmt) + || !gimple_assign_single_p (stmt) + || !(op1 = gimple_assign_rhs1 (stmt)) + || !(integer_zerop (op1) || real_zerop (op1))) + return false; + + dr = XCNEW (struct data_reference); + op0 = gimple_assign_lhs (stmt); + + DR_STMT (dr) = stmt; + DR_REF (dr) = op0; + + res = dr_analyze_innermost (dr) + && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)); + + free_data_ref (dr); + return res; +} + /* For a data reference REF, return the declaration of its base address or NULL_TREE if the base is not determined. */ diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h index 678eb10..90cbca1 100644 --- a/gcc/tree-data-ref.h +++ b/gcc/tree-data-ref.h @@ -567,6 +567,18 @@ void stores_from_loop (struct loop *, VEC (gimple, heap) **); void remove_similar_memory_refs (VEC (gimple, heap) **); bool rdg_defs_used_in_other_loops_p (struct graph *, int); bool have_similar_memory_accesses (gimple, gimple); +bool stmt_with_adjacent_zero_store_dr_p (gimple); + +/* Returns true when STRIDE is equal in absolute value to the size of + the unit type of TYPE. */ + +static inline bool +stride_of_unit_type_p (tree stride, tree type) +{ + return tree_int_cst_equal (fold_unary (ABS_EXPR, TREE_TYPE (stride), + stride), + TYPE_SIZE_UNIT (type)); +} /* Determines whether RDG vertices V1 and V2 access to similar memory locations, in which case they have to be in the same partition. */ diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index a328860..2e420ea 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -251,7 +251,7 @@ build_size_arg_loc (location_t loc, tree nb_iter, tree op, /* Generate a call to memset. Return true when the operation succeeded. */ -static bool +static void generate_memset_zero (gimple stmt, tree op0, tree nb_iter, gimple_stmt_iterator bsi) { @@ -265,45 +265,27 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, DR_STMT (dr) = stmt; DR_REF (dr) = op0; - if (!dr_analyze_innermost (dr)) - goto end; + res = dr_analyze_innermost (dr); + gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0))); - /* Test for a positive stride, iterating over every element. */ - if (integer_zerop (size_binop (MINUS_EXPR, - fold_convert (sizetype, DR_STEP (dr)), - TYPE_SIZE_UNIT (TREE_TYPE (op0))))) - { - addr_base = fold_convert_loc (loc, sizetype, - size_binop_loc (loc, PLUS_EXPR, - DR_OFFSET (dr), - DR_INIT (dr))); - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, - TREE_TYPE (DR_BASE_ADDRESS (dr)), - DR_BASE_ADDRESS (dr), addr_base); - - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); - } + nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); + addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); + addr_base = fold_convert_loc (loc, sizetype, addr_base); /* Test for a negative stride, iterating over every element. */ - else if (integer_zerop (size_binop (PLUS_EXPR, - TYPE_SIZE_UNIT (TREE_TYPE (op0)), - fold_convert (sizetype, DR_STEP (dr))))) + if (integer_zerop (size_binop (PLUS_EXPR, + TYPE_SIZE_UNIT (TREE_TYPE (op0)), + fold_convert (sizetype, DR_STEP (dr))))) { - nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list); - - addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); - addr_base = fold_convert_loc (loc, sizetype, addr_base); addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base, fold_convert_loc (loc, sizetype, nb_bytes)); addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base, TYPE_SIZE_UNIT (TREE_TYPE (op0))); - addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, - TREE_TYPE (DR_BASE_ADDRESS (dr)), - DR_BASE_ADDRESS (dr), addr_base); } - else - goto end; + addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR, + TREE_TYPE (DR_BASE_ADDRESS (dr)), + DR_BASE_ADDRESS (dr), addr_base); mem = force_gimple_operand (addr_base, &stmts, true, NULL); gimple_seq_add_seq (&stmt_list, stmts); @@ -311,14 +293,11 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter, fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes); gimple_seq_add_stmt (&stmt_list, fn_call); gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING); - res = true; if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "generated memset zero\n"); - end: free_data_ref (dr); - return res; } /* Tries to generate a builtin function for the instructions of LOOP @@ -332,7 +311,6 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) unsigned i, x = 0; basic_block *bbs; gimple write = NULL; - tree op0, op1; gimple_stmt_iterator bsi; tree nb_iter = number_of_exit_cond_executions (loop); @@ -368,26 +346,17 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p) } } - if (!write) - goto end; - - op0 = gimple_assign_lhs (write); - op1 = gimple_assign_rhs1 (write); - - if (!(TREE_CODE (op0) == ARRAY_REF - || TREE_CODE (op0) == INDIRECT_REF)) + if (!stmt_with_adjacent_zero_store_dr_p (write)) goto end; /* The new statements will be placed before LOOP. */ bsi = gsi_last_bb (loop_preheader_edge (loop)->src); - - if (gimple_assign_rhs_code (write) == INTEGER_CST - && (integer_zerop (op1) || real_zerop (op1))) - res = generate_memset_zero (write, op0, nb_iter, bsi); + generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi); + res = true; /* If this is the last partition for which we generate code, we have to destroy the loop. */ - if (res && !copy_p) + if (!copy_p) { unsigned nbbs = loop->num_nodes; edge exit = single_exit (loop); @@ -531,24 +500,6 @@ has_upstream_mem_writes (int u) static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap, bitmap, bool *); -/* Flag all the uses of U. */ - -static void -rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops, - bitmap processed, bool *part_has_writes) -{ - struct graph_edge *e; - - for (e = rdg->vertices[u].succ; e; e = e->succ_next) - if (!bitmap_bit_p (processed, e->dest)) - { - rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops, - processed, part_has_writes); - rdg_flag_all_uses (rdg, e->dest, partition, loops, processed, - part_has_writes); - } -} - /* Flag the uses of U stopping following the information from upstream_mem_writes. */ @@ -720,68 +671,13 @@ rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition, } } -/* Flag all the nodes of RDG containing memory accesses that could - potentially belong to arrays already accessed in the current - PARTITION. */ - -static void -rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition, - bitmap loops, bitmap processed, - VEC (int, heap) **other_stores) -{ - bool foo; - unsigned i, n; - int j, k, kk; - bitmap_iterator ii; - struct graph_edge *e; - - EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii) - if (RDG_MEM_WRITE_STMT (rdg, i) - || RDG_MEM_READS_STMT (rdg, i)) - { - for (j = 0; j < rdg->n_vertices; j++) - if (!bitmap_bit_p (processed, j) - && (RDG_MEM_WRITE_STMT (rdg, j) - || RDG_MEM_READS_STMT (rdg, j)) - && rdg_has_similar_memory_accesses (rdg, i, j)) - { - /* Flag first the node J itself, and all the nodes that - are needed to compute J. */ - rdg_flag_vertex_and_dependent (rdg, j, partition, loops, - processed, &foo); - - /* When J is a read, we want to coalesce in the same - PARTITION all the nodes that are using J: this is - needed for better cache locality. */ - rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo); - - /* Remove from OTHER_STORES the vertex that we flagged. */ - if (RDG_MEM_WRITE_STMT (rdg, j)) - for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++) - if (kk == j) - { - VEC_unordered_remove (int, *other_stores, k); - break; - } - } - - /* If the node I has two uses, then keep these together in the - same PARTITION. */ - for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++); - - if (n > 1) - rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo); - } -} - /* Returns a bitmap in which all the statements needed for computing the strongly connected component C of the RDG are flagged, also including the loop exit conditions. */ static bitmap build_rdg_partition_for_component (struct graph *rdg, rdgc c, - bool *part_has_writes, - VEC (int, heap) **other_stores) + bool *part_has_writes) { int i, v; bitmap partition = BITMAP_ALLOC (NULL); @@ -793,13 +689,6 @@ build_rdg_partition_for_component (struct graph *rdg, rdgc c, rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed, part_has_writes); - /* Also iterate on the array of stores not in the starting vertices, - and determine those vertices that have some memory affinity with - the current nodes in the component: these are stores to the same - arrays, i.e. we're taking care of cache locality. */ - rdg_flag_similar_memory_accesses (rdg, partition, loops, processed, - other_stores); - rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes); BITMAP_FREE (processed); @@ -863,6 +752,79 @@ rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices, BITMAP_FREE (saved_components); } +/* Returns true when it is possible to generate a builtin pattern for + the PARTITION of RDG. For the moment we detect only the memset + zero pattern. */ + +static bool +can_generate_builtin (struct graph *rdg, bitmap partition) +{ + unsigned i; + bitmap_iterator bi; + int nb_reads = 0; + int nb_writes = 0; + int stores_zero = 0; + + EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi) + if (RDG_MEM_READS_STMT (rdg, i)) + nb_reads++; + else if (RDG_MEM_WRITE_STMT (rdg, i)) + { + nb_writes++; + if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i))) + stores_zero++; + } + + return stores_zero == 1 && nb_writes == 1 && nb_reads == 0; +} + +/* Returns true when PARTITION1 and PARTITION2 have similar memory + accesses in RDG. */ + +static bool +similar_memory_accesses (struct graph *rdg, bitmap partition1, + bitmap partition2) +{ + unsigned i, j; + bitmap_iterator bi, bj; + + EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi) + if (RDG_MEM_WRITE_STMT (rdg, i) + || RDG_MEM_READS_STMT (rdg, i)) + EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj) + if (RDG_MEM_WRITE_STMT (rdg, j) + || RDG_MEM_READS_STMT (rdg, j)) + if (rdg_has_similar_memory_accesses (rdg, i, j)) + return true; + + return false; +} + +/* Fuse all the partitions from PARTITIONS that contain similar memory + references, i.e., we're taking care of cache locality. This + function does not fuse those partitions that contain patterns that + can be code generated with builtins. */ + +static void +fuse_partitions_with_similar_memory_accesses (struct graph *rdg, + VEC (bitmap, heap) **partitions) +{ + int p1, p2; + bitmap partition1, partition2; + + for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++) + if (!can_generate_builtin (rdg, partition1)) + for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++) + if (p1 != p2 + && !can_generate_builtin (rdg, partition2) + && similar_memory_accesses (rdg, partition1, partition2)) + { + bitmap_ior_into (partition1, partition2); + VEC_ordered_remove (bitmap, *partitions, p2); + p2--; + } +} + /* Aggregate several components into a useful partition that is registered in the PARTITIONS vector. Partitions will be distributed in different loops. */ @@ -885,8 +847,7 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, if (bitmap_bit_p (processed, v)) continue; - np = build_rdg_partition_for_component (rdg, x, &part_has_writes, - other_stores); + np = build_rdg_partition_for_component (rdg, x, &part_has_writes); bitmap_ior_into (partition, np); bitmap_ior_into (processed, np); BITMAP_FREE (np); @@ -932,6 +893,8 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, VEC_safe_push (bitmap, heap, *partitions, partition); else BITMAP_FREE (partition); + + fuse_partitions_with_similar_memory_accesses (rdg, partitions); } /* Dump to FILE the PARTITIONS. */