diff mbox

[8/8] Do simple omp lowering for no address taken var

Message ID 54678C29.40006@mentor.com
State New
Headers show

Commit Message

Tom de Vries Nov. 15, 2014, 5:23 p.m. UTC
On 15-11-14 13:14, Tom de Vries wrote:
> Hi,
>
> I'm submitting a patch series with initial support for the oacc kernels directive.
>
> The patch series uses pass_parallelize_loops to implement parallelization of
> loops in the oacc kernels region.
>
> The patch series consists of these 8 patches:
> ...
>      1  Expand oacc kernels after pass_build_ealias
>      2  Add pass_oacc_kernels
>      3  Add pass_ch_oacc_kernels to pass_oacc_kernels
>      4  Add pass_tree_loop_{init,done} to pass_oacc_kernels
>      5  Add pass_loop_im to pass_oacc_kernels
>      6  Add pass_ccp to pass_oacc_kernels
>      7  Add pass_parloops_oacc_kernels to pass_oacc_kernels
>      8  Do simple omp lowering for no address taken var
> ...

This patch lowers integer variables that do not have their address taken as 
local variable.  We use a copy at region entry and exit to copy the value in and 
out.

In the context of reduction handling in a kernels region, this allows the 
parloops reduction analysis to recognize the reduction, even after oacc lowering 
has been done in pass_lower_omp.

In more detail, without this patch, the omp_data_i load and stores are generated 
in place (in this case, in the loop):
...
                 {
                   .omp_data_iD.2201 = &.omp_data_arr.15D.2220;
                   {
                     unsigned intD.9 iD.2146;

                     iD.2146 = 0;
                     goto <D.2207>;
                     <D.2208>:
                     D.2216 = .omp_data_iD.2201->cD.2203;
                     c.9D.2176 = *D.2216;
                     D.2177 = (long unsigned intD.10) iD.2146;
                     D.2178 = D.2177 * 4;
                     D.2179 = c.9D.2176 + D.2178;
                     D.2180 = *D.2179;
                     D.2217 = .omp_data_iD.2201->sumD.2205;
                     D.2218 = *D.2217;
                     D.2217 = .omp_data_iD.2201->sumD.2205;
                     D.2219 = D.2180 + D.2218;
                     *D.2217 = D.2219;
                     iD.2146 = iD.2146 + 1;
                     <D.2207>:
                     if (iD.2146 <= 524287) goto <D.2208>; else goto <D.2209>;
                     <D.2209>:
                   }
...

With this patch, the omp_data_i load and stores for sum are generated at entry 
and exit:
...
                 {
                   .omp_data_iD.2201 = &.omp_data_arr.15D.2218;
                   D.2216 = .omp_data_iD.2201->sumD.2205;
                   sumD.2206 = *D.2216;
                   {
                     unsigned intD.9 iD.2146;

                     iD.2146 = 0;
                     goto <D.2207>;
                     <D.2208>:
                     D.2217 = .omp_data_iD.2201->cD.2203;
                     c.9D.2176 = *D.2217;
                     D.2177 = (long unsigned intD.10) iD.2146;
                     D.2178 = D.2177 * 4;
                     D.2179 = c.9D.2176 + D.2178;
                     D.2180 = *D.2179;
                     sumD.2206 = D.2180 + sumD.2206;
                     iD.2146 = iD.2146 + 1;
                     <D.2207>:
                     if (iD.2146 <= 524287) goto <D.2208>; else goto <D.2209>;
                     <D.2209>:
                   }
                   *D.2216 = sumD.2206;
                   #pragma omp return
                 }
...


So, without the patch the reduction operation looks like this:
...
     *(.omp_data_iD.2201->sumD.2205) = *(.omp_data_iD.2201->sumD.2205) + x
...

And with this patch the reduction operation is simply:
...
     sumD.2206 = sumD.2206 + x:
...

OK for trunk?

Thanks,
- Tom

Comments

Richard Biener Nov. 17, 2014, 10:13 a.m. UTC | #1
On Sat, 15 Nov 2014, Tom de Vries wrote:

> On 15-11-14 13:14, Tom de Vries wrote:
> > Hi,
> > 
> > I'm submitting a patch series with initial support for the oacc kernels
> > directive.
> > 
> > The patch series uses pass_parallelize_loops to implement parallelization of
> > loops in the oacc kernels region.
> > 
> > The patch series consists of these 8 patches:
> > ...
> >      1  Expand oacc kernels after pass_build_ealias
> >      2  Add pass_oacc_kernels
> >      3  Add pass_ch_oacc_kernels to pass_oacc_kernels
> >      4  Add pass_tree_loop_{init,done} to pass_oacc_kernels
> >      5  Add pass_loop_im to pass_oacc_kernels
> >      6  Add pass_ccp to pass_oacc_kernels
> >      7  Add pass_parloops_oacc_kernels to pass_oacc_kernels
> >      8  Do simple omp lowering for no address taken var
> > ...
> 
> This patch lowers integer variables that do not have their address taken as
> local variable.  We use a copy at region entry and exit to copy the value in
> and out.
> 
> In the context of reduction handling in a kernels region, this allows the
> parloops reduction analysis to recognize the reduction, even after oacc
> lowering has been done in pass_lower_omp.
> 
> In more detail, without this patch, the omp_data_i load and stores are
> generated in place (in this case, in the loop):
> ...
>                 {
>                   .omp_data_iD.2201 = &.omp_data_arr.15D.2220;
>                   {
>                     unsigned intD.9 iD.2146;
> 
>                     iD.2146 = 0;
>                     goto <D.2207>;
>                     <D.2208>:
>                     D.2216 = .omp_data_iD.2201->cD.2203;
>                     c.9D.2176 = *D.2216;
>                     D.2177 = (long unsigned intD.10) iD.2146;
>                     D.2178 = D.2177 * 4;
>                     D.2179 = c.9D.2176 + D.2178;
>                     D.2180 = *D.2179;
>                     D.2217 = .omp_data_iD.2201->sumD.2205;
>                     D.2218 = *D.2217;
>                     D.2217 = .omp_data_iD.2201->sumD.2205;
>                     D.2219 = D.2180 + D.2218;
>                     *D.2217 = D.2219;
>                     iD.2146 = iD.2146 + 1;
>                     <D.2207>:
>                     if (iD.2146 <= 524287) goto <D.2208>; else goto <D.2209>;
>                     <D.2209>:
>                   }
> ...
> 
> With this patch, the omp_data_i load and stores for sum are generated at entry
> and exit:
> ...
>                 {
>                   .omp_data_iD.2201 = &.omp_data_arr.15D.2218;
>                   D.2216 = .omp_data_iD.2201->sumD.2205;
>                   sumD.2206 = *D.2216;
>                   {
>                     unsigned intD.9 iD.2146;
> 
>                     iD.2146 = 0;
>                     goto <D.2207>;
>                     <D.2208>:
>                     D.2217 = .omp_data_iD.2201->cD.2203;
>                     c.9D.2176 = *D.2217;
>                     D.2177 = (long unsigned intD.10) iD.2146;
>                     D.2178 = D.2177 * 4;
>                     D.2179 = c.9D.2176 + D.2178;
>                     D.2180 = *D.2179;
>                     sumD.2206 = D.2180 + sumD.2206;
>                     iD.2146 = iD.2146 + 1;
>                     <D.2207>:
>                     if (iD.2146 <= 524287) goto <D.2208>; else goto <D.2209>;
>                     <D.2209>:
>                   }
>                   *D.2216 = sumD.2206;
>                   #pragma omp return
>                 }
> ...
> 
> 
> So, without the patch the reduction operation looks like this:
> ...
>     *(.omp_data_iD.2201->sumD.2205) = *(.omp_data_iD.2201->sumD.2205) + x
> ...
> 
> And with this patch the reduction operation is simply:
> ...
>     sumD.2206 = sumD.2206 + x:
> ...
> 
> OK for trunk?

I presume the reason you are trying to do that here is that otherwise
it happens too late?  What you do is what loop store motion would
do.

Now - I can see how that is easily confused by the static chain
being address-taken.  But I also remember that Eric did some
preparatory work to fix that, for nested functions, that is,
possibly setting DECL_NONADDRESSABLE_P?  Don't remember exactly.

That said - the gimple_seq_ior_addresses_taken_op callback looks
completely broken.  Consider &a.x which you'd fail to mark as
address-taken.  It looks like the body is not yet in CFG form
when you apply all this?

That said - the functions do not belong to gimple.[ch] at least
as they are not going to work in general.  I also question
why they are necessary - you do

+           if (gimple_code (stmt) == GIMPLE_OACC_KERNELS
+               && !bitmap_bit_p (addresses_taken, DECL_UID (var))
+               && INTEGRAL_TYPE_P (TREE_TYPE (var)))

but why don't you simply check TREE_ADDRESSABLE (var)?  TREE_ADDRESSABLE
is conservative correct here.

And the above won't help for float reductions.  So if, then you
should probably test is_gimple_reg_type (TREE_TYPE (var)) instead
of INTEGRAL_TYPE_P and you definitely should limit the number of
vars treated this way.

Oh - and the optimization should be somewhere more general - after
all it applies to all nested functions (thus move it to tree-nested.c?)
and to autopar loops as well.  Not sure how much code the omp
lowering shares with unnesting - but hopefully enough.

Richard.
Eric Botcazou Nov. 18, 2014, 9:03 a.m. UTC | #2
> Now - I can see how that is easily confused by the static chain
> being address-taken.  But I also remember that Eric did some
> preparatory work to fix that, for nested functions, that is,
> possibly setting DECL_NONADDRESSABLE_P?  Don't remember exactly.

The preparatory work is DECL_NONLOCAL_FRAME.  The complete patch which does 
something along these lines is attached to PR tree-optimization/54779 (latest 
version, for a 4.9-based compiler).
Richard Biener Nov. 18, 2014, 9:36 a.m. UTC | #3
On Tue, 18 Nov 2014, Eric Botcazou wrote:

> > Now - I can see how that is easily confused by the static chain
> > being address-taken.  But I also remember that Eric did some
> > preparatory work to fix that, for nested functions, that is,
> > possibly setting DECL_NONADDRESSABLE_P?  Don't remember exactly.
> 
> The preparatory work is DECL_NONLOCAL_FRAME.  The complete patch which does 
> something along these lines is attached to PR tree-optimization/54779 (latest 
> version, for a 4.9-based compiler).

Ah, now I remember - this was to be able to optimize away the frame
variable in case the nested function was inlined.

Toms case is somewhat different as I undestand as somehow LIM store
motion doesn't handle indirect frame accesses well enough(?)  So
he intends to load register vars in the frame into registers at the
beginning of the nested function and restore them to the frame on
function exit (this will probably break for recursive calls, but
OMP offloading might be special enough that this is a non-issue there).

So marking the frame decl won't help him here (I thought we might
mark the FIELD_DECLs corresponding to individual vars).  OTOH inside
the nested function accesses to the static chain should be easy to
identify.

Richard.
diff mbox

Patch

2014-11-03  Tom de Vries  <tom@codesourcery.com>

	* gimple.c (gimple_seq_ior_addresses_taken_op)
	(gimple_seq_ior_addresses_taken): New function.
	* gimple.h (gimple_seq_ior_addresses_taken): Declare.
	* omp-low.c (addresses_taken): Declare local variable.
	(lower_oacc_offload): Lower variables that do not have their address
	taken as local variable.  Use a copy at region entry and exit to copy
	the value in and out.
	(execute_lower_omp): Calculate addresses_taken.
---
 gcc/gimple.c  | 35 +++++++++++++++++++++++++++++++++++
 gcc/gimple.h  |  1 +
 gcc/omp-low.c | 25 ++++++++++++++++++++++---
 3 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/gcc/gimple.c b/gcc/gimple.c
index a9174e6..107eb26 100644
--- a/gcc/gimple.c
+++ b/gcc/gimple.c
@@ -2428,6 +2428,41 @@  gimple_ior_addresses_taken (bitmap addresses_taken, gimple stmt)
 					gimple_ior_addresses_taken_1);
 }
 
+/* Helper function for gimple_seq_ior_addresses_taken.  */
+
+static tree
+gimple_seq_ior_addresses_taken_op (tree *tp,
+				   int *walk_subtrees ATTRIBUTE_UNUSED,
+				   void *data)
+{
+  struct walk_stmt_info *wi = (struct walk_stmt_info *)data;
+  bitmap addresses_taken = (bitmap)wi->info;
+
+  tree t = *tp;
+  if (TREE_CODE (t) != ADDR_EXPR)
+    return NULL_TREE;
+
+  tree var = TREE_OPERAND (t, 0);
+  if (!DECL_P (var))
+    return NULL_TREE;
+
+  bitmap_set_bit (addresses_taken, DECL_UID (var));
+
+  return NULL_TREE;
+}
+
+/* Find the decls in SEQ that have their address taken, and set the
+   corresponding decl_uid in ADDRESSES_TAKEN.  */
+
+void
+gimple_seq_ior_addresses_taken (gimple_seq seq, bitmap addresses_taken)
+{
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  wi.info = addresses_taken;
+
+  walk_gimple_seq (seq, NULL, gimple_seq_ior_addresses_taken_op, &wi);
+}
 
 /* Return true if TYPE1 and TYPE2 are compatible enough for builtin
    processing.  */
diff --git a/gcc/gimple.h b/gcc/gimple.h
index 4faeaaa..528a9df 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -1316,6 +1316,7 @@  extern tree gimple_unsigned_type (tree);
 extern tree gimple_signed_type (tree);
 extern alias_set_type gimple_get_alias_set (tree);
 extern bool gimple_ior_addresses_taken (bitmap, gimple);
+extern void gimple_seq_ior_addresses_taken (gimple_seq, bitmap);
 extern bool gimple_builtin_call_types_compatible_p (const_gimple, tree);
 extern bool gimple_call_builtin_p (const_gimple);
 extern bool gimple_call_builtin_p (const_gimple, enum built_in_class);
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index e35fa8b..ff78b04 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -229,6 +229,7 @@  static int target_nesting_level;
 static struct omp_region *root_omp_region;
 static bitmap task_shared_vars;
 static vec<omp_context *> taskreg_contexts;
+static bitmap addresses_taken;
 
 static void scan_omp (gimple_seq *, omp_context *);
 static tree scan_omp_1_op (tree *, int *, void *);
@@ -11307,7 +11308,7 @@  lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   tree child_fn, t, c;
   gimple stmt = gsi_stmt (*gsi_p);
   gimple tgt_bind, bind;
-  gimple_seq tgt_body, olist, ilist, orlist, irlist, new_body;
+  gimple_seq tgt_body, olist, ilist, orlist, irlist, olist2, ilist2, new_body;
   location_t loc = gimple_location (stmt);
   bool offloaded, data_region;
   unsigned int map_cnt = 0;
@@ -11368,6 +11369,8 @@  lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 
   irlist = NULL;
   orlist = NULL;
+  ilist2 = NULL;
+  olist2 = NULL;
   switch (gimple_code (stmt))
     {
     case GIMPLE_OACC_KERNELS:
@@ -11451,8 +11454,18 @@  lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 		&& !OMP_CLAUSE_MAP_ZERO_BIAS_ARRAY_SECTION (c)
 		&& TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE)
 	      x = build_simple_mem_ref (x);
-	    SET_DECL_VALUE_EXPR (new_var, x);
-	    DECL_HAS_VALUE_EXPR_P (new_var) = 1;
+	    if (gimple_code (stmt) == GIMPLE_OACC_KERNELS
+		&& !bitmap_bit_p (addresses_taken, DECL_UID (var))
+		&& INTEGRAL_TYPE_P (TREE_TYPE (var)))
+	      {
+		gimplify_assign (new_var, x, &ilist2);
+		gimplify_assign (unshare_expr (x), new_var, &olist2);
+	      }
+	    else
+	      {
+		SET_DECL_VALUE_EXPR (new_var, x);
+		DECL_HAS_VALUE_EXPR_P (new_var) = 1;
+	      }
 	  }
 	map_cnt++;
       }
@@ -11719,7 +11732,9 @@  lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 
   if (offloaded)
     {
+      gimple_seq_add_seq (&new_body, ilist2);
       gimple_seq_add_seq (&new_body, tgt_body);
+      gimple_seq_add_seq (&new_body, olist2);
       new_body = maybe_catch_exception (new_body);
     }
   else if (data_region)
@@ -12054,6 +12069,9 @@  execute_lower_omp (void)
       && flag_cilkplus == 0)
     return 0;
 
+  addresses_taken = BITMAP_ALLOC (NULL);
+  gimple_seq_ior_addresses_taken (gimple_body (cfun->decl), addresses_taken);
+
   all_contexts = splay_tree_new (splay_tree_compare_pointers, 0,
 				 delete_omp_context);
 
@@ -12079,6 +12097,7 @@  execute_lower_omp (void)
       all_contexts = NULL;
     }
   BITMAP_FREE (task_shared_vars);
+  BITMAP_FREE (addresses_taken);
   return 0;
 }
 
-- 
1.9.1