@@ -50,6 +50,7 @@
#include "varasm.h"
#include "intl.h"
#include "rtl-iter.h"
+#include "gimple.h"
/* This file should be included last. */
#include "target-def.h"
@@ -4898,9 +4899,14 @@ gcn_oacc_dim_pos (int dim)
 /* Implement TARGET_GOACC_FORK_JOIN. */
 static bool
-gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
-	       bool ARG_UNUSED (is_fork))
+gcn_fork_join (gcall *call, const int *dims, bool is_fork)
{
+ tree arg = gimple_call_arg (call, 2);
+ unsigned axis = TREE_INT_CST_LOW (arg);
+
+ if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
+ return true;
+
return false;
}
@@ -768,16 +768,19 @@ static void
find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
hash_set<tree> *partitioned_var_uses,
hash_set<tree> *gangprivate_vars,
+ bitmap writes_gangprivate,
vec<propagation_set *> *prop_set)
{
unsigned mask = outer_mask | par->mask;
if (par->inner)
find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
- gangprivate_vars, prop_set);
+ gangprivate_vars, writes_gangprivate,
+ prop_set);
if (par->next)
find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
- gangprivate_vars, prop_set);
+ gangprivate_vars, writes_gangprivate,
+ prop_set);
if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
{
@@ -798,8 +801,7 @@ find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
if (!VAR_P (var)
|| is_global_var (var)
|| AGGREGATE_TYPE_P (TREE_TYPE (var))
- || !partitioned_var_uses->contains (var)
- || gangprivate_vars->contains (var))
+ || !partitioned_var_uses->contains (var))
continue;
if (stmt_may_clobber_ref_p (stmt, var))
@@ -813,6 +815,14 @@ find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
fprintf (dump_file, "\n");
}
+ if (gangprivate_vars->contains (var))
+ {
+ /* If we write a gang-private variable, we want a
+ barrier at the end of the block. */
+ bitmap_set_bit (writes_gangprivate, block->index);
+ continue;
+ }
+
if (!(*prop_set)[block->index])
(*prop_set)[block->index] = new propagation_set;
@@ -924,14 +934,6 @@ worker_single_simple (basic_block from, basic_block to,
}
}
}
-
- gsi = gsi_start_bb (skip_block);
-
- decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
- gimple *acc_bar = gimple_build_call (decl, 0);
-
- gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
- update_stmt (acc_bar);
}
/* This is a copied and renamed omp-low.c:omp_build_component_ref. */
@@ -1009,7 +1011,7 @@ worker_single_copy (basic_block from, basic_block to,
hash_set<tree> *def_escapes_block,
hash_set<tree> *worker_partitioned_uses,
tree record_type, unsigned HOST_WIDE_INT placement,
- bool isolate_broadcasts)
+ bool isolate_broadcasts, bool has_gangprivate_write)
{
/* If we only have virtual defs, we'll have no record type, but we still want
to emit single_copy_start and (particularly) single_copy_end to act as
@@ -1090,14 +1092,19 @@ worker_single_copy (basic_block from, basic_block to,
edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
ef->probability = et->probability.invert ();
- decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
- gimple *acc_bar = gimple_build_call (decl, 0);
-
gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
- gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
-
cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
- gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
+
+ if (record_type != char_type_node || has_gangprivate_write)
+ {
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+ gimple *acc_bar = gimple_build_call (decl, 0);
+
+ gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
+ gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
+ }
+ else
+ gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);
edge et2 = split_block (barrier_block, cond);
et2->flags &= ~EDGE_FALLTHRU;
@@ -1259,7 +1266,8 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
bitmap worker_single, bitmap vector_single,
vec<propagation_set *> *prop_set,
hash_set<tree> *partitioned_var_uses,
- blk_offset_map_t *blk_offset_map)
+ blk_offset_map_t *blk_offset_map,
+ bitmap writes_gangprivate)
{
unsigned mask = outer_mask | par->mask;
@@ -1345,19 +1353,69 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
(*prop_set)[block->index] = 0;
}
- tree record_type = (tree) block->aux;
+ bool only_marker_fns = true;
+ bool join_block = false;
+
+ for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ if (gimple_code (stmt) == GIMPLE_CALL
+ && gimple_call_internal_p (stmt, IFN_UNIQUE))
+ {
+ enum ifn_unique_kind k = ((enum ifn_unique_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
+ if (k != IFN_UNIQUE_OACC_PRIVATE
+ && k != IFN_UNIQUE_OACC_JOIN
+ && k != IFN_UNIQUE_OACC_FORK
+ && k != IFN_UNIQUE_OACC_HEAD_MARK
+ && k != IFN_UNIQUE_OACC_TAIL_MARK)
+ only_marker_fns = false;
+ else if (k == IFN_UNIQUE_OACC_JOIN)
+ /* The JOIN marker is special in that it *cannot* be
+ predicated for worker zero, because it may be lowered
+ to a barrier instruction and all workers must typically
+ execute that barrier. We shouldn't be doing any
+ broadcasts from the join block anyway. */
+ join_block = true;
+ }
+ else if (gimple_code (stmt) == GIMPLE_CALL
+ && gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
+ /* Empty. */;
+ else if (gimple_nop_p (stmt))
+ /* Empty. */;
+ else
+ only_marker_fns = false;
+ }
+
+ /* We can skip predicating this block for worker zero if the only
+ thing it contains is marker functions that will be removed in the
+ oaccdevlow pass anyway.
+ Don't do this if the block has (any) phi nodes, because those
+ might define SSA names that need broadcasting.
+ TODO: We might be able to skip transforming blocks that only
+ contain some other trivial statements too. */
+ if (only_marker_fns && !phi_nodes (block))
+ continue;
+
+ gcc_assert (!join_block);
if (has_defs)
{
+ tree record_type = (tree) block->aux;
auto off_rngalloc = blk_offset_map->get (block);
gcc_assert (!record_type || off_rngalloc);
unsigned HOST_WIDE_INT offset
= off_rngalloc ? off_rngalloc->first : 0;
bool range_allocated
= off_rngalloc ? off_rngalloc->second : true;
+ bool has_gangprivate_write
+ = bitmap_bit_p (writes_gangprivate, block->index);
worker_single_copy (block, block, &def_escapes_block,
&worker_partitioned_uses, record_type,
- offset, !range_allocated);
+ offset, !range_allocated,
+ has_gangprivate_write);
}
else
worker_single_simple (block, block, &def_escapes_block);
@@ -1394,10 +1452,12 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
if (par->inner)
neuter_worker_single (par->inner, mask, worker_single, vector_single,
- prop_set, partitioned_var_uses, blk_offset_map);
+ prop_set, partitioned_var_uses, blk_offset_map,
+ writes_gangprivate);
if (par->next)
neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
- prop_set, partitioned_var_uses, blk_offset_map);
+ prop_set, partitioned_var_uses, blk_offset_map,
+ writes_gangprivate);
}
@@ -1595,11 +1655,13 @@ oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
hash_set<tree> partitioned_var_uses;
hash_set<tree> gangprivate_vars;
+ auto_bitmap writes_gangprivate;
find_gangprivate_vars (&gangprivate_vars);
find_partitioned_var_uses (par, mask, &partitioned_var_uses);
find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
- &gangprivate_vars, &prop_set);
+ &gangprivate_vars, writes_gangprivate,
+ &prop_set);
FOR_ALL_BB_FN (bb, cfun)
{
@@ -1747,7 +1809,8 @@ oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
sbitmap_vector_free (reachable);
neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
- &partitioned_var_uses, &blk_offset_map);
+ &partitioned_var_uses, &blk_offset_map,
+ writes_gangprivate);
prop_set.release ();