@@ -38,8 +38,9 @@ extern rtx gcn_full_exec_reg ();
extern rtx gcn_gen_undef (machine_mode);
extern bool gcn_global_address_p (rtx);
extern tree gcn_goacc_create_propagation_record (tree record_type, bool sender,
- const char *name);
-extern void gcn_goacc_adjust_private_decl (tree var, int level);
+ const char *name,
+ unsigned HOST_WIDE_INT offset);
+extern tree gcn_goacc_adjust_private_decl (tree var, int level);
extern void gcn_goacc_reduction (gcall *call);
extern bool gcn_hard_regno_rename_ok (unsigned int from_reg,
unsigned int to_reg);
@@ -448,14 +448,12 @@ gcn_goacc_get_worker_red_decl (tree type, unsigned offset)
}
else
{
- char name[50];
- sprintf (name, ".oacc_reduction_%u", offset);
- tree decl = create_tmp_var_raw (var_type, name);
+ gcc_assert (offset
+ < (machfun->reduction_limit - machfun->reduction_base));
+ tree ptr_type = build_pointer_type (var_type);
+ tree addr = build_int_cst (ptr_type, machfun->reduction_base + offset);
- DECL_CONTEXT (decl) = NULL_TREE;
- TREE_STATIC (decl) = 1;
-
- varpool_node::finalize_decl (decl);
+ tree decl = build_simple_mem_ref (addr);
vec_safe_grow_cleared (machfun->reduc_decls, offset + 1);
(*machfun->reduc_decls)[offset] = decl;
@@ -674,34 +672,36 @@ gcn_goacc_reduction (gcall *call)
tree
gcn_goacc_create_propagation_record (tree record_type, bool sender,
- const char *name)
+ const char *name,
+ unsigned HOST_WIDE_INT offset)
{
- tree type = record_type;
-
- TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS;
+ tree type = build_qualified_type (record_type,
+ TYPE_QUALS_NO_ADDR_SPACE (record_type)
+ | ENCODE_QUAL_ADDR_SPACE (ADDR_SPACE_LDS));
if (!sender)
- type = build_pointer_type (type);
-
- tree decl = create_tmp_var_raw (type, name);
-
- if (sender)
{
- DECL_CONTEXT (decl) = NULL_TREE;
- TREE_STATIC (decl) = 1;
+ tree ptr_type = build_pointer_type (type);
+ return create_tmp_var_raw (ptr_type, name);
}
- if (sender)
- varpool_node::finalize_decl (decl);
+ if (record_type == char_type_node)
+ offset = 1;
+
+ gcc_assert (cfun);
- return decl;
+ machine_function *machfun = cfun->machine;
+ unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
+
+ tree ptr_type = build_pointer_type (type);
+ return build_int_cst (ptr_type, offset);
}
-void
+tree
gcn_goacc_adjust_private_decl (tree var, int level)
{
if (level != GOMP_DIM_GANG)
- return;
+ return var;
tree type = TREE_TYPE (var);
tree lds_type = build_qualified_type (type,
@@ -720,6 +720,8 @@ gcn_goacc_adjust_private_decl (tree var, int level)
if (machfun)
machfun->use_flat_addressing = true;
+
+ return var;
}
/* }}} */
@@ -72,14 +72,21 @@ int gcn_isa = 3; /* Default to GCN3. */
We want to permit full occupancy, so size accordingly. */
+/* Use this as a default, but allow it to grow if the user requests a large
+ amount of gang-private shared-memory space. */
+static int acc_lds_size = 0x600;
+
#define OMP_LDS_SIZE 0x600 /* 0x600 is 1/40 total, rounded down. */
-#define ACC_LDS_SIZE 32768 /* Half of the total should be fine. */
+#define ACC_LDS_SIZE acc_lds_size
#define OTHER_LDS_SIZE 65536 /* If in doubt, reserve all of it. */
#define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
: flag_openmp ? OMP_LDS_SIZE \
: OTHER_LDS_SIZE)
+static int gangprivate_hwm = 32;
+static hash_map<tree, int> lds_allocs;
+
/* The number of registers usable by normal non-kernel functions.
The SGPR count includes any special extra registers such as VCC. */
@@ -98,10 +105,6 @@ gcn_init_machine_status (void)
f = ggc_cleared_alloc<machine_function> ();
- /* Set up LDS allocation for broadcasting for this function. */
- f->lds_allocated = 32;
- f->lds_allocs = hash_map<tree, int>::create_ggc (64);
-
/* And LDS temporary decls for worker reductions. */
vec_alloc (f->reduc_decls, 0);
@@ -143,6 +146,24 @@ gcn_option_override (void)
/* 1MB total. */
stack_size_opt = 1048576;
}
+
+  /* Reserve 1KB (somewhat arbitrarily) of LDS space for reduction results and
+     worker broadcasts.  */
+ if (gang_local_size_opt == -1)
+ gang_local_size_opt = 512;
+ else if (gang_local_size_opt < gangprivate_hwm)
+ gang_local_size_opt = gangprivate_hwm;
+ else if (gang_local_size_opt >= acc_lds_size - 1024)
+ {
+ /* We need some space for reductions and worker broadcasting. If the
+ user requests a large amount of gang-private LDS space, we might not
+ have enough left for the former. Increase the LDS allocation in that
+ case, although this may reduce the maximum occupancy on the
+ hardware. */
+ acc_lds_size = gang_local_size_opt + 1024;
+ if (acc_lds_size > 32768)
+ acc_lds_size = 32768;
+ }
}
/* }}} */
@@ -2888,7 +2909,7 @@ gcn_expand_prologue ()
The low-part is the address of the topmost addressable byte, which is
size-1. The high-part is an offset and should be zero. */
emit_move_insn (gen_rtx_REG (SImode, M0_REG),
- gen_int_mode (LDS_SIZE-1, SImode));
+ gen_int_mode (LDS_SIZE, SImode));
emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
@@ -4920,6 +4941,28 @@ gcn_fixup_accel_lto_options (tree fndecl)
}
}
+/* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook. */
+
+static void
+gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
+ unsigned HOST_WIDE_INT *hi,
+ int ARG_UNUSED (dims[GOMP_DIM_MAX]),
+ unsigned HOST_WIDE_INT
+ ARG_UNUSED (private_size[GOMP_DIM_MAX]),
+ unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
+{
+ *lo = gang_local_size_opt + reduction_size[GOMP_DIM_WORKER];
+  /* TODO: We could use dims[] to estimate the maximum number of work
+     groups/wavefronts/etc. we will launch, and therefore tune the maximum
+     amount of LDS we should use.  For now, use a minimal amount to try to
+     maximise occupancy. */
+ *hi = acc_lds_size;
+ machine_function *machfun = cfun->machine;
+ machfun->reduction_base = gang_local_size_opt;
+ machfun->reduction_limit
+ = gang_local_size_opt + reduction_size[GOMP_DIM_WORKER];
+}
+
/* }}} */
/* {{{ ASM Output. */
@@ -5220,9 +5263,12 @@ gcn_section_type_flags (tree decl, const char *name, int reloc)
/* Helper function for gcn_asm_output_symbol_ref.
- FIXME: If we want to have propagation blocks allocated separately and
- statically like this, it would be better done via symbol refs and the
- assembler/linker. This is a temporary hack. */
+ FIXME: This function is used to lay out gang-private variables in LDS
+ on a per-CU basis.
+ There may be cases in which gang-local variables in different compilation
+ units could clobber each other. In that case we should be relying on the
+ linker to lay out gang-private LDS space, but that doesn't appear to be
+ possible at present. */
static void
gcn_print_lds_decl (FILE *f, tree var)
@@ -5230,7 +5276,7 @@ gcn_print_lds_decl (FILE *f, tree var)
int *offset;
machine_function *machfun = cfun->machine;
- if ((offset = machfun->lds_allocs->get (var)))
+ if ((offset = lds_allocs.get (var)))
fprintf (f, "%u", (unsigned) *offset);
else
{
@@ -5240,14 +5286,14 @@ gcn_print_lds_decl (FILE *f, tree var)
if (size > align && size > 4 && align < 8)
align = 8;
- machfun->lds_allocated = ((machfun->lds_allocated + align - 1)
- & ~(align - 1));
+ gangprivate_hwm = ((gangprivate_hwm + align - 1) & ~(align - 1));
- machfun->lds_allocs->put (var, machfun->lds_allocated);
- fprintf (f, "%u", machfun->lds_allocated);
- machfun->lds_allocated += size;
- if (machfun->lds_allocated > LDS_SIZE)
- error ("local data-share memory exhausted");
+ lds_allocs.put (var, gangprivate_hwm);
+ fprintf (f, "%u", gangprivate_hwm);
+ gangprivate_hwm += size;
+ if (gangprivate_hwm > gang_local_size_opt)
+ error ("gang-private data-share memory exhausted (increase with "
+ "-mgang-local-size=<number>)");
}
}
@@ -6170,6 +6216,8 @@ print_operand (FILE *file, rtx x, int code)
#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
#undef TARGET_GOACC_WORKER_PARTITIONING
#define TARGET_GOACC_WORKER_PARTITIONING true
+#undef TARGET_GOACC_SHARED_MEM_LAYOUT
+#define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
#undef TARGET_HARD_REGNO_NREGS
@@ -571,9 +571,8 @@ struct GTY(()) machine_function
HOST_WIDE_INT local_vars;
HOST_WIDE_INT callee_saves;
- unsigned lds_allocated;
- hash_map<tree, int> *lds_allocs;
-
+ unsigned HOST_WIDE_INT reduction_base;
+ unsigned HOST_WIDE_INT reduction_limit;
vec<tree, va_gc> *reduc_decls;
bool use_flat_addressing;
@@ -73,6 +73,12 @@ Target Report RejectNegative Joined UInteger Var(stack_size_opt) Init(-1)
mlocal-symbol-id=
Target RejectNegative Report JoinedOrMissing Var(local_symbol_id) Init(0)
+int gang_local_size_opt = -1
+
+mgang-local-size=
+Target Report RejectNegative Joined UInteger Var(gang_local_size_opt) Init(-1)
+Amount of local data-share (LDS) memory to reserve for gang-local variables.
+
Wopenacc-dims
Target Var(warn_openacc_dims) Warning
Warn about invalid OpenACC dimensions.
@@ -6214,7 +6214,7 @@ memories. A return value of NULL indicates that the target does not
handle this VAR_DECL, and normal RTL expanding is resumed.
@end deftypefn
-@deftypefn {Target Hook} void TARGET_GOACC_ADJUST_PRIVATE_DECL (tree @var{var}, @var{int})
+@deftypefn {Target Hook} tree TARGET_GOACC_ADJUST_PRIVATE_DECL (tree @var{var}, @var{int})
Tweak variable declaration for a private variable at the specified
parallelism level.
@end deftypefn
@@ -6223,7 +6223,7 @@ parallelism level.
Use gimple transformation for worker neutering/broadcasting.
@end deftypevr
-@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_PROPAGATION_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name})
+@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_PROPAGATION_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name}, unsigned HOST_WIDE_INT @var{offset})
Create a record used to propagate local-variable state from an active
worker to other workers. A possible implementation might adjust the type
of REC to place the new variable in shared GPU memory.
@@ -6234,6 +6234,13 @@ Define this hook to TRUE if arguments to offload regions should be
exploded, i.e. passed as true arguments rather than in an argument array.
@end deftypefn
+@deftypefn {Target Hook} void TARGET_GOACC_SHARED_MEM_LAYOUT (unsigned HOST_WIDE_INT *@var{}, unsigned HOST_WIDE_INT *@var{}, @var{int[]}, unsigned @var{HOST_WIDE_INT[]}, unsigned @var{HOST_WIDE_INT[]})
+Lay out a fixed shared-memory region on the target. The LO and HI
+arguments should be set to a range of addresses that can be used for worker
+broadcasting. The dimensions, reduction size and gang-private size
+arguments are for the current offload region.
+@end deftypefn
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
@@ -4225,6 +4225,8 @@ address; but often a machine-dependent strategy can generate better code.
@hook TARGET_GOACC_EXPLODE_ARGS
+@hook TARGET_GOACC_SHARED_MEM_LAYOUT
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
@@ -1631,7 +1631,7 @@ maybe_discard_oacc_function (tree decl)
struct addr_expr_rewrite_info
{
gimple *stmt;
- hash_set<tree> *adjusted_vars;
+ hash_map<tree, tree> *adjusted_vars;
bool avoid_pointer_conversion;
bool modified;
};
@@ -1645,19 +1645,20 @@ rewrite_addr_expr (tree *tp, int *walk_subtrees, void *data)
if (TREE_CODE (*tp) == ADDR_EXPR)
{
tree arg = TREE_OPERAND (*tp, 0);
+ tree *new_arg = info->adjusted_vars->get (arg);
- if (info->adjusted_vars->contains (arg))
+ if (new_arg)
{
if (info->avoid_pointer_conversion)
{
- *tp = build_fold_addr_expr (arg);
+ *tp = build_fold_addr_expr (*new_arg);
info->modified = true;
*walk_subtrees = 0;
}
else
{
gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
- tree repl = build_fold_addr_expr (arg);
+ tree repl = build_fold_addr_expr (*new_arg);
gimple *stmt1
= gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
tree conv = convert_to_pointer (TREE_TYPE (*tp),
@@ -1672,6 +1673,15 @@ rewrite_addr_expr (tree *tp, int *walk_subtrees, void *data)
}
}
}
+ else if (TREE_CODE (*tp) == VAR_DECL)
+ {
+ tree *new_decl = info->adjusted_vars->get (*tp);
+ if (new_decl)
+ {
+ *tp = *new_decl;
+ info->modified = true;
+ }
+ }
return NULL_TREE;
}
@@ -1705,7 +1715,8 @@ is_sync_builtin_call (gcall *call)
tree
default_goacc_create_propagation_record (tree record_type, bool sender,
- const char *name)
+ const char *name,
+ unsigned HOST_WIDE_INT ARG_UNUSED (offset))
{
tree type = record_type;
@@ -1861,8 +1872,104 @@ execute_oacc_loop_designation ()
int
execute_oacc_gimple_workers (void)
{
- oacc_do_neutering ();
- calculate_dominance_info (CDI_DOMINATORS);
+ unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
+ unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];
+
+ for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
+ {
+ reduction_size[i] = 0;
+ private_size[i] = 0;
+ }
+
+ /* Calculate shared memory size required for reduction variables and
+ gang-private memory for this offloaded function. */
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
+ {
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ if (!is_gimple_call (stmt))
+ continue;
+ gcall *call = as_a <gcall *> (stmt);
+ enum internal_fn ifn_code = gimple_call_internal_fn (call);
+ switch (ifn_code)
+ {
+ default: break;
+ case IFN_GOACC_REDUCTION:
+ if (integer_minus_onep (gimple_call_arg (call, 3)))
+ continue;
+ else
+ {
+ unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+ /* Only count reduction variables once: the choice to pick
+ the setup call is fairly arbitrary. */
+ if (code == IFN_GOACC_REDUCTION_SETUP)
+ {
+ int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+ tree var = gimple_call_arg (call, 2);
+ tree offset = gimple_call_arg (call, 5);
+ tree var_type = TREE_TYPE (var);
+ unsigned HOST_WIDE_INT limit
+ = tree_to_uhwi (offset)
+ + tree_to_uhwi (TYPE_SIZE_UNIT (var_type));
+ reduction_size[level]
+ = MAX (reduction_size[level], limit);
+ }
+ }
+ break;
+ case IFN_UNIQUE:
+ {
+ enum ifn_unique_kind kind
+ = ((enum ifn_unique_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
+
+ if (kind == IFN_UNIQUE_OACC_PRIVATE)
+ {
+ HOST_WIDE_INT level
+ = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+ if (level == -1)
+ break;
+ for (unsigned i = 3;
+ i < gimple_call_num_args (call);
+ i++)
+ {
+ tree arg = gimple_call_arg (call, i);
+ gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+ tree decl = TREE_OPERAND (arg, 0);
+ unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
+ private_size[level] = ((private_size[level] + align - 1)
+ & ~(align - 1));
+ unsigned HOST_WIDE_INT decl_size
+ = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
+ private_size[level] += decl_size;
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ int dims[GOMP_DIM_MAX];
+
+ for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
+ dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
+
+ /* Find bounds of shared-memory buffer space we can use. */
+ unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
+ if (targetm.goacc.shared_mem_layout)
+ targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
+ private_size, reduction_size);
+
+ /* Perform worker partitioning unless we know the number of workers is 1. */
+ if (dims[GOMP_DIM_WORKER] != 1)
+ {
+ oacc_do_neutering (bounds_lo, bounds_hi);
+ calculate_dominance_info (CDI_DOMINATORS);
+ }
return 0;
}
@@ -1879,7 +1986,7 @@ execute_oacc_device_lower ()
for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
- hash_set<tree> adjusted_vars;
+ hash_map<tree, tree> adjusted_vars;
/* Now lower internal loop functions to target-specific code
sequences. */
@@ -1986,9 +2093,11 @@ execute_oacc_device_lower ()
if (targetm.goacc.adjust_private_decl)
{
tree oldtype = TREE_TYPE (decl);
- targetm.goacc.adjust_private_decl (decl, level);
- if (TREE_TYPE (decl) != oldtype)
- adjusted_vars.add (decl);
+ tree newdecl
+ = targetm.goacc.adjust_private_decl (decl, level);
+ if (TREE_TYPE (newdecl) != oldtype
+ || newdecl != decl)
+ adjusted_vars.put (decl, newdecl);
}
}
remove = true;
@@ -2163,22 +2272,9 @@ public:
{}
/* opt_pass methods: */
- virtual bool gate (function *)
+ virtual bool gate (function *fun)
{
- if (!flag_openacc || !targetm.goacc.worker_partitioning)
- return false;
-
- tree attr = oacc_get_fn_attrib (current_function_decl);
-
- if (!attr)
- /* Not an offloaded function. */
- return false;
-
- int worker_dim
- = oacc_get_fn_dim_size (current_function_decl, GOMP_DIM_WORKER);
-
- /* No worker partitioning if we know the number of workers is 1. */
- return worker_dim != 1;
+ return flag_openacc && oacc_get_fn_attrib (fun->decl);
};
virtual unsigned int execute (function *)
@@ -54,6 +54,8 @@
#include "omp-offload.h"
#include "attribs.h"
#include "omp-sese.h"
+#include "targhooks.h"
+#include "diagnostic-core.h"
/* Loop structure of the function. The entire function is described as
a NULL loop. */
@@ -967,6 +969,8 @@ static tree
build_sender_ref (tree record_type, tree var, tree sender_decl)
{
field_map_t *fields = *field_map->get (record_type);
+ if (POINTER_TYPE_P (TREE_TYPE (sender_decl)))
+ sender_decl = build_simple_mem_ref (sender_decl);
tree field = *fields->get (var);
return oacc_build_component_ref (sender_decl, field);
}
@@ -1004,7 +1008,8 @@ static void
worker_single_copy (basic_block from, basic_block to,
hash_set<tree> *def_escapes_block,
hash_set<tree> *worker_partitioned_uses,
- tree record_type)
+ tree record_type, unsigned HOST_WIDE_INT placement,
+ bool isolate_broadcasts)
{
/* If we only have virtual defs, we'll have no record type, but we still want
to emit single_copy_start and (particularly) single_copy_end to act as
@@ -1015,10 +1020,10 @@ worker_single_copy (basic_block from, basic_block to,
tree sender_decl
= targetm.goacc.create_propagation_record (record_type, true,
- ".oacc_worker_o");
+ ".oacc_worker_o", placement);
tree receiver_decl
= targetm.goacc.create_propagation_record (record_type, false,
- ".oacc_worker_i");
+ ".oacc_worker_i", placement);
gimple_stmt_iterator gsi = gsi_last_bb (to);
if (EDGE_COUNT (to->succs) > 1)
@@ -1032,12 +1037,23 @@ worker_single_copy (basic_block from, basic_block to,
tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
- gimple *call = gimple_build_call (decl, 1,
- build_fold_addr_expr (sender_decl));
+ gimple *call
+ = gimple_build_call (decl, 1,
+ POINTER_TYPE_P (TREE_TYPE (sender_decl))
+ ? sender_decl : build_fold_addr_expr (sender_decl));
gimple_call_set_lhs (call, lhs);
gsi_insert_before (&start, call, GSI_NEW_STMT);
update_stmt (call);
+ /* The shared-memory range for this block overflowed. Add a barrier before
+ the GOACC_single_copy_start call. */
+ if (isolate_broadcasts)
+ {
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+ gimple *acc_bar = gimple_build_call (decl, 0);
+ gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
+ }
+
tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
gimple *conv = gimple_build_assign (conv_tmp,
@@ -1206,13 +1222,26 @@ worker_single_copy (basic_block from, basic_block to,
}
}
+ /* The shared-memory range for this block overflowed. Add a barrier at the
+ end. */
+ if (isolate_broadcasts)
+ {
+ gsi = gsi_start_bb (exit_block);
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+ gimple *acc_bar = gimple_build_call (decl, 0);
+ gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
+ }
+
/* It's possible for the ET->DEST block (the work done by the active thread)
to finish with a control-flow insn, e.g. a UNIQUE function call. Split
the block and add SENDER_SEQ in the latter part to avoid having control
flow in the middle of a BB. */
decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
- call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl));
+ call = gimple_build_call (decl, 1,
+ POINTER_TYPE_P (TREE_TYPE (sender_decl))
+ ? sender_decl
+ : build_fold_addr_expr (sender_decl));
gimple_seq_add_stmt (&sender_seq, call);
gsi = gsi_last_bb (body);
@@ -1222,11 +1251,15 @@ worker_single_copy (basic_block from, basic_block to,
gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
}
+typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
+ blk_offset_map_t;
+
static void
neuter_worker_single (parallel_g *par, unsigned outer_mask,
bitmap worker_single, bitmap vector_single,
vec<propagation_set *> *prop_set,
- hash_set<tree> *partitioned_var_uses)
+ hash_set<tree> *partitioned_var_uses,
+ blk_offset_map_t *blk_offset_map)
{
unsigned mask = outer_mask | par->mask;
@@ -1315,8 +1348,17 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
tree record_type = (tree) block->aux;
if (has_defs)
- worker_single_copy (block, block, &def_escapes_block,
- &worker_partitioned_uses, record_type);
+ {
+ auto off_rngalloc = blk_offset_map->get (block);
+ gcc_assert (!record_type || off_rngalloc);
+ unsigned HOST_WIDE_INT offset
+ = off_rngalloc ? off_rngalloc->first : 0;
+ bool range_allocated
+ = off_rngalloc ? off_rngalloc->second : true;
+ worker_single_copy (block, block, &def_escapes_block,
+ &worker_partitioned_uses, record_type,
+ offset, !range_allocated);
+ }
else
worker_single_simple (block, block, &def_escapes_block);
}
@@ -1352,15 +1394,158 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
if (par->inner)
neuter_worker_single (par->inner, mask, worker_single, vector_single,
- prop_set, partitioned_var_uses);
+ prop_set, partitioned_var_uses, blk_offset_map);
if (par->next)
neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
- prop_set, partitioned_var_uses);
+ prop_set, partitioned_var_uses, blk_offset_map);
+}
+
+
+static void
+dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
+{
+ if (bb->flags & BB_VISITED)
+ return;
+
+ bb->flags |= BB_VISITED;
+
+ if (bb->succs)
+ {
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ basic_block dest = e->dest;
+ if (dest->aux)
+ bitmap_set_bit (reachable, dest->index);
+ else
+ dfs_broadcast_reachable_1 (dest, reachable);
+ }
+ }
+}
+
+typedef std::pair<int, tree> idx_decl_pair_t;
+
+typedef auto_vec<splay_tree> used_range_vec_t;
+
+static int
+sort_size_descending (const void *a, const void *b)
+{
+ const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
+ const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
+ unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
+ unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
+ return bsize - asize;
+}
+
+class addr_range
+{
+public:
+ addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
+ : lo (addr_lo), hi (addr_hi)
+ { }
+ addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
+ { }
+ addr_range () : lo (0), hi (0)
+ { }
+
+ bool invalid () { return lo == 0 && hi == 0; }
+
+ unsigned HOST_WIDE_INT lo;
+ unsigned HOST_WIDE_INT hi;
+};
+
+static int
+splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
+{
+ addr_range *ar = (addr_range *) a;
+ addr_range *br = (addr_range *) b;
+ if (ar->lo == br->lo && ar->hi == br->hi)
+ return 0;
+ if (ar->hi <= br->lo)
+ return -1;
+ else if (ar->lo >= br->hi)
+ return 1;
+ return 0;
+}
+
+static void
+splay_tree_free_key (splay_tree_key k)
+{
+ addr_range *ar = (addr_range *) k;
+ delete ar;
}
+static addr_range
+first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
+ unsigned HOST_WIDE_INT align, addr_range *bounds)
+{
+ splay_tree_node min = splay_tree_min (s);
+ if (min)
+ {
+ splay_tree_node next;
+ while ((next = splay_tree_successor (s, min->key)))
+ {
+ unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
+ unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
+ unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
+ if (base + size <= hi)
+ return addr_range (base, base + size);
+ min = next;
+ }
+
+ unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
+ base = (base + align - 1) & ~(align - 1);
+ if (base + size <= bounds->hi)
+ return addr_range (base, base + size);
+ else
+ return addr_range ();
+ }
+ else
+ {
+ unsigned HOST_WIDE_INT lo = bounds->lo;
+ lo = (lo + align - 1) & ~(align - 1);
+ if (lo + size <= bounds->hi)
+ return addr_range (lo, lo + size);
+ else
+ return addr_range ();
+ }
+}
+
+static int
+merge_ranges_1 (splay_tree_node n, void *ptr)
+{
+ splay_tree accum = (splay_tree) ptr;
+ addr_range ar = *(addr_range *) n->key;
+
+ splay_tree_node old = splay_tree_lookup (accum, n->key);
+
+  /* We might have an overlap.  Create a new range covering the union of
+     the two overlapping ranges.  */
+ if (old)
+ {
+ addr_range *old_ar = (addr_range *) old->key;
+ ar.lo = MIN (old_ar->lo, ar.lo);
+ ar.hi = MAX (old_ar->hi, ar.hi);
+ splay_tree_remove (accum, old->key);
+ }
+
+ addr_range *new_ar = new addr_range (ar);
+
+ splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);
+
+ return 0;
+}
+
+static void
+merge_ranges (splay_tree accum, splay_tree sp)
+{
+ splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
+}
void
-oacc_do_neutering (void)
+oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
+ unsigned HOST_WIDE_INT bounds_hi)
{
bb_stmt_map_t bb_stmt_map;
auto_bitmap worker_single, vector_single;
@@ -1449,8 +1634,120 @@ oacc_do_neutering (void)
}
}
+ sbitmap *reachable
+ = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
+ last_basic_block_for_fn (cfun));
+
+ bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));
+
+ auto_vec<std::pair<int, tree> > priority;
+
+ FOR_ALL_BB_FN (bb, cfun)
+ {
+ if (bb->aux)
+ {
+ tree record_type = (tree) bb->aux;
+
+ basic_block bb2;
+ FOR_ALL_BB_FN (bb2, cfun)
+ bb2->flags &= ~BB_VISITED;
+
+ priority.safe_push (std::make_pair (bb->index, record_type));
+ dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
+ }
+ }
+
+ sbitmap *inverted
+ = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
+ last_basic_block_for_fn (cfun));
+
+ bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));
+
+ for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
+ {
+ sbitmap_iterator bi;
+ unsigned int j;
+ EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
+ bitmap_set_bit (inverted[j], i);
+ }
+
+ for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
+ bitmap_ior (reachable[i], reachable[i], inverted[i]);
+
+ sbitmap_vector_free (inverted);
+
+ used_range_vec_t used_ranges;
+
+ used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));
+
+ blk_offset_map_t blk_offset_map;
+
+ addr_range worker_shm_bounds (bounds_lo, bounds_hi);
+
+ priority.qsort (sort_size_descending);
+ for (unsigned int i = 0; i < priority.length (); i++)
+ {
+ idx_decl_pair_t p = priority[i];
+ int blkno = p.first;
+ tree record_type = p.second;
+ HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
+ HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);
+
+ splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
+ splay_tree_free_key, NULL);
+
+ if (!used_ranges[blkno])
+ used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
+ splay_tree_free_key, NULL);
+ else
+ merge_ranges (conflicts, used_ranges[blkno]);
+
+ sbitmap_iterator bi;
+ unsigned int j;
+ EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
+ if (used_ranges[j])
+ merge_ranges (conflicts, used_ranges[j]);
+
+ addr_range ar
+ = first_fit_range (conflicts, size, align, &worker_shm_bounds);
+
+ splay_tree_delete (conflicts);
+
+ if (ar.invalid ())
+ {
+ unsigned HOST_WIDE_INT base;
+ base = bounds_lo + random () % 512;
+ base = (base + align - 1) & ~(align - 1);
+ if (base + size > bounds_hi)
+ error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
+ auto base_inrng = std::make_pair (base, false);
+ blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
+ }
+ else
+ {
+ splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
+ (splay_tree_key) &ar);
+ if (old)
+ {
+ fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
+ "already mapped in block %d\n", (int) ar.lo,
+ (int) ar.hi, (int) ((addr_range *) old->key)->lo,
+ (int) ((addr_range *) old->key)->hi, blkno);
+ abort ();
+ }
+
+ addr_range *arp = new addr_range (ar);
+ splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
+ (splay_tree_value) blkno);
+ auto base_inrng = std::make_pair (ar.lo, true);
+ blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
+ }
+ }
+
+ sbitmap_vector_free (reachable);
+
neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
- &partitioned_var_uses);
+ &partitioned_var_uses, &blk_offset_map);
prop_set.release ();
@@ -27,6 +27,6 @@ typedef auto_vec<bb_pair_t> bb_pair_vec_t;
extern void omp_find_sese (auto_vec<basic_block> &blocks,
bb_pair_vec_t ®ions);
-extern void oacc_do_neutering (void);
+extern void oacc_do_neutering (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT);
#endif
@@ -1748,7 +1748,7 @@ DEFHOOK
(adjust_private_decl,
"Tweak variable declaration for a private variable at the specified\n\
parallelism level.",
-void, (tree var, int),
+tree, (tree var, int),
NULL)
DEFHOOK
@@ -1756,7 +1756,7 @@ DEFHOOK
"Create a record used to propagate local-variable state from an active\n\
worker to other workers. A possible implementation might adjust the type\n\
of REC to place the new variable in shared GPU memory.",
-tree, (tree rec, bool sender, const char *name),
+tree, (tree rec, bool sender, const char *name, unsigned HOST_WIDE_INT offset),
default_goacc_create_propagation_record)
DEFHOOK
@@ -1771,6 +1771,16 @@ DEFHOOKPOD
"Use gimple transformation for worker neutering/broadcasting.",
bool, false)
+DEFHOOK
+(shared_mem_layout,
+"Lay out a fixed shared-memory region on the target. The LO and HI\n\
+arguments should be set to a range of addresses that can be used for worker\n\
+broadcasting. The dimensions, reduction size and gang-private size\n\
+arguments are for the current offload region.",
+void, (unsigned HOST_WIDE_INT *, unsigned HOST_WIDE_INT *, int[],
+ unsigned HOST_WIDE_INT[], unsigned HOST_WIDE_INT[]),
+NULL)
+
HOOK_VECTOR_END (goacc)
/* Functions relating to vectorization. */
@@ -129,7 +129,8 @@ extern bool default_goacc_validate_dims (tree, int [], int, unsigned);
extern int default_goacc_dim_limit (int);
extern bool default_goacc_fork_join (gcall *, const int [], bool);
extern void default_goacc_reduction (gcall *);
-extern tree default_goacc_create_propagation_record (tree, bool, const char *);
+extern tree default_goacc_create_propagation_record (tree, bool, const char *,
+ unsigned HOST_WIDE_INT);
/* These are here, and not in hooks.[ch], because not all users of
hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS. */
new file mode 100644
@@ -0,0 +1,79 @@
+/* { dg-additional-options "-foffload=-mgang-local-size=64" } */
+
+#include <assert.h>
+#include <stdio.h>
+
+#define LOCAL(n) double n = input;
+#define LOCALS(n) LOCAL(n##1) LOCAL(n##2) LOCAL(n##3) LOCAL(n##4) \
+ LOCAL(n##5) LOCAL(n##6) LOCAL(n##7) LOCAL(n##8)
+#define LOCALS2(n) LOCALS(n##a) LOCALS(n##b) LOCALS(n##c) LOCALS(n##d) \
+ LOCALS(n##e) LOCALS(n##f) LOCALS(n##g) LOCALS(n##h)
+
+#define USE(n) n
+#define USES(n,OP) USE(n##1) OP USE(n##2) OP USE(n##3) OP USE (n##4) OP \
+ USE(n##5) OP USE(n##6) OP USE(n##7) OP USE (n##8)
+#define USES2(n,OP) USES(n##a,OP) OP USES(n##b,OP) OP USES(n##c,OP) OP \
+ USES(n##d,OP) OP USES(n##e,OP) OP USES(n##f,OP) OP \
+ USES(n##g,OP) OP USES(n##h,OP)
+
+int main (void)
+{
+ int ret;
+ int input = 1;
+
+ #pragma acc parallel num_gangs(1) num_workers(32) copyout(ret)
+ {
+ int w = 0;
+ LOCALS2(h);
+
+ #pragma acc loop worker reduction(+:w)
+ for (int i = 0; i < 32; i++)
+ {
+ int u = USES2(h,+);
+ w += u;
+ }
+
+ printf ("w=%d\n", w);
+ /* { dg-output "w=2048(\n|\r\n|\r)" } */
+
+ LOCALS2(i);
+
+ #pragma acc loop worker reduction(+:w)
+ for (int i = 0; i < 32; i++)
+ {
+ int u = USES2(i,+);
+ w += u;
+ }
+
+ printf ("w=%d\n", w);
+ /* { dg-output "w=4096(\n|\r\n|\r)" } */
+
+ LOCALS2(j);
+ LOCALS2(k);
+
+ #pragma acc loop worker reduction(+:w)
+ for (int i = 0; i < 32; i++)
+ {
+ int u = USES2(j,+);
+ w += u;
+ }
+
+ printf ("w=%d\n", w);
+ /* { dg-output "w=6144(\n|\r\n|\r)" } */
+
+ #pragma acc loop worker reduction(+:w)
+ for (int i = 0; i < 32; i++)
+ {
+ int u = USES2(k,+);
+ w += u;
+ }
+
+ ret = (w == 64 * 32 * 4);
+ printf ("w=%d\n", w);
+ /* { dg-output "w=8192(\n|\r\n|\r)" } */
+ }
+
+ assert (ret);
+
+ return 0;
+}