@@ -899,7 +899,8 @@ RTL_H = $(RTL_BASE_H) $(FLAGS_H) genrtl.h
READ_MD_H = $(OBSTACK_H) $(HASHTAB_H) read-md.h
PARAMS_H = params.h params-enum.h params.def
BUILTINS_DEF = builtins.def sync-builtins.def omp-builtins.def \
- gtm-builtins.def sanitizer.def cilkplus.def cilk-builtins.def
+ gtm-builtins.def sanitizer.def cilkplus.def cilk-builtins.def \
+ hsa-builtins.def
INTERNAL_FN_DEF = internal-fn.def
INTERNAL_FN_H = internal-fn.h $(INTERNAL_FN_DEF)
TREE_CORE_H = tree-core.h coretypes.h all-tree.def tree.def \
@@ -188,6 +188,16 @@ along with GCC; see the file COPYING3. If not see
|| flag_cilkplus \
|| flag_offload_abi != OFFLOAD_ABI_UNSET))
+#undef DEF_HSA_BUILTIN
+#ifdef ENABLE_HSA
+#define DEF_HSA_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
+ DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE, \
+ false, false, true, ATTRS, false, \
+ (!flag_disable_hsa))
+#else
+#define DEF_HSA_BUILTIN(ENUM, NAME, TYPE, ATTRS)
+#endif
+
/* Builtin used by implementation of Cilk Plus. Most of these are decomposed
by the compiler but a few are implemented in libcilkrts. */
#undef DEF_CILK_BUILTIN_STUB
@@ -932,6 +942,9 @@ DEF_GCC_BUILTIN (BUILT_IN_LINE, "LINE", BT_FN_INT, ATTR_NOTHROW_LEAF_LIST)
/* Offloading and Multi Processing builtins. */
#include "omp-builtins.def"
+/* Heterogeneous Systems Architecture. */
+#include "hsa-builtins.def"
+
/* Cilk keywords builtins. */
#include "cilk-builtins.def"
@@ -1234,6 +1234,17 @@ gfc_init_builtin_functions (void)
#undef DEF_GOMP_BUILTIN
}
+#ifdef ENABLE_HSA
+ if (!flag_disable_hsa)
+ {
+#undef DEF_HSA_BUILTIN
+#define DEF_HSA_BUILTIN(code, name, type, attr) \
+ gfc_define_builtin ("__builtin_" name, builtin_types[type], \
+ code, name, attr);
+#include "../hsa-builtins.def"
+ }
+#endif
+
gfc_define_builtin ("__builtin_trap", builtin_types[BT_FN_VOID],
BUILT_IN_TRAP, NULL, ATTR_NOTHROW_LEAF_LIST);
TREE_THIS_VOLATILE (builtin_decl_explicit (BUILT_IN_TRAP)) = 1;
new file mode 100644
@@ -0,0 +1,31 @@
+/* This file contains the definitions and documentation for the
+ Offloading and Multi Processing builtins used in the GNU compiler.
+ Copyright (C) 2005-2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+/* Before including this file, you should define a macro:
+
+ DEF_HSA_BUILTIN (ENUM, NAME, TYPE, ATTRS)
+
+ See builtins.def for details. */
+
+/* The reason why they aren't in gcc/builtins.def is that the Fortran front end
+ doesn't source those. */
+
+DEF_HSA_BUILTIN (BUILT_IN_HSA_GET_WORKITEM_ABSID, "hsa_get_workitem_absid",
+ BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
@@ -3722,15 +3722,11 @@ hsa_insn_basic::set_output_in_type (hsa_op_reg *dest, unsigned op_index,
HBB. */
static void
-query_hsa_grid (hsa_op_reg *dest, BrigType16_t opcode, int dimension,
+query_hsa_grid (hsa_op_reg *dest, BrigType16_t opcode, hsa_op_immed *dimension,
hsa_bb *hbb)
{
- /* We're using just one-dimensional kernels, so hard-coded
- dimension X. */
- hsa_op_immed *imm
- = new hsa_op_immed (dimension, (BrigKind16_t) BRIG_TYPE_U32);
hsa_insn_basic *insn = new hsa_insn_basic (2, opcode, BRIG_TYPE_U32, NULL,
- imm);
+ dimension);
hbb->append_insn (insn);
insn->set_output_in_type (dest, 0, hbb);
}
@@ -3739,7 +3735,7 @@ query_hsa_grid (hsa_op_reg *dest, BrigType16_t opcode, int dimension,
Instructions are appended to basic block HBB. */
static void
-query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, int dimension,
+query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, hsa_op_immed *dimension,
hsa_bb *hbb)
{
tree lhs = gimple_call_lhs (dyn_cast <gcall *> (stmt));
@@ -3751,6 +3747,18 @@ query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, int dimension,
query_hsa_grid (dest, opcode, dimension, hbb);
}
+/* Generate a special HSA-related instruction for gimple STMT.
+ Instructions are appended to basic block HBB. */
+
+static void
+query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, int dimension,
+ hsa_bb *hbb)
+{
+ hsa_op_immed *bdim = new hsa_op_immed (dimension,
+ (BrigKind16_t) BRIG_TYPE_U32);
+ query_hsa_grid (stmt, opcode, bdim, hbb);
+}
+
/* Emit instructions that set hsa_num_threads according to provided VALUE.
Instructions are appended to basic block HBB. */
@@ -5506,6 +5514,14 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb)
break;
}
+ case BUILT_IN_HSA_GET_WORKITEM_ABSID:
+ {
+ hsa_op_immed *bdim = new hsa_op_immed (gimple_call_arg (stmt, 0), true);
+ if (bdim->m_type != BRIG_TYPE_U32)
+ bdim->get_in_type (BRIG_TYPE_U32, hbb);
+ query_hsa_grid (stmt, BRIG_OPCODE_WORKITEMABSID, bdim, hbb);
+ break;
+ }
case BUILT_IN_OMP_GET_THREAD_NUM:
{
query_hsa_grid (stmt, BRIG_OPCODE_WORKITEMABSID, 0, hbb);
@@ -12733,7 +12733,6 @@ grid_get_kernel_launch_attributes (gimple_stmt_iterator *gsi,
gomp_target *tgt_stmt)
{
grid_create_kernel_launch_attr_types ();
- tree u32_one = build_one_cst (uint32_type_node);
tree lattrs = create_tmp_var (grid_attr_trees->kernel_launch_attributes_type,
"__kernel_launch_attrs");
@@ -12758,10 +12757,10 @@ grid_get_kernel_launch_attributes (gimple_stmt_iterator *gsi,
tree dimref = build3 (COMPONENT_REF, uint32_type_node, lattrs,
grid_attr_trees->kernel_lattrs_dimnum_decl, NULL_TREE);
- /* At this moment we cannot gridify a loop with a collapse clause. */
- /* TODO: Adjust when we support bigger collapse. */
- gcc_assert (max_dim == 0);
- gsi_insert_before (gsi, gimple_build_assign (dimref, u32_one), GSI_SAME_STMT);
+ gcc_checking_assert (max_dim <= 2);
+ tree dimensions = build_int_cstu (uint32_type_node, max_dim + 1);
+ gsi_insert_before (gsi, gimple_build_assign (dimref, dimensions),
+ GSI_SAME_STMT);
TREE_ADDRESSABLE (lattrs) = 1;
return build_fold_addr_expr (lattrs);
}
@@ -13409,53 +13408,59 @@ expand_omp_target (struct omp_region *region)
static void
grid_expand_omp_for_loop (struct omp_region *kfor)
{
- tree t, threadid;
- tree type, itype;
gimple_stmt_iterator gsi;
- tree n1, step;
- struct omp_for_data fd;
-
gomp_for *for_stmt = as_a <gomp_for *> (last_stmt (kfor->entry));
gcc_checking_assert (gimple_omp_for_kind (for_stmt)
== GF_OMP_FOR_KIND_GRID_LOOP);
+ size_t collapse = gimple_omp_for_collapse (for_stmt);
+ struct omp_for_data_loop *loops
+ = (struct omp_for_data_loop *)
+ alloca (gimple_omp_for_collapse (for_stmt)
+ * sizeof (struct omp_for_data_loop));
+
+ struct omp_for_data fd;
+
basic_block body_bb = FALLTHRU_EDGE (kfor->entry)->dest;
- gcc_assert (gimple_omp_for_collapse (for_stmt) == 1);
gcc_assert (kfor->cont);
- extract_omp_for_data (for_stmt, &fd, NULL);
-
- itype = type = TREE_TYPE (fd.loop.v);
- if (POINTER_TYPE_P (type))
- itype = signed_type_for (type);
+ extract_omp_for_data (for_stmt, &fd, loops);
gsi = gsi_start_bb (body_bb);
- n1 = fd.loop.n1;
- step = fd.loop.step;
- n1 = force_gimple_operand_gsi (&gsi, fold_convert (type, n1),
- true, NULL_TREE, true, GSI_SAME_STMT);
- step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
- true, NULL_TREE, true, GSI_SAME_STMT);
- threadid = build_call_expr (builtin_decl_explicit
- (BUILT_IN_OMP_GET_THREAD_NUM), 0);
- threadid = fold_convert (itype, threadid);
- threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
- true, GSI_SAME_STMT);
+ for (size_t dim = 0; dim < collapse; dim++)
+ {
+ tree type, itype;
+ itype = type = TREE_TYPE (fd.loops[dim].v);
+ if (POINTER_TYPE_P (type))
+ itype = signed_type_for (type);
- tree startvar = fd.loop.v;
- t = fold_build2 (MULT_EXPR, itype, threadid, step);
- if (POINTER_TYPE_P (type))
- t = fold_build_pointer_plus (n1, t);
- else
- t = fold_build2 (PLUS_EXPR, type, t, n1);
- t = fold_convert (type, t);
- t = force_gimple_operand_gsi (&gsi, t,
- DECL_P (startvar)
- && TREE_ADDRESSABLE (startvar),
- NULL_TREE, true, GSI_SAME_STMT);
- gassign *assign_stmt = gimple_build_assign (startvar, t);
- gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+ tree n1 = fd.loops[dim].n1;
+ tree step = fd.loops[dim].step;
+ n1 = force_gimple_operand_gsi (&gsi, fold_convert (type, n1),
+ true, NULL_TREE, true, GSI_SAME_STMT);
+ step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
+ true, NULL_TREE, true, GSI_SAME_STMT);
+ tree threadid = build_call_expr (builtin_decl_explicit
+ (BUILT_IN_HSA_GET_WORKITEM_ABSID), 1,
+ build_int_cstu (unsigned_type_node, dim));
+ threadid = fold_convert (itype, threadid);
+ threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+ tree startvar = fd.loops[dim].v;
+ tree t = fold_build2 (MULT_EXPR, itype, threadid, step);
+ if (POINTER_TYPE_P (type))
+ t = fold_build_pointer_plus (n1, t);
+ else
+ t = fold_build2 (PLUS_EXPR, type, t, n1);
+ t = fold_convert (type, t);
+ t = force_gimple_operand_gsi (&gsi, t,
+ DECL_P (startvar)
+ && TREE_ADDRESSABLE (startvar),
+ NULL_TREE, true, GSI_SAME_STMT);
+ gassign *assign_stmt = gimple_build_assign (startvar, t);
+ gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+ }
/* Remove the omp for statement */
gsi = gsi_last_bb (kfor->entry);
gsi_remove (&gsi, true);
@@ -14837,7 +14842,8 @@ lower_omp_for_lastprivate (struct omp_for_data *fd, gimple_seq *body_p,
tree n2 = fd->loop.n2;
if (fd->collapse > 1
&& TREE_CODE (n2) != INTEGER_CST
- && gimple_omp_for_combined_into_p (fd->for_stmt))
+ && gimple_omp_for_combined_into_p (fd->for_stmt)
+ && gimple_omp_for_kind (fd->for_stmt) != GF_OMP_FOR_KIND_GRID_LOOP)
{
struct omp_context *taskreg_ctx = NULL;
if (gimple_code (ctx->outer->stmt) == GIMPLE_OMP_FOR)
@@ -17324,13 +17330,13 @@ grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p
"distribute construct\n ");
return false;
}
- if (dist->collapse > 1)
+ if (dist->collapse > 3)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, tloc,
"Will not turn target construct into a gridified GPGPU "
"kernel because the distribute construct contains "
- "collapse clause\n");
+ "collapse clause with parameter greater than 3\n");
return false;
}
struct omp_for_data fd;
@@ -17405,13 +17411,13 @@ grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p
"loop\n");
return false;
}
- if (gfor->collapse > 1)
+ if (gfor->collapse > 3)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, tloc,
"Will not turn target construct into a gridified GPGPU "
"kernel because the inner loop contains collapse "
- "clause\n");
+ "clause with parameter greater than 3\n");
return false;
}
@@ -1148,18 +1148,43 @@ parse_target_attributes (void **input,
struct GOMP_kernel_launch_attributes *kla;
kla = (struct GOMP_kernel_launch_attributes *) *input;
*result = kla;
- if (kla->ndim != 1)
- GOMP_PLUGIN_fatal ("HSA does not yet support number of dimensions "
- "different from one.");
- if (kla->gdims[0] == 0)
- return false;
-
- HSA_DEBUG ("GOMP_OFFLOAD_run called with grid size %u and group size %u\n",
- kla->gdims[0], kla->wdims[0]);
+ if (kla->ndim == 0 || kla->ndim > 3)
+ GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
+ HSA_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
+ unsigned i;
+ for (i = 0; i < kla->ndim; i++)
+ {
+ HSA_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
+ kla->gdims[i], kla->wdims[i]);
+ if (kla->gdims[i] == 0)
+ return false;
+ }
return true;
}
+/* Return the group size given the requested GROUP size, GRID size and number
+ of grid dimensions NDIM. */
+
+static uint32_t
+get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
+{
+ if (group == 0)
+ {
+ /* TODO: Provide a default via environment or device characteristics. */
+ if (ndim == 1)
+ group = 64;
+ else if (ndim == 2)
+ group = 8;
+ else
+ group = 4;
+ }
+
+ if (group > grid)
+ group = grid;
+ return group;
+}
+
/* Return true if the HSA runtime can run function FN_PTR. */
bool
@@ -1232,19 +1257,36 @@ GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, void **args)
+ index % agent->command_q->size;
memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
- packet->setup |= (uint16_t) 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ packet->setup
+ |= (uint16_t) kla->ndim << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->grid_size_x = kla->gdims[0];
- uint32_t wgs = kla->wdims[0];
- if (wgs == 0)
- /* TODO: Provide a default via environment. */
- wgs = 64;
- else if (wgs > kla->gdims[0])
- wgs = kla->gdims[0];
- packet->workgroup_size_x = wgs;
- packet->grid_size_y = 1;
- packet->workgroup_size_y = 1;
- packet->grid_size_z = 1;
- packet->workgroup_size_z = 1;
+ packet->workgroup_size_x = get_group_size (kla->ndim, kla->gdims[0],
+ kla->wdims[0]);
+
+ if (kla->ndim >= 2)
+ {
+ packet->grid_size_y = kla->gdims[1];
+ packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
+ kla->wdims[1]);
+ }
+ else
+ {
+ packet->grid_size_y = 1;
+ packet->workgroup_size_y = 1;
+ }
+
+ if (kla->ndim == 3)
+ {
+ packet->grid_size_z = kla->gdims[2];
+ packet->workgroup_size_z = get_group_size (kla->ndim, kla->gdims[2],
+ kla->wdims[2]);
+ }
+ else
+ {
+ packet->grid_size_z = 1;
+ packet->workgroup_size_z = 1;
+ }
+
packet->private_segment_size = kernel->private_segment_size;
packet->group_segment_size = kernel->group_segment_size;
packet->kernel_object = kernel->object;