@@ -163,6 +163,7 @@ enum gf_mask {
GF_OMP_FOR_COMBINED = 1 << 4,
GF_OMP_FOR_COMBINED_INTO = 1 << 5,
GF_OMP_FOR_GRID_PHONY = 1 << 6,
+ GF_OMP_FOR_GRID_GROUP_ITER = 1 << 7,
GF_OMP_TARGET_KIND_MASK = (1 << 4) - 1,
GF_OMP_TARGET_KIND_REGION = 0,
GF_OMP_TARGET_KIND_DATA = 1,
@@ -5124,6 +5125,26 @@ gimple_omp_for_set_grid_phony (gomp_for *omp_for, bool value)
omp_for->subcode &= ~GF_OMP_FOR_GRID_PHONY;
}
+/* Return true if iterations of a grid OMP_FOR statement correspond to HSA
+ groups. */
+
+static inline bool
+gimple_omp_for_grid_group_iter (const gomp_for *omp_for)
+{
+ return (gimple_omp_subcode (omp_for) & GF_OMP_FOR_GRID_GROUP_ITER) != 0;
+}
+
+/* Set group_iter flag of OMP_FOR to VALUE. */
+
+static inline void
+gimple_omp_for_set_grid_group_iter (gomp_for *omp_for, bool value)
+{
+ if (value)
+ omp_for->subcode |= GF_OMP_FOR_GRID_GROUP_ITER;
+ else
+ omp_for->subcode &= ~GF_OMP_FOR_GRID_GROUP_ITER;
+}
+
/* Return the clauses associated with OMP_PARALLEL GS. */
static inline tree
@@ -3297,8 +3297,8 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
else if (gimple_code (ctx->stmt) == GIMPLE_OMP_TEAMS)
{
if ((gimple_code (stmt) != GIMPLE_OMP_FOR
- || (gimple_omp_for_kind (stmt)
- != GF_OMP_FOR_KIND_DISTRIBUTE))
+ || ((gimple_omp_for_kind (stmt) != GF_OMP_FOR_KIND_DISTRIBUTE)
+ && (gimple_omp_for_kind (stmt) != GF_OMP_FOR_KIND_GRID_LOOP)))
&& gimple_code (stmt) != GIMPLE_OMP_PARALLEL)
{
error_at (gimple_location (stmt),
@@ -13505,10 +13505,12 @@ expand_omp_target (struct omp_region *region)
}
/* Expand KFOR loop as a GPGPU kernel, i.e. as a body only with iteration
- variable derived from the thread number. */
+ variable derived from the thread number. INTRA_GROUP means this is an
+ expansion of a loop iterating over work-items within a separate iteration
+ over groups. */
static void
-grid_expand_omp_for_loop (struct omp_region *kfor)
+grid_expand_omp_for_loop (struct omp_region *kfor, bool intra_group)
{
gimple_stmt_iterator gsi;
gomp_for *for_stmt = as_a <gomp_for *> (last_stmt (kfor->entry));
@@ -13522,6 +13524,7 @@ grid_expand_omp_for_loop (struct omp_region *kfor)
struct omp_for_data fd;
+ remove_edge (BRANCH_EDGE (kfor->entry));
basic_block body_bb = FALLTHRU_EDGE (kfor->entry)->dest;
gcc_assert (kfor->cont);
@@ -13542,9 +13545,22 @@ grid_expand_omp_for_loop (struct omp_region *kfor)
true, NULL_TREE, true, GSI_SAME_STMT);
step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
true, NULL_TREE, true, GSI_SAME_STMT);
- tree threadid = build_call_expr (builtin_decl_explicit
- (BUILT_IN_HSA_GET_WORKITEM_ABSID), 1,
- build_int_cstu (unsigned_type_node, dim));
+ tree threadid;
+ if (gimple_omp_for_grid_group_iter (for_stmt))
+ {
+ gcc_checking_assert (!intra_group);
+ threadid = build_call_expr (builtin_decl_explicit
+ (BUILT_IN_HSA_GET_WORKGROUP_ID), 1,
+ build_int_cstu (unsigned_type_node, dim));
+ }
+ else if (intra_group)
+ threadid = build_call_expr (builtin_decl_explicit
+ (BUILT_IN_HSA_GET_WORKITEM_ID), 1,
+ build_int_cstu (unsigned_type_node, dim));
+ else
+ threadid = build_call_expr (builtin_decl_explicit
+ (BUILT_IN_HSA_GET_WORKITEM_ABSID), 1,
+ build_int_cstu (unsigned_type_node, dim));
threadid = fold_convert (itype, threadid);
threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
true, GSI_SAME_STMT);
@@ -13573,10 +13589,12 @@ grid_expand_omp_for_loop (struct omp_region *kfor)
&& gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_CONTINUE);
gsi_remove (&gsi, true);
- /* Replace the GIMPLE_OMP_RETURN with a real return. */
+ /* Replace the GIMPLE_OMP_RETURN with a barrier, if necessary. */
gsi = gsi_last_bb (kfor->exit);
gcc_assert (!gsi_end_p (gsi)
&& gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
+ if (intra_group)
+ gsi_insert_before (&gsi, build_omp_barrier (NULL_TREE), GSI_SAME_STMT);
gsi_remove (&gsi, true);
/* Fixup the much simpler CFG. */
@@ -13657,11 +13675,29 @@ grid_expand_target_grid_body (struct omp_region *target)
struct omp_region *kfor = *pp;
gcc_assert (kfor);
- gcc_assert (gimple_omp_for_kind (last_stmt ((kfor)->entry))
- == GF_OMP_FOR_KIND_GRID_LOOP);
+ gomp_for *for_stmt = as_a <gomp_for *> (last_stmt (kfor->entry));
+ gcc_assert (gimple_omp_for_kind (for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP);
*pp = kfor->next;
if (kfor->inner)
- expand_omp (kfor->inner);
+ {
+ if (gimple_omp_for_grid_group_iter (for_stmt))
+ {
+ struct omp_region **next_pp;
+ for (pp = &kfor->inner; *pp; pp = next_pp)
+ {
+ next_pp = &(*pp)->next;
+ if ((*pp)->type != GIMPLE_OMP_FOR)
+ continue;
+ gomp_for *inner = as_a <gomp_for *> (last_stmt ((*pp)->entry));
+ gcc_assert (gimple_omp_for_kind (inner)
+ == GF_OMP_FOR_KIND_GRID_LOOP);
+ grid_expand_omp_for_loop (*pp, true);
+ *pp = (*pp)->next;
+ next_pp = pp;
+ }
+ }
+ expand_omp (kfor->inner);
+ }
if (gpukernel->inner)
expand_omp (gpukernel->inner);
@@ -13690,8 +13726,7 @@ grid_expand_target_grid_body (struct omp_region *target)
struct function *kern_cfun = DECL_STRUCT_FUNCTION (kern_fndecl);
kern_cfun->curr_properties = cfun->curr_properties;
- remove_edge (BRANCH_EDGE (kfor->entry));
- grid_expand_omp_for_loop (kfor);
+ grid_expand_omp_for_loop (kfor, false);
/* Remove the omp for statement */
gimple_stmt_iterator gsi = gsi_last_bb (gpukernel->entry);
@@ -17164,60 +17199,90 @@ lower_omp (gimple_seq *body, omp_context *ctx)
input_location = saved_location;
}
-/* Returen true if STMT is an assignment of a register-type into a local
- VAR_DECL. */
+/* Structure describing the basic properties of the loop we are analyzing
+   to determine whether it can be gridified and, if so, how it is gridified.  */
+
+struct grid_prop
+{
+ /* True when we are doing tiling gridification, i.e. when there is a distinct
+ distribute loop over groups and a loop construct over work-items. False
+ when distribute and parallel for loops form a combined construct. */
+ bool tiling;
+ /* Location of the target construct for optimization information
+ messages. */
+ location_t target_loc;
+ /* The collapse clause of the involved loops. Collapse value of all of them
+ must be the same for gridification to take place. */
+ size_t collapse;
+ /* Group sizes, if requested by the user or NULL if not requested. */
+ tree group_sizes[3];
+};
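
For orientation, a minimal source-level sketch (not part of the patch; the function name and the tile size of 16 are invented) of the two shapes that the TILING flag distinguishes:

void grid_shape_sketch (int n, float *a)
{
  /* Combined shape, tiling == false: distribute and the work-sharing loop
     form one combined construct.  */
#pragma omp target teams map(tofrom:a[0:n])
#pragma omp distribute parallel for
  for (int i = 0; i < n; i++)
    a[i] += 1.0f;

  /* Tiling shape, tiling == true: a distribute loop over groups containing
     a separate parallel loop over the work-items of one group (n is assumed
     to be a multiple of 16 here).  */
#pragma omp target teams map(tofrom:a[0:n])
#pragma omp distribute
  for (int tile = 0; tile < n; tile += 16)
    {
#pragma omp parallel for
      for (int i = 0; i < 16; i++)
        a[tile + i] += 1.0f;
    }
}
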
+
+#define GRID_MISSED_MSG_PREFIX "Will not turn target construct into a " \
+ "gridified GPGPU kernel because "
+
+/* Return true if STMT is an assignment of a register-type into a local
+ VAR_DECL. If GRID is non-NULL, the assignment additionally must not be to
+ any of the trees specifying group sizes there. */
static bool
-grid_reg_assignment_to_local_var_p (gimple *stmt)
+grid_safe_assignment_p (gimple *stmt, grid_prop *grid)
{
gassign *assign = dyn_cast <gassign *> (stmt);
if (!assign)
return false;
+ if (gimple_clobber_p (assign))
+ return true;
tree lhs = gimple_assign_lhs (assign);
if (TREE_CODE (lhs) != VAR_DECL
|| !is_gimple_reg_type (TREE_TYPE (lhs))
|| is_global_var (lhs))
return false;
+ if (grid)
+ for (unsigned i = 0; i < grid->collapse; i++)
+ if (lhs == grid->group_sizes[i])
+ return false;
return true;
}
/* Return true if all statements in SEQ are assignments to local register-type
- variables. */
+ variables that do not hold group size information. */
static bool
-grid_seq_only_contains_local_assignments (gimple_seq seq)
+grid_seq_only_contains_local_assignments (gimple_seq seq, grid_prop *grid)
{
if (!seq)
return true;
gimple_stmt_iterator gsi;
for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
- if (!grid_reg_assignment_to_local_var_p (gsi_stmt (gsi)))
+ if (!grid_safe_assignment_p (gsi_stmt (gsi), grid))
return false;
return true;
}
-/* Scan statements in SEQ and call itself recursively on any bind. If during
- whole search only assignments to register-type local variables and one
- single OMP statement is encountered, return true, otherwise return false.
- RET is where we store any OMP statement encountered. TARGET_LOC and NAME
- are used for dumping a note about a failure. */
+/* Scan statements in SEQ and call itself recursively on any bind. GRID
+ describes hitherto discovered properties of the loop that is evaluated for
+ possible gridification. If during whole search only assignments to
+ register-type local variables (that do not overwrite group size information)
+ and one single OMP statement is encountered, return true, otherwise return
+ false. RET is where we store any OMP statement encountered. */
static bool
-grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc,
- const char *name, gimple **ret)
+grid_find_single_omp_among_assignments_1 (gimple_seq seq, grid_prop *grid,
+ const char *name, gimple **ret)
{
gimple_stmt_iterator gsi;
for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
{
gimple *stmt = gsi_stmt (gsi);
- if (grid_reg_assignment_to_local_var_p (stmt))
+ if (grid_safe_assignment_p (stmt, grid))
continue;
if (gbind *bind = dyn_cast <gbind *> (stmt))
{
if (!grid_find_single_omp_among_assignments_1 (gimple_bind_body (bind),
- target_loc, name, ret))
+ grid, name, ret))
return false;
}
else if (is_gimple_omp (stmt))
@@ -17225,10 +17290,18 @@ grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc,
if (*ret)
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, target_loc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because %s construct contains "
- "multiple OpenMP constructs\n", name);
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "%s construct "
+ "contains multiple OpenMP constructs\n",
+ name);
+ dump_printf_loc (MSG_NOTE, gimple_location (*ret),
+ "The first OpenMP construct within "
+ "a parallel\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (stmt),
+ "The second OpenMP construct within "
+ "a parallel\n");
+ }
return false;
}
*ret = stmt;
@@ -17236,10 +17309,14 @@ grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc,
else
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, target_loc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because %s construct contains "
- "a complex statement\n", name);
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "%s construct contains "
+ "a complex statement\n", name);
+ dump_printf_loc (MSG_NOTE, gimple_location (stmt),
+ "This statement cannot be analyzed for "
+ "gridification\n");
+ }
return false;
}
}
@@ -17247,33 +17324,32 @@ grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc,
}
/* Scan statements in SEQ and make sure that it and any binds in it contain
- only assignments to local register-type variables and one OMP construct. If
- so, return that construct, otherwise return NULL. If dumping is enabled and
- function fails, use TARGET_LOC and NAME to dump a note with the reason for
- failure. */
+ only assignments to local register-type variables (that do not overwrite
+ group size information) and one OMP construct. If so, return that
+ construct, otherwise return NULL. GRID describes hitherto discovered
+ properties of the loop that is evaluated for possible gridification. If
+ dumping is enabled and function fails, use NAME to dump a note with the
+ reason for failure. */
static gimple *
-grid_find_single_omp_among_assignments (gimple_seq seq, location_t target_loc,
+grid_find_single_omp_among_assignments (gimple_seq seq, grid_prop *grid,
const char *name)
{
if (!seq)
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, target_loc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because %s construct has empty "
- "body\n",
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "%s construct has empty body\n",
name);
return NULL;
}
gimple *ret = NULL;
- if (grid_find_single_omp_among_assignments_1 (seq, target_loc, name, &ret))
+ if (grid_find_single_omp_among_assignments_1 (seq, grid, name, &ret))
{
if (!ret && dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, target_loc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because %s construct does not contain"
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+			 GRID_MISSED_MSG_PREFIX "%s construct does not contain "
"any other OpenMP construct\n", name);
return ret;
}
@@ -17333,157 +17409,128 @@ grid_find_ungridifiable_statement (gimple_stmt_iterator *gsi,
return NULL;
}
-
-/* If TARGET follows a pattern that can be turned into a gridified GPGPU
- kernel, return true, otherwise return false. In the case of success, also
- fill in GROUP_SIZE_P with the requested group size or NULL if there is
- none. */
+/* Examine clauses of omp parallel statement PAR and if any prevents
+   gridification, issue a missed-optimization diagnostic and return false,
+   otherwise return true.  TLOC is the location of the enclosing target
+   construct and is used for optimization information messages.  */
static bool
-grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p)
+grid_parallel_clauses_gridifiable (gomp_parallel *par, location_t tloc)
{
- if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION)
- return false;
-
- location_t tloc = gimple_location (target);
- gimple *stmt
- = grid_find_single_omp_among_assignments (gimple_omp_body (target),
- tloc, "target");
- if (!stmt)
- return false;
- gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
- tree group_size = NULL;
- if (!teams)
- {
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because it does not have a sole teams "
- "construct in it.\n");
- return false;
- }
-
- tree clauses = gimple_omp_teams_clauses (teams);
+ tree clauses = gimple_omp_parallel_clauses (par);
while (clauses)
{
switch (OMP_CLAUSE_CODE (clauses))
{
- case OMP_CLAUSE_NUM_TEAMS:
+ case OMP_CLAUSE_NUM_THREADS:
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because we cannot "
- "handle num_teams clause of teams "
- "construct\n ");
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+			   GRID_MISSED_MSG_PREFIX "there is "
+ "a num_threads clause of the parallel "
+ "construct\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (par),
+ "Parallel construct has a num_threads clause\n");
+ }
return false;
case OMP_CLAUSE_REDUCTION:
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because a reduction "
- "clause is present\n ");
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+			   GRID_MISSED_MSG_PREFIX "a reduction clause "
+			   "is present\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (par),
+ "Parallel construct has a reduction clause\n");
+ }
return false;
case OMP_CLAUSE_LASTPRIVATE:
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because a lastprivate "
- "clause is present\n ");
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+ GRID_MISSED_MSG_PREFIX "a lastprivate "
+ "clause is present\n ");
+ dump_printf_loc (MSG_NOTE, gimple_location (par),
+ "Parallel construct has a lastprivate clause\n");
+ }
return false;
- case OMP_CLAUSE_THREAD_LIMIT:
- group_size = OMP_CLAUSE_OPERAND (clauses, 0);
- break;
-
default:
break;
}
clauses = OMP_CLAUSE_CHAIN (clauses);
}
+ return true;
+}
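
As a hedged illustration of the clause check above (the function name is invented), a num_threads clause on the parallel part is already enough to make the whole region fall back to the ordinary, non-gridified target expansion:

void grid_sketch_rejected_clause (int n, float *a)
{
#pragma omp target teams map(tofrom:a[0:n])
#pragma omp distribute parallel for num_threads(8)
  for (int i = 0; i < n; i++)
    a[i] += 1.0f;
}
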
- stmt = grid_find_single_omp_among_assignments (gimple_omp_body (teams), tloc,
- "teams");
- if (!stmt)
- return false;
- gomp_for *dist = dyn_cast <gomp_for *> (stmt);
- if (!dist)
- {
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because the teams construct does not have "
- "a sole distribute construct in it.\n");
- return false;
- }
+/* Examine clauses and the body of omp loop statement GFOR and if something
+   prevents gridification, issue a missed-optimization diagnostic and return
+ false, otherwise return true. GRID describes hitherto discovered properties
+ of the loop that is evaluated for possible gridification. */
- gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE);
- if (!gimple_omp_for_combined_p (dist))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified GPGPU "
- "kernel because we cannot handle a standalone "
- "distribute construct\n ");
- return false;
- }
- if (dist->collapse > 3)
+static bool
+grid_inner_loop_gridifiable_p (gomp_for *gfor, grid_prop *grid)
+{
+ if (!grid_seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor),
+ grid))
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified GPGPU "
- "kernel because the distribute construct contains "
- "collapse clause with parameter greater than 3\n");
- return false;
- }
- struct omp_for_data fd;
- extract_omp_for_data (dist, &fd, NULL);
- if (fd.chunk_size)
- {
- if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because the teams "
- "thread limit is different from distribute "
- "schedule chunk\n");
- return false;
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+			   GRID_MISSED_MSG_PREFIX "the inner loop "
+			   "bounds computation contains a complex "
+ "statement\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (gfor),
+ "Loop construct cannot be analyzed for "
+ "gridification\n");
}
- group_size = fd.chunk_size;
+ return false;
}
- stmt = grid_find_single_omp_among_assignments (gimple_omp_body (dist), tloc,
- "distribute");
- gomp_parallel *par;
- if (!stmt || !(par = dyn_cast <gomp_parallel *> (stmt)))
- return false;
- clauses = gimple_omp_parallel_clauses (par);
+ tree clauses = gimple_omp_for_clauses (gfor);
while (clauses)
{
switch (OMP_CLAUSE_CODE (clauses))
{
- case OMP_CLAUSE_NUM_THREADS:
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified"
- "GPGPU kernel because there is a num_threads "
- "clause of the parallel construct\n");
- return false;
+ case OMP_CLAUSE_SCHEDULE:
+ if (OMP_CLAUSE_SCHEDULE_KIND (clauses) != OMP_CLAUSE_SCHEDULE_AUTO)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "the inner loop "
+ "has a non-automatic schedule clause\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (gfor),
+			       "Loop construct has a non-automatic "
+ "schedule clause\n");
+ }
+ return false;
+ }
+ break;
case OMP_CLAUSE_REDUCTION:
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because a reduction "
- "clause is present\n ");
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "a reduction "
+ "clause is present\n ");
+ dump_printf_loc (MSG_NOTE, gimple_location (gfor),
+			    "Loop construct has a reduction "
+			    "clause\n");
+ }
return false;
case OMP_CLAUSE_LASTPRIVATE:
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because a lastprivate "
- "clause is present\n ");
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "a lastprivate "
+ "clause is present\n ");
+ dump_printf_loc (MSG_NOTE, gimple_location (gfor),
+			    "Loop construct has a lastprivate "
+			    "clause\n");
+ }
return false;
default:
@@ -17491,8 +17538,56 @@ grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p
}
clauses = OMP_CLAUSE_CHAIN (clauses);
}
+ struct walk_stmt_info wi;
+ memset (&wi, 0, sizeof (wi));
+ if (walk_gimple_seq (gimple_omp_body (gfor),
+ grid_find_ungridifiable_statement,
+ NULL, &wi))
+ {
+ gimple *bad = (gimple *) wi.info;
+ if (dump_enabled_p ())
+ {
+ if (is_gimple_call (bad))
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "the inner loop contains "
+			     "a call to a noreturn function\n");
+ else if (gimple_code (bad) == GIMPLE_OMP_FOR)
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "the inner loop contains "
+ "a simd construct\n");
+ else
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "the inner loop contains "
+ "statement %s which cannot be transformed\n",
+ gimple_code_name[(int) gimple_code (bad)]);
+ dump_printf_loc (MSG_NOTE, gimple_location (bad),
+ "This statement cannot be analyzed for "
+ "gridification\n");
+ }
+ return false;
+ }
+ return true;
+}
- stmt = grid_find_single_omp_among_assignments (gimple_omp_body (par), tloc,
+/* Given a distribute omp construct represented by DIST, which in the original
+ source forms a compound construct with a looping construct, return true if it
+ can be turned into a gridified GPGPU kernel. Otherwise return false. GRID
+ describes hitherto discovered properties of the loop that is evaluated for
+ possible gridification. */
+
+static bool
+grid_dist_follows_simple_pattern (gomp_for *dist, grid_prop *grid)
+{
+ location_t tloc = grid->target_loc;
+ gimple *stmt = grid_find_single_omp_among_assignments (gimple_omp_body (dist),
+ grid, "distribute");
+ gomp_parallel *par;
+ if (!stmt
+ || !(par = dyn_cast <gomp_parallel *> (stmt))
+ || !grid_parallel_clauses_gridifiable (par, tloc))
+ return false;
+
+ stmt = grid_find_single_omp_among_assignments (gimple_omp_body (par), grid,
"parallel");
gomp_for *gfor;
if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt)))
@@ -17502,101 +17597,441 @@ grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified GPGPU "
- "kernel because the inner loop is not a simple for "
- "loop\n");
+ GRID_MISSED_MSG_PREFIX "the inner loop is not "
+ "a simple for loop\n");
return false;
}
- if (gfor->collapse > 3)
+ gcc_assert (gimple_omp_for_collapse (gfor) == grid->collapse);
+
+ if (!grid_inner_loop_gridifiable_p (gfor, grid))
+ return false;
+
+ return true;
+}
+
+/* Given an omp loop statement GFOR, return true if it can participate in
+ tiling gridification, i.e. in one where the distribute and parallel for
+ loops do not form a compound statement. GRID describes hitherto discovered
+ properties of the loop that is evaluated for possible gridification. */
+
+static bool
+grid_gfor_follows_tiling_pattern (gomp_for *gfor, grid_prop *grid)
+{
+ if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified GPGPU "
- "kernel because the inner loop contains collapse "
- "clause with parameter greater than 3\n");
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "an inner loop is not "
+ "a simple for loop\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (gfor),
+ "This statement is not a simple for loop\n");
+ }
return false;
}
- if (!grid_seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor)))
+ if (!grid_inner_loop_gridifiable_p (gfor, grid))
+ return false;
+
+ if (gimple_omp_for_collapse (gfor) != grid->collapse)
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified GPGPU "
- "kernel because the inner loop pre_body contains"
- "a complex instruction\n");
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "an inner loop does not "
+			   "use the same collapse clause\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (gfor),
+ "Loop construct uses a different collapse clause\n");
+ }
return false;
}
- clauses = gimple_omp_for_clauses (gfor);
- while (clauses)
+ struct omp_for_data fd;
+ struct omp_for_data_loop *loops
+ = (struct omp_for_data_loop *)alloca (grid->collapse
+ * sizeof (struct omp_for_data_loop));
+ extract_omp_for_data (gfor, &fd, loops);
+ for (unsigned i = 0; i < grid->collapse; i++)
{
- switch (OMP_CLAUSE_CODE (clauses))
+ tree itype, type = TREE_TYPE (fd.loops[i].v);
+ if (POINTER_TYPE_P (type))
+ itype = signed_type_for (type);
+ else
+ itype = type;
+
+ tree n1 = fold_convert (itype, fd.loops[i].n1);
+ tree n2 = fold_convert (itype, fd.loops[i].n2);
+ tree t = build_int_cst (itype,
+ (fd.loops[i].cond_code == LT_EXPR ? -1 : 1));
+ t = fold_build2 (PLUS_EXPR, itype, fd.loops[i].step, t);
+ t = fold_build2 (PLUS_EXPR, itype, t, n2);
+ t = fold_build2 (MINUS_EXPR, itype, t, n1);
+ if (TYPE_UNSIGNED (itype) && fd.loops[i].cond_code == GT_EXPR)
+ t = fold_build2 (TRUNC_DIV_EXPR, itype,
+ fold_build1 (NEGATE_EXPR, itype, t),
+ fold_build1 (NEGATE_EXPR, itype, fd.loops[i].step));
+ else
+ t = fold_build2 (TRUNC_DIV_EXPR, itype, t, fd.loops[i].step);
+
+ if (!operand_equal_p (grid->group_sizes[i], t, 0))
{
- case OMP_CLAUSE_SCHEDULE:
- if (OMP_CLAUSE_SCHEDULE_KIND (clauses) != OMP_CLAUSE_SCHEDULE_AUTO)
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "the distribute and "
+ "an internal loop do not agree on tile size\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (gfor),
+ "Loop construct does not seem to loop over "
+ "a tile size\n");
+ }
+ return false;
+ }
+ }
+ return true;
+}
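
A worked sketch of the tile-size check above, assuming an integer induction variable, an LT_EXPR condition and a positive step (the helper name is hypothetical):

/* Mirrors the folded expression above: (n2 - n1 + step - 1) / step,
   i.e. the number of iterations of the inner loop.  */
static int
grid_sketch_iteration_count (int n1, int n2, int step)
{
  return (n2 - n1 + step - 1) / step;
}

/* In the tiling test cases below, an inner loop such as
     for (int col = 0; col < BLOCK_SIZE; col++)
   gives grid_sketch_iteration_count (0, BLOCK_SIZE, 1) == BLOCK_SIZE, which
   equals group_sizes[i] taken from the distribute step
   (C_col_start += BLOCK_SIZE), so the tile sizes agree.  */
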
+
+/* Facing a call to FNDECL in the body of a distribute construct, return true
+ if we can handle it or false if it precludes gridification. */
+
+static bool
+grid_call_permissible_in_distribute_p (tree fndecl)
+{
+ if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl))
+ return true;
+
+ const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
+ if (strstr (name, "omp_") != name)
+ return false;
+
+ if ((strcmp (name, "omp_get_thread_num") == 0)
+ || (strcmp (name, "omp_get_num_threads") == 0)
+ || (strcmp (name, "omp_get_num_teams") == 0)
+ || (strcmp (name, "omp_get_team_num") == 0)
+ || (strcmp (name, "omp_get_level") == 0)
+ || (strcmp (name, "omp_get_active_level") == 0)
+ || (strcmp (name, "omp_in_parallel") == 0))
+ return true;
+
+ return false;
+}
+
+/* Facing a call satisfying grid_call_permissible_in_distribute_p in the body
+ of a distribute construct that is pointed at by GSI, modify it as necessary
+ for gridification. If the statement itself got removed, return true. */
+
+static bool
+grid_handle_call_in_distribute (gimple_stmt_iterator *gsi)
+{
+ gimple *stmt = gsi_stmt (*gsi);
+ tree fndecl = gimple_call_fndecl (stmt);
+  gcc_checking_assert (fndecl);
+ if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl))
+ return false;
+
+ const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
+ if ((strcmp (name, "omp_get_thread_num") == 0)
+ || (strcmp (name, "omp_get_level") == 0)
+ || (strcmp (name, "omp_get_active_level") == 0)
+ || (strcmp (name, "omp_in_parallel") == 0))
+ {
+ tree lhs = gimple_call_lhs (stmt);
+ if (lhs)
+ {
+ gassign *assign
+ = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
+ gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+ }
+ gsi_remove (gsi, true);
+ return true;
+ }
+
+ /* The rest of the omp functions can stay as they are, HSA back-end will
+ handle them correctly. */
+ gcc_checking_assert ((strcmp (name, "omp_get_num_threads") == 0)
+ || (strcmp (name, "omp_get_num_teams") == 0)
+ || (strcmp (name, "omp_get_team_num") == 0));
+ return false;
+}
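
For illustration only (the function name is invented), this is the kind of code in a distribute body that the handler above rewrites: the first call is removed and its result replaced by constant zero, while the remaining ones are left for the HSA back end:

#include <omp.h>

void grid_sketch_distribute_calls (int *out)
{
  /* Removed; out[0] is instead assigned the constant 0.  */
  out[0] = omp_get_thread_num ();
  /* Kept as they are.  */
  out[1] = omp_get_num_teams ();
  out[2] = omp_get_team_num ();
}
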
+
+/* Given a sequence of statements within a distribute omp construct or a
+ parallel construct, which in the original source does not form a compound
+ construct with a looping construct, return true if it does not prevent us
+ from turning it into a gridified GPGPU kernel. Otherwise return false. GRID
+ describes hitherto discovered properties of the loop that is evaluated for
+   possible gridification.  IN_PARALLEL must be true if SEQ is within a
+   parallel construct and false if it is only within a distribute
+ construct. */
+
+static bool
+grid_dist_follows_tiling_pattern (gimple_seq seq, grid_prop *grid,
+ bool in_parallel)
+{
+ gimple_stmt_iterator gsi;
+ for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+
+ if (grid_safe_assignment_p (stmt, grid)
+ || gimple_code (stmt) == GIMPLE_GOTO
+ || gimple_code (stmt) == GIMPLE_LABEL
+ || gimple_code (stmt) == GIMPLE_COND)
+ continue;
+ else if (gbind *bind = dyn_cast <gbind *> (stmt))
+ {
+ if (!grid_dist_follows_tiling_pattern (gimple_bind_body (bind),
+ grid, in_parallel))
+ return false;
+ continue;
+ }
+ else if (gtry *try_stmt = dyn_cast <gtry *> (stmt))
+ {
+ if (gimple_try_kind (try_stmt) == GIMPLE_TRY_CATCH)
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because the inner "
- "loop has a non-automatic scheduling clause\n");
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "the distribute "
+ "construct contains a try..catch region\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (try_stmt),
+ "This statement cannot be analyzed for "
+ "tiled gridification\n");
+ }
return false;
}
- break;
+ if (!grid_dist_follows_tiling_pattern (gimple_try_eval (try_stmt),
+ grid, in_parallel))
+ return false;
+ if (!grid_dist_follows_tiling_pattern (gimple_try_cleanup (try_stmt),
+ grid, in_parallel))
+ return false;
+ continue;
+ }
+ else if (is_gimple_call (stmt))
+ {
+ tree fndecl = gimple_call_fndecl (stmt);
+ if (fndecl && grid_call_permissible_in_distribute_p (fndecl))
+ continue;
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "the distribute "
+ "construct contains a call\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (stmt),
+ "This statement cannot be analyzed for "
+ "tiled gridification\n");
+ }
+ return false;
+ }
+ else if (gomp_parallel *par = dyn_cast <gomp_parallel *> (stmt))
+ {
+ if (in_parallel)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "a parallel "
+ "construct contains another parallel "
+ "construct\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (stmt),
+ "This parallel construct is nested in "
+ "another one\n");
+ }
+ return false;
+ }
+ if (!grid_parallel_clauses_gridifiable (par, grid->target_loc)
+ || !grid_dist_follows_tiling_pattern (gimple_omp_body (par),
+ grid, true))
+ return false;
+ }
+ else if (gomp_for *gfor = dyn_cast <gomp_for *> (stmt))
+ {
+ if (!in_parallel)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "a loop "
+ "construct is not nested within a parallel "
+ "construct\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (stmt),
+ "This loop construct is not nested in "
+ "a parallel construct\n");
+ }
+ return false;
+ }
+ if (!grid_gfor_follows_tiling_pattern (gfor, grid))
+ return false;
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
+ GRID_MISSED_MSG_PREFIX "the distribute "
+ "construct contains a complex statement\n");
+ dump_printf_loc (MSG_NOTE, gimple_location (stmt),
+ "This statement cannot be analyzed for "
+ "tiled gridification\n");
+ }
+ return false;
+ }
+ }
+ return true;
+}
+
+/* If TARGET follows a pattern that can be turned into a gridified GPGPU
+   kernel, return true, otherwise return false.  In the case of success, also
+   fill in GRID with the discovered properties of the kernel, such as the
+   requested group sizes.  */
+
+static bool
+grid_target_follows_gridifiable_pattern (gomp_target *target, grid_prop *grid)
+{
+ if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION)
+ return false;
+
+ location_t tloc = gimple_location (target);
+ grid->target_loc = tloc;
+ gimple *stmt
+ = grid_find_single_omp_among_assignments (gimple_omp_body (target),
+ grid, "target");
+ if (!stmt)
+ return false;
+ gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
+ tree group_size = NULL;
+ if (!teams)
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+ GRID_MISSED_MSG_PREFIX "it does not have a sole teams "
+ "construct in it.\n");
+ return false;
+ }
+
+ tree clauses = gimple_omp_teams_clauses (teams);
+ while (clauses)
+ {
+ switch (OMP_CLAUSE_CODE (clauses))
+ {
+ case OMP_CLAUSE_NUM_TEAMS:
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+ GRID_MISSED_MSG_PREFIX "the teams construct "
+ "contains a num_teams clause\n ");
+ return false;
case OMP_CLAUSE_REDUCTION:
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because a reduction "
+ GRID_MISSED_MSG_PREFIX "a reduction "
"clause is present\n ");
return false;
case OMP_CLAUSE_LASTPRIVATE:
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a "
- "gridified GPGPU kernel because a lastprivate "
+ GRID_MISSED_MSG_PREFIX "a lastprivate "
"clause is present\n ");
return false;
+ case OMP_CLAUSE_THREAD_LIMIT:
+ if (!integer_zerop (OMP_CLAUSE_OPERAND (clauses, 0)))
+ group_size = OMP_CLAUSE_OPERAND (clauses, 0);
+ break;
+
default:
break;
}
clauses = OMP_CLAUSE_CHAIN (clauses);
}
- struct walk_stmt_info wi;
- memset (&wi, 0, sizeof (wi));
- if (walk_gimple_seq (gimple_omp_body (gfor),
- grid_find_ungridifiable_statement,
- NULL, &wi))
+ stmt = grid_find_single_omp_among_assignments (gimple_omp_body (teams), grid,
+ "teams");
+ if (!stmt)
+ return false;
+ gomp_for *dist = dyn_cast <gomp_for *> (stmt);
+ if (!dist)
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+ GRID_MISSED_MSG_PREFIX "the teams construct does not "
+ "have a single distribute construct in it.\n");
+ return false;
+ }
+
+ gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE);
+
+ grid->collapse = gimple_omp_for_collapse (dist);
+ if (grid->collapse > 3)
{
- gimple *bad = (gimple *) wi.info;
if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+ GRID_MISSED_MSG_PREFIX "the distribute construct "
+ "contains collapse clause with parameter greater "
+ "than 3\n");
+ return false;
+ }
+
+ struct omp_for_data fd;
+ struct omp_for_data_loop *dist_loops
+ = (struct omp_for_data_loop *)alloca (grid->collapse
+ * sizeof (struct omp_for_data_loop));
+ extract_omp_for_data (dist, &fd, dist_loops);
+ if (fd.chunk_size)
+ {
+ if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0))
{
- if (is_gimple_call (bad))
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified "
- " GPGPU kernel because the inner loop contains "
- "call to a noreturn function\n");
- if (gimple_code (bad) == GIMPLE_OMP_FOR)
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified "
- " GPGPU kernel because the inner loop contains "
- "a simd construct\n");
- else
+ if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
- "Will not turn target construct into a gridified "
- "GPGPU kernel because the inner loop contains "
- "statement %s which cannot be transformed\n",
- gimple_code_name[(int) gimple_code (bad)]);
+ GRID_MISSED_MSG_PREFIX "the teams "
+ "thread limit is different from distribute "
+ "schedule chunk\n");
+ return false;
}
+ group_size = fd.chunk_size;
+ }
+ if (group_size && grid->collapse > 1)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+ GRID_MISSED_MSG_PREFIX "group size cannot be "
+ "set using thread_limit or schedule clauses "
+ "when also using a collapse clause greater than 1\n");
return false;
}
- *group_size_p = group_size;
- return true;
+ if (gimple_omp_for_combined_p (dist))
+ {
+ grid->tiling = false;
+ grid->group_sizes[0] = group_size;
+ for (unsigned i = 1; i < grid->collapse; i++)
+ grid->group_sizes[i] = NULL;
+ return grid_dist_follows_simple_pattern (dist, grid);
+ }
+ else
+ {
+ grid->tiling = true;
+ if (group_size)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
+ GRID_MISSED_MSG_PREFIX "group size cannot be set "
+ "using thread_limit or schedule clauses when "
+ "distribute and loop constructs do not form "
+ "one combined construct\n");
+ return false;
+ }
+ for (unsigned i = 0; i < grid->collapse; i++)
+ {
+ if (fd.loops[i].cond_code == GT_EXPR)
+ grid->group_sizes[i] = fold_build1 (NEGATE_EXPR,
+ TREE_TYPE (fd.loops[i].step),
+ fd.loops[i].step);
+ else
+ grid->group_sizes[i] = fd.loops[i].step;
+ }
+ return grid_dist_follows_tiling_pattern (gimple_omp_body (dist), grid,
+ false);
+ }
}
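
A hedged sketch (the clause value of 64 is invented) of where the group size comes from in the combined shape handled above; in the tiling shape such clauses are rejected and group_sizes[i] is instead derived from the distribute step:

void grid_sketch_thread_limit (int n, float *a)
{
  /* Here group_sizes[0] would be 64, taken from thread_limit; a matching
     dist_schedule chunk of 64 would also be accepted.  */
#pragma omp target teams thread_limit(64) map(tofrom:a[0:n])
#pragma omp distribute parallel for
  for (int i = 0; i < n; i++)
    a[i] *= 2.0f;
}
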
/* Operand walker, used to remap pre-body declarations according to a hash map
@@ -17623,15 +18058,62 @@ grid_remap_prebody_decls (tree *tp, int *walk_subtrees, void *data)
return NULL_TREE;
}
+/* Identifiers of segments into which a particular variable should be placed
+ when gridifying. */
+
+enum grid_var_segment {GRID_SEGMENT_PRIVATE, GRID_SEGMENT_GROUP,
+ GRID_SEGMENT_GLOBAL};
+
+/* Mark VAR so that it is eventually placed into SEGMENT.  Addressable
+   variables get the corresponding HSA segment attribute and are made static;
+   non-addressable variables can safely remain private.  */
+
+static void
+grid_mark_variable_segment (tree var, enum grid_var_segment segment)
+{
+  /* Changing the segment of non-addressable variables would require that we
+     re-gimplify all their uses.  Fortunately, we do not have to do this because if they are
+ not addressable, it means they are not used in atomic or parallel
+ statements and so relaxed GPU consistency rules mean we can just keep them
+ private. */
+ if (!TREE_ADDRESSABLE (var))
+ return;
+
+ switch (segment)
+ {
+ case GRID_SEGMENT_GROUP:
+ DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_group_segment"),
+ NULL, DECL_ATTRIBUTES (var));
+ break;
+ case GRID_SEGMENT_GLOBAL:
+ DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_global_segment"),
+ NULL, DECL_ATTRIBUTES (var));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (!TREE_STATIC (var))
+ {
+ TREE_STATIC (var) = 1;
+ varpool_node::finalize_decl (var);
+ }
+
+}
+
/* Copy leading register-type assignments to local variables in SRC to just
before DST, Creating temporaries, adjusting mapping of operands in WI and
remapping operands as necessary. Add any new temporaries to TGT_BIND.
- Return the first statement that does not conform to
- grid_reg_assignment_to_local_var_p or NULL. */
+ Return the first statement that does not conform to grid_safe_assignment_p
+ or NULL. If VAR_SEGMENT is not GRID_SEGMENT_PRIVATE, also mark all
+ variables in traversed bind statements so that they are put into the
+ appropriate segment. */
static gimple *
grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst,
- gbind *tgt_bind, struct walk_stmt_info *wi)
+ gbind *tgt_bind,
+ enum grid_var_segment var_segment,
+ struct walk_stmt_info *wi)
{
hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info;
gimple_stmt_iterator gsi;
@@ -17641,13 +18123,17 @@ grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst,
if (gbind *bind = dyn_cast <gbind *> (stmt))
{
gimple *r = grid_copy_leading_local_assignments
- (gimple_bind_body (bind), dst, tgt_bind, wi);
+ (gimple_bind_body (bind), dst, tgt_bind, var_segment, wi);
+
+ if (var_segment != GRID_SEGMENT_PRIVATE)
+ for (tree var = gimple_bind_vars (bind); var; var = DECL_CHAIN (var))
+ grid_mark_variable_segment (var, var_segment);
if (r)
return r;
else
continue;
}
- if (!grid_reg_assignment_to_local_var_p (stmt))
+ if (!grid_safe_assignment_p (stmt, NULL))
return stmt;
tree lhs = gimple_assign_lhs (as_a <gassign *> (stmt));
tree repl = copy_var_decl (lhs, create_tmp_var_name (NULL),
@@ -17663,43 +18149,108 @@ grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst,
return NULL;
}
+/* Statement walker function marking all loops within a tiled distribute body
+   as grid loops whose iterations represent the work-items of a thread
+   group.  */
+
+static tree
+grid_mark_tiling_loops (gimple_stmt_iterator *gsi,
+ bool *handled_ops_p,
+ struct walk_stmt_info *)
+{
+ *handled_ops_p = false;
+ if (gomp_for *loop = dyn_cast <gomp_for *> (gsi_stmt (*gsi)))
+ {
+ *handled_ops_p = true;
+ gimple_omp_for_set_kind (loop, GF_OMP_FOR_KIND_GRID_LOOP);
+ }
+ return NULL_TREE;
+}
+
+/* Statement walker function marking all parallels as grid_phony and loops as
+ grid ones representing threads of a particular thread group. */
+
+static tree
+grid_mark_tiling_parallels_and_loops (gimple_stmt_iterator *gsi,
+ bool *handled_ops_p,
+ struct walk_stmt_info *wi)
+{
+ *handled_ops_p = false;
+ wi->removed_stmt = false;
+ gimple *stmt = gsi_stmt (*gsi);
+ if (gbind *bind = dyn_cast <gbind *> (stmt))
+ {
+ for (tree var = gimple_bind_vars (bind); var; var = DECL_CHAIN (var))
+ grid_mark_variable_segment (var, GRID_SEGMENT_GROUP);
+ }
+ else if (gomp_parallel *parallel = dyn_cast <gomp_parallel *> (stmt))
+ {
+ *handled_ops_p = true;
+ gimple_omp_parallel_set_grid_phony (parallel, true);
+ walk_gimple_seq_mod (gimple_omp_body_ptr (parallel),
+ grid_mark_tiling_loops, NULL, wi);
+ }
+ else if (is_a <gcall *> (stmt))
+ wi->removed_stmt = grid_handle_call_in_distribute (gsi);
+ return NULL_TREE;
+}
+
/* Given freshly copied top level kernel SEQ, identify the individual OMP
- components, mark them as part of kernel and return the inner loop, and copy
- assignment leading to them just before DST, remapping them using WI and
- adding new temporaries to TGT_BIND. */
+   components, mark them as part of the kernel, copy assignments leading to
+   them just before DST, remapping them using WI and adding new temporaries to
+   TGT_BIND, and return the loop that will be used for kernel dispatch.  */
static gomp_for *
-grid_process_kernel_body_copy (gimple_seq seq, gimple_stmt_iterator *dst,
+grid_process_kernel_body_copy (grid_prop *grid, gimple_seq seq,
+ gimple_stmt_iterator *dst,
gbind *tgt_bind, struct walk_stmt_info *wi)
{
- gimple *stmt = grid_copy_leading_local_assignments (seq, dst, tgt_bind, wi);
+ gimple *stmt = grid_copy_leading_local_assignments (seq, dst, tgt_bind,
+ GRID_SEGMENT_GLOBAL, wi);
gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
gcc_assert (teams);
gimple_omp_teams_set_grid_phony (teams, true);
stmt = grid_copy_leading_local_assignments (gimple_omp_body (teams), dst,
- tgt_bind, wi);
+ tgt_bind, GRID_SEGMENT_GLOBAL, wi);
gcc_checking_assert (stmt);
gomp_for *dist = dyn_cast <gomp_for *> (stmt);
gcc_assert (dist);
gimple_seq prebody = gimple_omp_for_pre_body (dist);
if (prebody)
- grid_copy_leading_local_assignments (prebody, dst, tgt_bind, wi);
- gimple_omp_for_set_grid_phony (dist, true);
- stmt = grid_copy_leading_local_assignments (gimple_omp_body (dist), dst,
- tgt_bind, wi);
- gcc_checking_assert (stmt);
+ grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
+ GRID_SEGMENT_GROUP, wi);
- gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
- gimple_omp_parallel_set_grid_phony (parallel, true);
- stmt = grid_copy_leading_local_assignments (gimple_omp_body (parallel), dst,
- tgt_bind, wi);
- gomp_for *inner_loop = as_a <gomp_for *> (stmt);
- gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_GRID_LOOP);
- prebody = gimple_omp_for_pre_body (inner_loop);
- if (prebody)
- grid_copy_leading_local_assignments (prebody, dst, tgt_bind, wi);
+ if (grid->tiling)
+ {
+ gimple_omp_for_set_kind (dist, GF_OMP_FOR_KIND_GRID_LOOP);
+ gimple_omp_for_set_grid_group_iter (dist, true);
- return inner_loop;
+ struct walk_stmt_info wi;
+ memset (&wi, 0, sizeof (wi));
+ walk_gimple_seq_mod (gimple_omp_body_ptr (dist),
+ grid_mark_tiling_parallels_and_loops, NULL, &wi);
+ return dist;
+ }
+ else
+ {
+ gimple_omp_for_set_grid_phony (dist, true);
+ stmt = grid_copy_leading_local_assignments (gimple_omp_body (dist), dst,
+ tgt_bind,
+ GRID_SEGMENT_PRIVATE, wi);
+ gcc_checking_assert (stmt);
+ gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
+ gimple_omp_parallel_set_grid_phony (parallel, true);
+ stmt = grid_copy_leading_local_assignments (gimple_omp_body (parallel),
+ dst, tgt_bind,
+ GRID_SEGMENT_PRIVATE, wi);
+ gomp_for *inner_loop = as_a <gomp_for *> (stmt);
+ gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_GRID_LOOP);
+ prebody = gimple_omp_for_pre_body (inner_loop);
+ if (prebody)
+ grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
+ GRID_SEGMENT_PRIVATE, wi);
+
+ return inner_loop;
+ }
}
/* If TARGET points to a GOMP_TARGET which follows a gridifiable pattern,
@@ -17712,8 +18263,10 @@ grid_attempt_target_gridification (gomp_target *target,
gimple_stmt_iterator *gsi,
gbind *tgt_bind)
{
- tree group_size;
- if (!target || !grid_target_follows_gridifiable_pattern (target, &group_size))
+ grid_prop grid;
+ memset (&grid, 0, sizeof (grid));
+ if (!target || !grid_target_follows_gridifiable_pattern (target, &grid))
return;
location_t loc = gimple_location (target);
@@ -17732,8 +18285,8 @@ grid_attempt_target_gridification (gomp_target *target,
wi.info = declmap;
/* Copy assignments in between OMP statements before target, mark OMP
- statements within copy appropriatly. */
- gomp_for *inner_loop = grid_process_kernel_body_copy (kernel_seq, gsi,
+ statements within copy appropriately. */
+ gomp_for *inner_loop = grid_process_kernel_body_copy (&grid, kernel_seq, gsi,
tgt_bind, &wi);
gbind *old_bind = as_a <gbind *> (gimple_seq_first (gimple_omp_body (target)));
@@ -17748,10 +18301,10 @@ grid_attempt_target_gridification (gomp_target *target,
(gimple_bind_body_ptr (as_a <gbind *> (gimple_omp_body (target))),
gpukernel);
- walk_tree (&group_size, grid_remap_prebody_decls, &wi, NULL);
+ for (size_t i = 0; i < grid.collapse; i++)
+ walk_tree (&grid.group_sizes[i], grid_remap_prebody_decls, &wi, NULL);
push_gimplify_context ();
- size_t collapse = gimple_omp_for_collapse (inner_loop);
- for (size_t i = 0; i < collapse; i++)
+ for (size_t i = 0; i < grid.collapse; i++)
{
tree itype, type = TREE_TYPE (gimple_omp_for_index (inner_loop, i));
if (POINTER_TYPE_P (type))
@@ -17765,12 +18318,12 @@ grid_attempt_target_gridification (gomp_target *target,
tree n2 = unshare_expr (gimple_omp_for_final (inner_loop, i));
walk_tree (&n2, grid_remap_prebody_decls, &wi, NULL);
adjust_for_condition (loc, &cond_code, &n2);
- tree step;
- step = get_omp_for_step_from_incr (loc,
- gimple_omp_for_incr (inner_loop, i));
- gimple_seq tmpseq = NULL;
n1 = fold_convert (itype, n1);
n2 = fold_convert (itype, n2);
+
+ tree step
+ = get_omp_for_step_from_incr (loc, gimple_omp_for_incr (inner_loop, i));
+
tree t = build_int_cst (itype, (cond_code == LT_EXPR ? -1 : 1));
t = fold_build2 (PLUS_EXPR, itype, step, t);
t = fold_build2 (PLUS_EXPR, itype, t, n2);
@@ -17781,15 +18334,23 @@ grid_attempt_target_gridification (gomp_target *target,
fold_build1 (NEGATE_EXPR, itype, step));
else
t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+ if (grid.tiling)
+ {
+ if (cond_code == GT_EXPR)
+ step = fold_build1 (NEGATE_EXPR, itype, step);
+ t = fold_build2 (MULT_EXPR, itype, t, step);
+ }
+
tree gs = fold_convert (uint32_type_node, t);
+ gimple_seq tmpseq = NULL;
gimplify_expr (&gs, &tmpseq, NULL, is_gimple_val, fb_rvalue);
if (!gimple_seq_empty_p (tmpseq))
gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);
tree ws;
- if (i == 0 && group_size)
+ if (grid.group_sizes[i])
{
- ws = fold_convert (uint32_type_node, group_size);
+ ws = fold_convert (uint32_type_node, grid.group_sizes[i]);
tmpseq = NULL;
gimplify_expr (&ws, &tmpseq, NULL, is_gimple_val, fb_rvalue);
if (!gimple_seq_empty_p (tmpseq))
new file mode 100644
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target offload_hsa } */
+/* { dg-options "-fopenmp -fdump-tree-omplower-details" } */
+
+#define BLOCK_SIZE 16
+
+
+void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
+ const float*B, const int LDB, const float beta, float*C, const int LDC){
+
+#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
+#pragma omp distribute collapse(2)
+ for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE)
+ for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE)
+ {
+// Each team has a local copy of these mini matrices
+ float As[BLOCK_SIZE][BLOCK_SIZE];
+ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+#pragma omp parallel
+ {
+ int C_row, C_col;
+ float Cval = 0.0;
+
+ for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE )
+ {
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (kblock + col < K))
+ As[row][col] = A[(C_row*LDA)+ kblock + col];
+ else
+ As[row][col] = 0;
+ if ((kblock + row < K) && C_col < N)
+ Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
+ else
+ Bs[row][col] = 0;
+ }
+
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ for (int e = 0; e < BLOCK_SIZE; ++e)
+ Cval += As[row][e] * Bs[e][col];
+ }
+ } /* End for kblock .. */
+
+
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (C_col < N))
+ C[(C_row*LDC)+C_col] = alpha*Cval + beta*C[(C_row*LDC)+C_col];
+
+ }
+ } /* end parallel */
+ } /* end target teams distribute */
+}
+
+/* { dg-final { scan-tree-dump "Target construct will be turned into a gridified GPGPU kernel" "omplower" } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target offload_hsa } */
+/* { dg-options "-fopenmp -fdump-tree-omplower-details" } */
+
+#define BLOCK_SIZE 16
+
+void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
+ const float*B, const int LDB, const float beta, float*C, const int LDC)
+{
+#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
+#pragma omp distribute collapse(2)
+ for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE)
+ for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE)
+ {
+ float As[BLOCK_SIZE][BLOCK_SIZE];
+ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+ float Cs[BLOCK_SIZE][BLOCK_SIZE];
+ int C_row, C_col;
+
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ Cs[row][col] = 0.0;
+ }
+
+
+ for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE )
+ {
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (kblock + col < K))
+ As[row][col] = A[(C_row*LDA)+ kblock + col];
+ else
+ As[row][col] = 0;
+ if ((kblock + row < K) && C_col < N)
+ Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
+ else
+ Bs[row][col] = 0;
+ }
+
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ for (int e = 0; e < BLOCK_SIZE; ++e)
+ Cs[row][col] += As[row][e] * Bs[e][col];
+ }
+ } /* End for kblock .. */
+
+
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (C_col < N))
+ C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
+ }
+ } /* End distribute */
+}
+
+/* { dg-final { scan-tree-dump "Target construct will be turned into a gridified GPGPU kernel" "omplower" } } */
new file mode 100644
@@ -0,0 +1,212 @@
+/*
+
+ matmul.c : Matrix Multiplication with tiling for openmp4 example
+
+*/
+
+#include <stdlib.h>
+#include <math.h>
+
+#define BLOCK_SIZE 16
+/*
+ #define BLOCK_SIZE 32
+*/
+#define NSECPERSEC 1000000000L
+
+typedef struct {
+ int width;
+ int height;
+ int stride;
+ int hpad;
+ float* elements;
+} Matrix;
+
+/* Correctly extract the number of nanoseconds from the two time structures */
+long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
+ long int nanosecs;
+ if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
+ ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
+ ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
+ else nanosecs =
+ (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
+ ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
+ return nanosecs;
+}
+
+void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+
+int verify(float* v_res, float* v_ref, int len) {
+ int passed = 1;
+ int i;
+ for (i = 0; i < len; ++i) {
+ if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
+ __builtin_abort ();
+ }
+ }
+ return passed;
+}
+
+
+int main(int argc, char* argv[]){
+
+ Matrix A,B,Bt,C,Cref;
+ int a1,a2,a3,i,j;
+ struct timespec start_time1, end_time1;
+ struct timespec start_time2, end_time2;
+ long int nanosecs,total_ops;
+ float gflopsTiled,gflopsCPU;
+
+ a1 = 35;
+ a2 = 28;
+ a3 = 47;
+
+ A.height = a1;
+ A.width = a2;
+ A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
+
+ B.height = a2;
+ B.width = a3;
+ B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
+
+ /* Bt is same as B but stored in column-major order */
+ Bt.height = B.height;
+ Bt.width = B.width;
+ Bt.stride = B.stride;
+ Bt.hpad = B.hpad;
+ Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
+
+ C.height = a1;
+ C.width = a3;
+ C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
+
+ Cref.height = a1;
+ Cref.width = a3;
+ Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
+
+ for(i = 0; i < A.hpad ; i++)
+ for(j = 0; j < A.stride; j++) {
+ if (( j<A.width ) && (i<A.height)) {
+ A.elements[i*A.stride + j] = (i % 3);
+ } else {
+ A.elements[i*A.stride + j] = 0.0;
+ }
+ }
+
+ /* Initialize B and Bt */
+ for(i = 0; i < B.hpad ; i++)
+ for(j = 0; j < B.stride; j++) {
+ if (( j<B.width ) && (i<B.height)) {
+ B.elements[i*B.stride+j] = (j % 2);
+ Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
+ } else {
+ B.elements[i*B.stride+j] = 0.0;
+ Bt.elements[j*Bt.stride+i] = 0.0;
+ }
+ }
+
+ /* zero C, and Cref */
+ for(i = 0; i < C.hpad; i++)
+ for(j = 0; j < C.stride; j++) {
+ C.elements[i*C.stride+j] = 0.0;
+ Cref.elements[i*Cref.stride+j] = 0.0;
+ }
+
+ simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
+ tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
+
+ verify(C.elements, Cref.elements, C.height * C.stride);
+ return 0;
+}
+
+void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+const float* B,const int LDB, const float beta,float* C, const int LDC) {
+ /* A,B, and C are in row-major order */
+ int c_row,c_col,inner;
+ float sum;
+ for (c_col = 0 ; c_col<N; c_col++ ) {
+ for (c_row = 0 ; c_row<M; c_row++ ) {
+ sum = 0.0 ;
+ for (inner = 0 ; inner<K; inner++ ) {
+ sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
+ }
+ C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
+ }
+ }
+}
+
+/***************************
+
+ tiled_sgemm_tt: Tiled matrix multiplication:
+
+***************************/
+
+void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
+ const float*B, const int LDB, const float beta, float*C, const int LDC){
+
+#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
+#pragma omp distribute collapse(2)
+ for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE)
+ for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE)
+ {
+// Each team has a local copy of these mini matrices
+ float As[BLOCK_SIZE][BLOCK_SIZE];
+ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+#pragma omp parallel
+ {
+ int C_row, C_col;
+ float Cval = 0.0;
+
+ for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE )
+ {
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (kblock + col < K))
+ As[row][col] = A[(C_row*LDA)+ kblock + col];
+ else
+ As[row][col] = 0;
+ if ((kblock + row < K) && C_col < N)
+ Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
+ else
+ Bs[row][col] = 0;
+ }
+
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ for (int e = 0; e < BLOCK_SIZE; ++e)
+ Cval += As[row][e] * Bs[e][col];
+ }
+ } /* End for kblock .. */
+
+
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (C_col < N))
+ C[(C_row*LDC)+C_col] = alpha*Cval + beta*C[(C_row*LDC)+C_col];
+
+ }
+ } /* end parallel */
+ } /* end target teams distribute */
+}
new file mode 100644
@@ -0,0 +1,258 @@
+/*
+
+ matmul.c : Matrix Multiplication with tiling for openmp4 example
+
+*/
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define BLOCK_SIZE 16
+/*
+ #define BLOCK_SIZE 32
+*/
+#define NSECPERSEC 1000000000L
+
+typedef struct {
+ int width;
+ int height;
+ int stride;
+ int hpad;
+ float* elements;
+} Matrix;
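+/* Note: "stride" and "hpad" hold the width and height rounded up to the next
+   multiple of BLOCK_SIZE, so each matrix is stored with enough zero padding
+   to cover whole tiles. */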
+
+/* Correctly extract the number of nanoseconds from the two time structures */
+long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
+ long int nanosecs;
+ if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
+ ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
+ ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
+ else nanosecs =
+ (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
+ ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
+ return nanosecs;
+}
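+/* Example: start = {1, 900000000} and end = {2, 100000000} differ by 0.2 s;
+   the borrow branch above gives (2-1-1)*NSECPERSEC + NSECPERSEC
+   + 100000000 - 900000000 = 200000000 ns. */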
+
+void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+
+int verify(float* v_res, float* v_ref, int len) {
+ int passed = 1;
+ int i;
+ for (i = 0; i < len; ++i) {
+ if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
+ __builtin_abort ();
+ }
+ }
+ return passed;
+}
+
+
+int main(int argc, char* argv[]){
+
+ Matrix A,B,Bt,C,Cref;
+ int a1,a2,a3,i,j;
+ struct timespec start_time1, end_time1;
+ struct timespec start_time2, end_time2;
+ long int nanosecs,total_ops;
+ float gflopsTiled,gflopsCPU;
+
+ a1 = 35;
+ a2 = 28;
+ a3 = 47;
+
+ A.height = a1;
+ A.width = a2;
+ A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
+
+ B.height = a2;
+ B.width = a3;
+ B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
+
+ /* Bt is the same as B but stored in column-major order */
+ Bt.height = B.height;
+ Bt.width = B.width;
+ Bt.stride = B.stride;
+ Bt.hpad = B.hpad;
+ Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
+
+ C.height = a1;
+ C.width = a3;
+ C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
+
+ Cref.height = a1;
+ Cref.width = a3;
+ Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
+
+ for(i = 0; i < A.hpad ; i++)
+ for(j = 0; j < A.stride; j++) {
+ if (( j<A.width ) && (i<A.height)) {
+ A.elements[i*A.stride + j] = (i % 3);
+ } else {
+ A.elements[i*A.stride + j] = 0.0;
+ }
+ }
+
+ /* Initialize B and Bt */
+ for(i = 0; i < B.hpad ; i++)
+ for(j = 0; j < B.stride; j++) {
+ if (( j<B.width ) && (i<B.height)) {
+ B.elements[i*B.stride+j] = (j % 2);
+ Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
+ } else {
+ B.elements[i*B.stride+j] = 0.0;
+ Bt.elements[j*Bt.stride+i] = 0.0;
+ }
+ }
+
+ /* zero C, and Cref */
+ for(i = 0; i < C.hpad; i++)
+ for(j = 0; j < C.stride; j++) {
+ C.elements[i*C.stride+j] = 0.0;
+ Cref.elements[i*Cref.stride+j] = 0.0;
+ }
+
+ simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
+ tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
+
+ verify(C.elements, Cref.elements, C.height * C.stride);
+ return 0;
+}
+
+void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+const float* B,const int LDB, const float beta,float* C, const int LDC) {
+ /* A,B, and C are in row-major order */
+ int c_row,c_col,inner;
+ float sum;
+ for (c_col = 0 ; c_col<N; c_col++ ) {
+ for (c_row = 0 ; c_row<M; c_row++ ) {
+ sum = 0.0 ;
+ for (inner = 0 ; inner<K; inner++ ) {
+ sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
+ }
+ C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
+ }
+ }
+}
+
+/***************************
+
+ tiled_sgemm_tt: Tiled matrix multiplication:
+
+***************************/
+
+void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
+ const float*B, const int LDB, const float beta, float*C, const int LDC){
+
+#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
+#pragma omp distribute collapse(2)
+ for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) {
+ for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) {
+
+// We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (M*N)/(BLOCK_SIZE*BLOCK_SIZE)
+// The grid global dimensions are M,N,1
+// The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1
+
+// -------------------------------------------------------------------
+// The rest of this code forms the HSAIL kernel with the
+// pairs of "parallel for collapse(2)" loops replaced with a barrier.
+// The kernel initializes these values
+// C_row_start = get_group_id(0) * BLOCK_SIZE
+// C_col_start = get_group_id(1) * BLOCK_SIZE
+// row=get_local_id(0)
+// col=get_local_id(1)
+// -------------------------------------------------------------------
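+// Informal sketch only (not generated code): with that substitution, each
+// barrier-delimited phase below reduces to roughly
+//   row = get_local_id(0); col = get_local_id(1);
+//   ...phase body for this (row,col)...
+//   barrier();
+// executed by every work-item of the group.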
+
+// Each team has a local copy of these mini matrices
+ float As[BLOCK_SIZE][BLOCK_SIZE];
+ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+ float Cs[BLOCK_SIZE][BLOCK_SIZE];
+ int C_row, C_col;
+
+ /* Zero Cs for this BLOCK */
+// - - - - - - - - - - - - - - - - - - - -
+// REPLACE NEXT THREE LINES WITH A BARRIER
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++) {
+ for (int col=0 ; col < BLOCK_SIZE ; col++) {
+// END BARRIER
+// - - - - - - - - - - - - - - - - - - - -
+ Cs[row][col] = 0.0;
+ }
+ }
+
+ // This kblock loop is run on the master thread of each team
+ for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) {
+
+ // Copy global memory values to local memory
+// - - - - - - - - - - - - - - - - - - - -
+// REPLACE NEXT THREE LINES WITH A BARRIER
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++) {
+ for (int col=0 ; col < BLOCK_SIZE ; col++) {
+// END BARRIER
+// - - - - - - - - - - - - - - - - - - - -
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (kblock + col < K))
+ As[row][col] = A[(C_row*LDA)+ kblock + col];
+ else
+ As[row][col] = 0;
+ if ((kblock + row < K) && C_col < N)
+ Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
+ else
+ Bs[row][col] = 0;
+ }
+ }
+
+ // Calculate Cs <- Sum(As X Bs) across all kblocks
+// - - - - - - - - - - - - - - - - - - - -
+// REPLACE NEXT THREE LINES WITH A BARRIER
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++) {
+ for (int col=0 ; col < BLOCK_SIZE ; col++) {
+// END BARRIER
+// - - - - - - - - - - - - - - - - - - - -
+ for (int e = 0; e < BLOCK_SIZE; ++e)
+ Cs[row][col] += As[row][e] * Bs[e][col];
+ }
+ }
+
+ } /* End for kblock .. */
+
+
+ // Scale and update the actual C from Cs
+// - - - - - - - - - - - - - - - - - - - -
+// REPLACE NEXT THREE LINES WITH A BARRIER
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++) {
+ for (int col=0 ; col < BLOCK_SIZE ; col++) {
+// END BARRIER
+// - - - - - - - - - - - - - - - - - - - -
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (C_col < N)) {
+ C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
+ }
+ }
+ }
+
+// -------------------------------------------------------------------
+// This is the end of the kernel
+
+ }
+ }
+
+}