@@ -633,6 +633,7 @@ walk_gimple_stmt (gimple_stmt_iterator *gsi, walk_stmt_fn callback_stmt,
case GIMPLE_OMP_SINGLE:
case GIMPLE_OMP_TARGET:
case GIMPLE_OMP_TEAMS:
+ case GIMPLE_OMP_GPUKERNEL:
ret = walk_gimple_seq_mod (gimple_omp_body_ptr (stmt), callback_stmt,
callback_op, wi);
if (ret)
@@ -184,11 +184,6 @@ struct omp_context
barriers should jump to during omplower pass. */
tree cancel_label;
- /* When we are about to produce a special gridified copy of a target
- construct for a GPU, the copy is stored here between scanning and
- lowering. */
- gimple_seq kernel_seq;
-
/* What to do with variables with implicitly determined sharing
attributes. */
enum omp_clause_default_kind default_kind;
@@ -2654,292 +2649,6 @@ scan_omp_single (gomp_single *stmt, omp_context *outer_ctx)
layout_type (ctx->record_type);
}
-/* If SEQ is a sequence containing only one statement or a bind statement which
- itself contains only one statement, return that statement. Otherwise return
- NULL. TARGET_LOC must be location of the target statement and NAME the name
- of the currently processed statement, both are used for dumping. */
-
-static gimple *
-single_stmt_in_seq_skip_bind (gimple_seq seq, location_t target_loc,
- const char *name)
-{
- gimple *stmt;
- bool loop;
- do
- {
- if (!seq)
- {
- gcc_assert (name);
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, target_loc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because %s construct has empty "
- "body\n",
- name);
- return NULL;
- }
-
- if (!gimple_seq_singleton_p (seq))
- {
- gcc_assert (name);
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, target_loc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because %s construct contains "
- "multiple statements\n", name);
- return NULL;
- }
-
- stmt = gimple_seq_first_stmt (seq);
- if (is_a <gbind *> (stmt))
- {
- loop = true;
- gbind *bind = as_a <gbind *> (stmt);
- seq = gimple_bind_body (bind);
- }
- else
- loop = false;
- }
- while (loop);
- return stmt;
-}
-
-/* If TARGET follows a pattern that can be turned into a GPGPU kernel, return
- true, otherwise return false. In the case of success, also fill in
- GROUP_SIZE_P with the requested group size or NULL if there is none. */
-
-static bool
-target_follows_kernelizable_pattern (gomp_target *target, tree *group_size_p)
-{
- if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION)
- return false;
-
- location_t tloc = gimple_location (target);
- gimple *stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (target), tloc,
- "target");
- if (!stmt)
- return false;
- gomp_teams *teams;
- tree group_size = NULL;
- if ((teams = dyn_cast <gomp_teams *> (stmt)))
- {
- tree clauses = gimple_omp_teams_clauses (teams);
- while (clauses)
- {
- switch (OMP_CLAUSE_CODE (clauses))
- {
- case OMP_CLAUSE_NUM_TEAMS:
- /* TODO: Maybe this is not an insurmountable obstacle but it is
- weird, let's deal with it later. */
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a "
- "simple GPGPU kernel because we cannot handle "
- "num_teams clause of teams construct\n ");
- return false;
- case OMP_CLAUSE_THREAD_LIMIT:
- group_size = OMP_CLAUSE_OPERAND (clauses, 0);
- break;
- default:
- break;
- }
- clauses = OMP_CLAUSE_CHAIN (clauses);
- }
-
- stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (teams), tloc,
- "teams");
- if (!stmt)
- return false;
- gomp_for *dist = NULL;
- if ((dist = dyn_cast <gomp_for *> (stmt)))
- {
- gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE);
- if (!gimple_omp_for_combined_p (dist))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a "
- "simple GPGPU kernel because we cannot handle "
- "a standalone distribute construct\n ");
- return false;
- }
- if (dist->collapse > 1)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a simple "
- "GPGPU kernel because the distribute construct "
- "contains collapse clause\n");
- return false;
- }
- struct omp_for_data fd;
- extract_omp_for_data (dist, &fd, NULL);
- if (fd.chunk_size)
- {
- if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a "
- "simple GPGPU kernel because the teams "
- "thread limit is different from distribute "
- "schedule chunk\n");
- return false;
- }
- group_size = fd.chunk_size;
- }
- stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (dist), tloc,
- "distribute");
- }
- }
-
- gomp_parallel *par;
- if (!stmt || !(par = dyn_cast <gomp_parallel *> (stmt)))
- return false;
-
- tree clauses = gimple_omp_parallel_clauses (par);
- tree num_threads_clause = find_omp_clause (clauses, OMP_CLAUSE_NUM_THREADS);
- if (num_threads_clause)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a "
- "simple GPGPU kernel because there is a num_threads "
- "clause of the parallel construct that "
- "is likely to require looping \n");
- return false;
- }
-
- stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (par), tloc, "parallel");
- /* FIXME: We are currently ignoring parallel sharing clauses and potentially
- also sharing clauses of teams and distribute, if there are any. We need to
- check they can be skipped. */
- gomp_for *gfor;
- if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt)))
- return false;
-
- if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a simple GPGPU "
- "kernel because the inner loop is not a simple for "
- "loop\n");
- return false;
- }
- if (gfor->collapse > 1)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a simple GPGPU "
- "kernel because the inner loop contains collapse "
- "clause\n");
- return false;
- }
- if (gimple_omp_for_pre_body (gfor))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a simple GPGPU "
- "kernel because the inner loop contains has a pre_body "
- "statement sequencee\n");
- return false;
- }
-
- clauses = gimple_omp_for_clauses (gfor);
- tree for_sched_clause = find_omp_clause (clauses, OMP_CLAUSE_SCHEDULE);
-
- if (for_sched_clause
- && OMP_CLAUSE_SCHEDULE_KIND (for_sched_clause) != OMP_CLAUSE_SCHEDULE_AUTO)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, tloc,
- "Will not turn target construct into a simple GPGPU "
- "kernel because the inner loop has non-automatic "
- "scheduling clause\n");
- return false;
- }
-
- *group_size_p = group_size;
- return true;
-}
-
-/* Given freshly copied top level kernel SEQ (which might a bind containing a
- single gomp_parallel or gomp_teams, identify the individual components, mark
- them as part of kernel and return the inner loop. */
-
-static gomp_for *
-find_mark_kernel_components (gimple_seq seq)
-{
- location_t tloc = UNKNOWN_LOCATION;
- gimple *stmt = single_stmt_in_seq_skip_bind (seq, tloc, NULL);
- gomp_teams *teams = NULL;
- gomp_for *dist = NULL;
- if ((teams = dyn_cast <gomp_teams *> (stmt)))
- {
- gimple_omp_teams_set_kernel_phony (teams, true);
- stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (teams), tloc, NULL);
- gcc_checking_assert (stmt);
- if ((dist = dyn_cast <gomp_for *> (stmt)))
- {
- gimple_omp_for_set_kernel_phony (dist, true);
- stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (dist), tloc,
- NULL);
- gcc_checking_assert (stmt);
- }
- }
- gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
- gimple_omp_parallel_set_kernel_phony (parallel, true);
- stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (parallel), tloc, NULL);
- gomp_for *inner_loop = as_a <gomp_for *> (stmt);
- gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_KERNEL_BODY);
- return inner_loop;
-}
-
-/* Analyze TARGET body during its scanning and if it contains a loop which can
- and should be turned into a GPGPU kernel, copy it aside for lowering. */
-
-static void
-attempt_target_kernelization (gomp_target *target, omp_context *ctx)
-{
- if (flag_disable_hsa_gridification)
- return;
-
- if (!hsa_gen_requested_p ())
- return;
- tree group_size;
- if (!target_follows_kernelizable_pattern (target, &group_size))
- return;
-
- if (dump_enabled_p ())
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, gimple_location (target),
- "Target construct will be turned into a simple GPGPU "
- "kernel\n");
-
- ctx->kernel_seq = copy_gimple_seq_and_replace_locals
- (gimple_omp_body (target));
- gomp_for *inner_loop = find_mark_kernel_components (ctx->kernel_seq);
-
- gbind *old_bind = as_a <gbind *> (gimple_seq_first (gimple_omp_body (target)));
- gbind *new_bind = as_a <gbind *> (gimple_seq_first (ctx->kernel_seq));
- tree new_block = gimple_bind_block (new_bind);
- tree enc_block = BLOCK_SUPERCONTEXT (gimple_bind_block (old_bind));
- BLOCK_CHAIN (new_block) = BLOCK_SUBBLOCKS (enc_block);
- BLOCK_SUBBLOCKS (enc_block) = new_block;
- BLOCK_SUPERCONTEXT (new_block) = enc_block;
-
- target->kernel_group_size = group_size;
- size_t collapse = inner_loop->collapse;
- target->kernel_collapse = collapse;
- target->kernel_iter = ggc_cleared_vec_alloc<gimple_omp_for_iter> (collapse);
- for (size_t i = 0; i < collapse; i++)
- {
- target->kernel_iter[i] = inner_loop->iter[i];
- scan_omp_op (&target->kernel_iter[i].initial, ctx);
- scan_omp_op (&target->kernel_iter[i].final, ctx);
- }
-}
-
/* Scan a GIMPLE_OMP_TARGET. */
static void
@@ -2962,7 +2671,13 @@ scan_omp_target (gomp_target *stmt, omp_context *outer_ctx)
TYPE_NAME (ctx->record_type) = name;
TYPE_ARTIFICIAL (ctx->record_type) = 1;
- attempt_target_kernelization (stmt, ctx);
+ /* FIXME: Needs proper accessors. */
+ for (size_t i = 0; i < stmt->kernel_collapse; i++)
+ {
+ scan_omp_op (&stmt->kernel_iter[i].initial, ctx);
+ scan_omp_op (&stmt->kernel_iter[i].final, ctx);
+ }
+
if (offloaded)
{
if (is_gimple_omp_oacc (stmt))
@@ -2988,8 +2703,6 @@ scan_omp_target (gomp_target *stmt, omp_context *outer_ctx)
scan_sharing_clauses (clauses, ctx);
scan_omp (gimple_omp_body_ptr (stmt), ctx);
- if (ctx->kernel_seq)
- scan_omp (&ctx->kernel_seq, ctx);
if (TYPE_FIELDS (ctx->record_type) == NULL)
ctx->record_type = ctx->receiver_decl = NULL;
@@ -3025,6 +2738,11 @@ scan_omp_teams (gomp_teams *stmt, omp_context *outer_ctx)
static bool
check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
{
+ if (ctx && gimple_code (ctx->stmt) == GIMPLE_OMP_GPUKERNEL)
+ /* GPUKERNEL is an artificial construct; nesting restrictions will be
+ checked on the original copy of its contents. */
+ return true;
+
/* No nesting of non-OpenACC STMT (that is, an OpenMP one, or a GOMP builtin)
inside an OpenACC CTX. */
if (!(is_gimple_omp (stmt)
@@ -3529,6 +3247,7 @@ scan_omp_1_stmt (gimple_stmt_iterator *gsi, bool *handled_ops_p,
case GIMPLE_OMP_TASKGROUP:
case GIMPLE_OMP_ORDERED:
case GIMPLE_OMP_CRITICAL:
+ case GIMPLE_OMP_GPUKERNEL:
ctx = new_omp_context (stmt, ctx);
scan_omp (gimple_omp_body_ptr (stmt), ctx);
break;
@@ -12247,15 +11966,6 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
{
target_nesting_level++;
lower_omp (&tgt_body, ctx);
- if (ctx->kernel_seq)
- {
- lower_omp (&ctx->kernel_seq, ctx);
- gimple_seq_add_stmt (&ctx->kernel_seq,
- gimple_build_omp_return (false));
- gimple *gpukernel = gimple_build_omp_gpukernel (ctx->kernel_seq);
- ctx->kernel_seq = NULL;
- gimple_seq_add_stmt (&ctx->kernel_seq, gpukernel);
- }
target_nesting_level--;
}
else if (data_region)
@@ -12500,8 +12210,6 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
new_body = tgt_body;
if (offloaded || data_region)
{
- if (ctx->kernel_seq)
- gimple_seq_add_seq (&new_body, ctx->kernel_seq);
gimple_seq_add_stmt (&new_body, gimple_build_omp_return (false));
gimple_omp_set_body (stmt, new_body);
}
@@ -12586,6 +12294,17 @@ lower_omp_teams (gimple_stmt_iterator *gsi_p, omp_context *ctx)
TREE_USED (block) = 1;
}
+/* Expand code within an artificial GPUKERNEL OMP construct.  */
+
+static void
+lower_omp_gpukernel (gimple_stmt_iterator *gsi_p, omp_context *ctx)
+{
+ gimple *stmt = gsi_stmt (*gsi_p);
+ lower_omp (gimple_omp_body_ptr (stmt), ctx);
+ gimple_seq_add_stmt (gimple_omp_body_ptr (stmt),
+ gimple_build_omp_return (false));
+}
+
/* Callback for lower_omp_1. Return non-NULL if *tp needs to be
regimplified. If DATA is non-NULL, lower_omp_1 is outside
@@ -12734,6 +12453,11 @@ lower_omp_1 (gimple_stmt_iterator *gsi_p, omp_context *ctx)
gcc_assert (ctx);
lower_omp_teams (gsi_p, ctx);
break;
+ case GIMPLE_OMP_GPUKERNEL:
+ ctx = maybe_lookup_ctx (stmt);
+ gcc_assert (ctx);
+ lower_omp_gpukernel (gsi_p, ctx);
+ break;
case GIMPLE_CALL:
tree fndecl;
call_stmt = as_a <gcall *> (stmt);
@@ -12823,7 +12547,415 @@ lower_omp (gimple_seq *body, omp_context *ctx)
fold_stmt (&gsi);
input_location = saved_location;
}
-
+
+/* If SEQ is a sequence containing only a single statement, possibly wrapped
+ in one or more bind statements that each contain just one statement, return
+ that innermost statement.  Otherwise return NULL.  TARGET_LOC must be the
+ location of the target statement and NAME the name of the currently
+ processed construct; both are used for dumping.  */
+
+static gimple *
+single_stmt_in_seq_skip_bind (gimple_seq seq, location_t target_loc,
+ const char *name)
+{
+ gimple *stmt;
+ bool loop;
+ do
+ {
+ if (!seq)
+ {
+ gcc_assert (name);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, target_loc,
+ "Will not turn target construct into a simple "
+ "GPGPU kernel because %s construct has empty "
+ "body\n",
+ name);
+ return NULL;
+ }
+
+ if (!gimple_seq_singleton_p (seq))
+ {
+ gcc_assert (name);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, target_loc,
+ "Will not turn target construct into a simple "
+ "GPGPU kernel because %s construct contains "
+ "multiple statements\n", name);
+ return NULL;
+ }
+
+ stmt = gimple_seq_first_stmt (seq);
+ if (is_a <gbind *> (stmt))
+ {
+ loop = true;
+ gbind *bind = as_a <gbind *> (stmt);
+ seq = gimple_bind_body (bind);
+ }
+ else
+ loop = false;
+ }
+ while (loop);
+ return stmt;
+}
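+
+/* For example, for a target construct whose body has the shape
+
+     bind { bind { #pragma omp teams ... } }
+
+   the helper above returns the teams statement, while it returns NULL (and,
+   when NAME is given, emits a dump note) as soon as any of the wrapped
+   sequences is empty or contains more than one statement.  */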
+
+/* Return true if all statements in SEQ are assignments to local variables. */
+
+static bool
+seq_only_contains_local_assignments (gimple_seq seq)
+{
+ if (!seq)
+ return true;
+
+ gimple_stmt_iterator gsi;
+ for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
+ if (!stmt)
+ return false;
+ tree lhs = gimple_assign_lhs (stmt);
+ if (TREE_CODE (lhs) != VAR_DECL
+ || is_global_var (lhs))
+ return false;
+ }
+ return true;
+}
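+
+/* As an illustration, a pre-body that only computes local loop bounds, e.g.
+
+     D.2877 = n;
+     D.2878 = D.2877 + -1;
+
+   passes the check above (the temporaries are local VAR_DECLs), whereas any
+   non-assignment statement or an assignment to a global variable or to
+   memory makes the sequence, and thus the loop, ineligible for
+   gridification.  */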
+
+/* If TARGET follows a pattern that can be turned into a gridified GPGPU
+ kernel, return true, otherwise return false. In the case of success, also
+ fill in GROUP_SIZE_P with the requested group size or NULL if there is
+ none. */
+
+static bool
+target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p)
+{
+ if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION)
+ return false;
+
+ location_t tloc = gimple_location (target);
+ gimple *stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (target), tloc,
+ "target");
+ if (!stmt)
+ return false;
+ gomp_teams *teams;
+ tree group_size = NULL;
+ if ((teams = dyn_cast <gomp_teams *> (stmt)))
+ {
+ tree clauses = gimple_omp_teams_clauses (teams);
+ while (clauses)
+ {
+ switch (OMP_CLAUSE_CODE (clauses))
+ {
+ case OMP_CLAUSE_NUM_TEAMS:
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a "
+ "gridified GPGPU kernel because we cannot "
+ "handle num_teams clause of teams "
+ "construct\n ");
+ return false;
+ case OMP_CLAUSE_THREAD_LIMIT:
+ group_size = OMP_CLAUSE_OPERAND (clauses, 0);
+ break;
+ default:
+ break;
+ }
+ clauses = OMP_CLAUSE_CHAIN (clauses);
+ }
+
+ stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (teams), tloc,
+ "teams");
+ if (!stmt)
+ return false;
+ gomp_for *dist = NULL;
+ if ((dist = dyn_cast <gomp_for *> (stmt)))
+ {
+ gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE);
+ if (!gimple_omp_for_combined_p (dist))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a "
+ "gridified GPGPU kernel because we cannot "
+ "handle a standalone distribute construct\n ");
+ return false;
+ }
+ if (dist->collapse > 1)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a "
+ "gridified GPGPU kernel because the distribute "
+ "construct contains collapse clause\n");
+ return false;
+ }
+ struct omp_for_data fd;
+ extract_omp_for_data (dist, &fd, NULL);
+ if (fd.chunk_size)
+ {
+ if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a "
+ "gridified GPGPU kernel because the teams "
+ "thread limit is different from distribute "
+ "schedule chunk\n");
+ return false;
+ }
+ group_size = fd.chunk_size;
+ }
+ stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (dist), tloc,
+ "distribute");
+ }
+ }
+
+ gomp_parallel *par;
+ if (!stmt || !(par = dyn_cast <gomp_parallel *> (stmt)))
+ return false;
+
+ tree clauses = gimple_omp_parallel_clauses (par);
+ tree num_threads_clause = find_omp_clause (clauses, OMP_CLAUSE_NUM_THREADS);
+ if (num_threads_clause)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a gridified"
+ "GPGPU kernel because there is a num_threads "
+ "clause of the parallel construct that "
+ "is likely to require looping \n");
+ return false;
+ }
+
+ stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (par), tloc, "parallel");
+ gomp_for *gfor;
+ if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt)))
+ return false;
+
+ if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a gridified GPGPU "
+ "kernel because the inner loop is not a simple for "
+ "loop\n");
+ return false;
+ }
+ if (gfor->collapse > 1)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a gridified GPGPU "
+ "kernel because the inner loop contains collapse "
+ "clause\n");
+ return false;
+ }
+
+ if (!seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a gridified GPGPU "
+ "kernel because the inner loop pre_body contains"
+ "a complex instruction\n");
+ return false;
+ }
+
+ clauses = gimple_omp_for_clauses (gfor);
+ tree for_sched_clause = find_omp_clause (clauses, OMP_CLAUSE_SCHEDULE);
+
+ if (for_sched_clause
+ && OMP_CLAUSE_SCHEDULE_KIND (for_sched_clause) != OMP_CLAUSE_SCHEDULE_AUTO)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, tloc,
+ "Will not turn target construct into a gridified GPGPU "
+ "kernel because the inner loop has non-automatic "
+ "scheduling clause\n");
+ return false;
+ }
+
+ *group_size_p = group_size;
+ return true;
+}
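+
+/* As a sketch of what the predicate above accepts: a region of the shape
+
+     #pragma omp target
+     #pragma omp teams thread_limit (64)
+     #pragma omp distribute parallel for
+     for (i = 0; i < n; i++)
+       a[i] = b[i] + c[i];
+
+   follows the gridifiable pattern, with the thread_limit expression returned
+   as the requested group size.  Standalone distribute constructs, collapse
+   clauses, num_teams and num_threads clauses and non-auto schedules are all
+   rejected with one of the dump notes above.  */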
+
+/* Given a freshly copied top-level kernel SEQ (which might be a bind
+ containing a single gomp_parallel or gomp_teams), identify the individual
+ components, mark them as parts of the kernel and return the inner loop.  */
+
+static gomp_for *
+find_mark_kernel_components (gimple_seq seq)
+{
+ location_t tloc = UNKNOWN_LOCATION;
+ gimple *stmt = single_stmt_in_seq_skip_bind (seq, tloc, NULL);
+ gomp_teams *teams = NULL;
+ gomp_for *dist = NULL;
+ if ((teams = dyn_cast <gomp_teams *> (stmt)))
+ {
+ gimple_omp_teams_set_kernel_phony (teams, true);
+ stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (teams), tloc, NULL);
+ gcc_checking_assert (stmt);
+ if ((dist = dyn_cast <gomp_for *> (stmt)))
+ {
+ gimple_omp_for_set_kernel_phony (dist, true);
+ stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (dist), tloc,
+ NULL);
+ gcc_checking_assert (stmt);
+ }
+ }
+ gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
+ gimple_omp_parallel_set_kernel_phony (parallel, true);
+ stmt = single_stmt_in_seq_skip_bind (gimple_omp_body (parallel), tloc, NULL);
+ gomp_for *inner_loop = as_a <gomp_for *> (stmt);
+ gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_KERNEL_BODY);
+ return inner_loop;
+}
+
+/* Operand walker, used to remap pre-body declarations according to a hash map
+ provided in DATA. */
+
+static tree
+remap_prebody_decls (tree *tp, int *walk_subtrees, void *data)
+{
+ tree t = *tp;
+
+ if (DECL_P (t) || TYPE_P (t))
+ *walk_subtrees = 0;
+ else
+ *walk_subtrees = 1;
+
+ if (TREE_CODE (t) == VAR_DECL)
+ {
+ struct walk_stmt_info *wi = (struct walk_stmt_info *) data;
+ hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info;
+ tree *repl = declmap->get (t);
+ if (repl)
+ *tp = *repl;
+ }
+ return NULL_TREE;
+}
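+
+/* A minimal sketch of how the walker above is meant to be driven, as done in
+   attempt_target_gridification below: point the info field of a
+   walk_stmt_info at the declaration map and hand the callback to
+   walk_gimple_op (or walk_tree):
+
+     struct walk_stmt_info wi;
+     memset (&wi, 0, sizeof (wi));
+     wi.info = declmap;
+     walk_gimple_op (copy, remap_prebody_decls, &wi);  */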
+
+/* If TARGET points to a GOMP_TARGET statement which follows a gridifiable
+ pattern, create a GPU kernel for it.  GSI must point to the same statement
+ and TGT_BIND is the enclosing bind into which temporaries emitted before
+ TARGET should be added.  */
+
+static tree
+attempt_target_gridification (gomp_target *target, gimple_stmt_iterator *gsi,
+ gbind *tgt_bind)
+{
+ tree group_size;
+ if (!target || !target_follows_gridifiable_pattern (target, &group_size))
+ return NULL_TREE;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, gimple_location (target),
+ "Target construct will be turned into a gridified GPGPU "
+ "kernel\n");
+
+ /* Copy target body to a GPUKERNEL construct: */
+ gimple_seq kernel_seq = copy_gimple_seq_and_replace_locals
+ (gimple_omp_body (target));
+ gomp_for *inner_loop = find_mark_kernel_components (kernel_seq);
+
+ gbind *old_bind = as_a <gbind *> (gimple_seq_first (gimple_omp_body (target)));
+ gbind *new_bind = as_a <gbind *> (gimple_seq_first (kernel_seq));
+ tree new_block = gimple_bind_block (new_bind);
+ tree enc_block = BLOCK_SUPERCONTEXT (gimple_bind_block (old_bind));
+ BLOCK_CHAIN (new_block) = BLOCK_SUBBLOCKS (enc_block);
+ BLOCK_SUBBLOCKS (enc_block) = new_block;
+ BLOCK_SUPERCONTEXT (new_block) = enc_block;
+ gimple *gpukernel = gimple_build_omp_gpukernel (kernel_seq);
+ gimple_seq_add_stmt
+ (gimple_bind_body_ptr (as_a <gbind *> (gimple_omp_body (target))),
+ gpukernel);
+
+ /* Copy loop pre-body before target: */
+ hash_map<tree, tree> *declmap = new hash_map<tree, tree>;
+ gimple_seq prebody = gimple_omp_for_pre_body (inner_loop);
+ gimple_seq pretarget = NULL;
+ gimple_stmt_iterator pbi;
+ struct walk_stmt_info wi;
+ memset (&wi, 0, sizeof (struct walk_stmt_info));
+ wi.info = declmap;
+ for (pbi = gsi_start (prebody); !gsi_end_p (pbi); gsi_next (&pbi))
+ {
+ gassign *stmt = as_a <gassign *> (gsi_stmt (pbi));
+ tree lhs = gimple_assign_lhs (stmt);
+ tree repl = copy_var_decl (lhs, create_tmp_var_name (NULL),
+ TREE_TYPE (lhs));
+ DECL_CONTEXT (repl) = current_function_decl;
+ gimple_bind_append_vars (tgt_bind, repl);
+
+ declmap->put (lhs, repl);
+ gassign *copy = as_a <gassign *> (gimple_copy (stmt));
+ walk_gimple_op (copy, remap_prebody_decls, &wi);
+ gimple_seq_add_stmt (&pretarget, copy);
+ }
+ gsi_insert_seq_before (gsi, pretarget, GSI_SAME_STMT);
+
+ target->kernel_group_size = group_size;
+ size_t collapse = inner_loop->collapse;
+ target->kernel_collapse = collapse;
+ target->kernel_iter = ggc_cleared_vec_alloc<gimple_omp_for_iter> (collapse);
+ for (size_t i = 0; i < collapse; i++)
+ {
+ target->kernel_iter[i] = inner_loop->iter[i];
+ walk_tree (&target->kernel_iter[i].initial, remap_prebody_decls, &wi,
+ NULL);
+ walk_tree (&target->kernel_iter[i].final, remap_prebody_decls, &wi, NULL);
+ }
+
+ delete declmap;
+ return NULL_TREE;
+}
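+
+/* Continuing the example above: after a successful gridification the bind
+   inside the target body contains both the original construct nest and a
+   trailing GIMPLE_OMP_GPUKERNEL wrapping a phony copy of it, roughly
+
+     #pragma omp target
+       bind { teams { distribute { parallel { for ... } } }
+              gpukernel { teams (phony) { distribute (phony)
+                { parallel (phony) { for (kernel body) ... } } } } }
+
+   while copies of any inner-loop pre-body assignments, rewritten to fresh
+   temporaries, are emitted in front of the target statement and the remapped
+   loop bounds are recorded in kernel_iter so that the grid size can be
+   computed at launch time.  */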
+
+/* Walker function doing all the work for create_target_gpukernels.  */
+
+static tree
+create_target_gpukernel_stmt (gimple_stmt_iterator *gsi, bool *handled_ops_p,
+ struct walk_stmt_info *incoming)
+{
+ *handled_ops_p = false;
+
+ gimple *stmt = gsi_stmt (*gsi);
+ gomp_target *target = dyn_cast <gomp_target *> (stmt);
+ if (target)
+ {
+ gbind *tgt_bind = (gbind *) incoming->info;
+ gcc_checking_assert (tgt_bind);
+ attempt_target_gridification (target, gsi, tgt_bind);
+ return NULL_TREE;
+ }
+ gbind *bind = dyn_cast <gbind *> (stmt);
+ if (bind)
+ {
+ *handled_ops_p = true;
+ struct walk_stmt_info wi;
+ memset (&wi, 0, sizeof (wi));
+ wi.info = bind;
+ walk_gimple_seq_mod (gimple_bind_body_ptr (bind),
+ create_target_gpukernel_stmt, NULL, &wi);
+ }
+ return NULL_TREE;
+}
+
+/* Prepare all target constructs in BODY_P for GPU kernel generation, if they
+ follow a gridifiable pattern. All such targets will have their bodies
+ duplicated, with the new copy being put into a gpukernel. All
+ kernel-related constructs within the gpukernel will be marked with phony
+ flags or kernel kinds. Moreover, some re-structuring is often needed, such
+ as copying pre-bodies before the target construct so that kernel grid sizes
+ can be computed. */
+
+static void
+create_target_gpukernels (gimple_seq *body_p)
+{
+ struct walk_stmt_info wi;
+ memset (&wi, 0, sizeof (wi));
+ walk_gimple_seq_mod (body_p, create_target_gpukernel_stmt, NULL, &wi);
+}
+
+
/* Main entry point. */
static unsigned int
@@ -12843,6 +12975,10 @@ execute_lower_omp (void)
delete_omp_context);
body = gimple_body (current_function_decl);
+
+ if (hsa_gen_requested_p () && !flag_disable_hsa_gridification)
+ create_target_gpukernels (&body);
+
scan_omp (&body, NULL);
gcc_assert (taskreg_nesting_level == 0);
FOR_EACH_VEC_ELT (taskreg_contexts, i, ctx)