@@ -1426,6 +1426,7 @@ OBJS = \
graphite-poly.o \
graphite-scop-detection.o \
graphite-sese-to-poly.o \
+ graphite-oacc.o \
gtype-desc.o \
haifa-sched.o \
hash-map-tests.o \
@@ -351,6 +351,7 @@ alloc_loop (void)
loop->exits = ggc_cleared_alloc<loop_exit> ();
loop->exits->next = loop->exits->prev = loop->exits;
loop->can_be_parallel = false;
+ loop->can_be_parallel_valid_p = false;
loop->constraints = 0;
loop->nb_iterations_upper_bound = 0;
loop->nb_iterations_likely_upper_bound = 0;
@@ -213,6 +213,12 @@ public:
/* True if the loop can be parallel. */
unsigned can_be_parallel : 1;
+ /* True if the can_be_parallel flag is valid, i.e. the
+ parallelizability of the loop has been analyzed. This can be
+ used to distinguish between unparallelizable loops and a failed
+ analysis, e.g. to provide better diagnostic messages. */
+ unsigned can_be_parallel_valid_p : 1;
+
/* True if -Waggressive-loop-optimizations warned about this loop
already. */
unsigned warned_aggressive_loop_optimizations : 1;
@@ -1017,6 +1017,7 @@ copy_loop_info (class loop *loop, class loop *target)
target->simdlen = loop->simdlen;
target->constraints = loop->constraints;
target->can_be_parallel = loop->can_be_parallel;
+ target->can_be_parallel_valid_p = loop->can_be_parallel_valid_p;
target->warned_aggressive_loop_optimizations
|= loop->warned_aggressive_loop_optimizations;
target->dont_vectorize = loop->dont_vectorize;
@@ -5941,7 +5941,14 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
}
if (lhs)
+ {
+ //TODO Earlier check for ICE as reported in <http://mid.mail-archive.com/878s9zgir3.fsf@euler.schwinge.homeip.net>.
+ //TODO Not sure if this makes too much sense to have (just) here -- should probably be moved (way) further up in the pipeline?
+ if (TREE_CODE (TREE_TYPE (lhs)) == REFERENCE_TYPE)
+ gcc_checking_assert (is_gimple_addressable (var));
+
gimplify_assign (lhs, var, &seq);
+ }
pop_gimplify_context (NULL);
gsi_replace_with_seq (&gsi, seq, true);
@@ -14434,14 +14434,22 @@ Maximum depth of logical expression evaluation ranger will look through
when evaluating outgoing edge ranges.
@item openacc-kernels
-Specify mode of OpenACC `kernels' constructs handling.
-With @option{--param=openacc-kernels=decompose}, OpenACC `kernels'
+Specify mode of OpenACC `kernels' constructs handling. With
+@option{--param=openacc-kernels=decompose}, OpenACC `kernels'
constructs are decomposed into parts, a sequence of compute
-constructs, each then handled individually.
-This is work in progress.
+constructs, each then handled individually. The data dependence
+analysis that is necessary to determine if loops can be parallelized
+is performed by the Graphite pass.
+This is the default.
+With @option{--param=openacc-kernels=decompose-parloops}, OpenACC
+`kernels' constructs are decomposed into parts, a sequence of compute
+constructs, each then handled individually by the @samp{parloops}
+pass.
+This is deprecated.
With @option{--param=openacc-kernels=parloops}, OpenACC `kernels'
-constructs are handled by the @samp{parloops} pass, en bloc.
-This is the current default.
+constructs are handled by the @samp{parloops} pass, en bloc. This is
+deprecated.
+This is deprecated.
@end table
@@ -248,9 +248,9 @@ constraints in order to generate the points-to sets. It is located in
This is a pass group for processing OpenACC kernels regions. It is a
subpass of the IPA OpenACC pass group that runs on offloaded functions
-containing OpenACC kernels loops. It is located in
-@file{tree-ssa-loop.c} and is described by
-@code{pass_ipa_oacc_kernels}.
+containing OpenACC kernels loops if @samp{parloops} based handling of
+kernels regions is used. It is located in @file{tree-ssa-loop.c} and
+is described by @code{pass_ipa_oacc_kernels}.
@item Target clone
@@ -424,6 +424,7 @@ enum evrp_mode
enum openacc_kernels
{
OPENACC_KERNELS_DECOMPOSE,
+ OPENACC_KERNELS_DECOMPOSE_PARLOOPS,
OPENACC_KERNELS_PARLOOPS
};
@@ -1769,6 +1769,9 @@ dump_gimple_omp_target (pretty_printer *buffer, const gomp_target *gs,
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
kind = " oacc_parallel_kernels_gang_single";
break;
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+ kind = " oacc_parallel_kernels_graphite";
+ break;
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
kind = " oacc_data_kernels";
break;
@@ -161,7 +161,7 @@ enum gf_mask {
GF_OMP_FOR_KIND_SIMD = 5,
GF_OMP_FOR_COMBINED = 1 << 3,
GF_OMP_FOR_COMBINED_INTO = 1 << 4,
- GF_OMP_TARGET_KIND_MASK = (1 << 4) - 1,
+ GF_OMP_TARGET_KIND_MASK = (1 << 5) - 1,
GF_OMP_TARGET_KIND_REGION = 0,
GF_OMP_TARGET_KIND_DATA = 1,
GF_OMP_TARGET_KIND_UPDATE = 2,
@@ -184,6 +184,9 @@ enum gf_mask {
/* A 'GF_OMP_TARGET_KIND_OACC_DATA' representing an OpenACC 'kernels'
decomposed parts' 'data' construct. */
GF_OMP_TARGET_KIND_OACC_DATA_KERNELS = 15,
+ /* A GF_OMP_TARGET_KIND_OACC_PARALLEL that originates from a 'kernels'
+ construct, for Graphite to analyze. */
+ GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE = 16,
GF_OMP_TEAMS_HOST = 1 << 0,
/* True on an GIMPLE_OMP_RETURN statement if the return does not require
@@ -6619,6 +6622,7 @@ is_gimple_omp_oacc (const gimple *stmt)
case GF_OMP_TARGET_KIND_OACC_DECLARE:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
return true;
@@ -6648,6 +6652,7 @@ is_gimple_omp_offloaded (const gimple *stmt)
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
return true;
default:
return false;
@@ -12934,11 +12934,9 @@ gimplify_omp_for (tree *expr_p, gimple_seq *pre_p)
&& outer->region_type != ORT_ACC_KERNELS)
outer = outer->outer_context;
- /* FIXME: Reductions only work in parallel regions at present. We avoid
- doing the reduction localization transformation in kernels regions
- here, because the code to remove reductions in kernels regions cannot
- handle that. */
- if (outer && outer->region_type == ORT_ACC_PARALLEL)
+ if (outer && (outer->region_type == ORT_ACC_PARALLEL
+ || (outer->region_type == ORT_ACC_KERNELS
+ && param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE)))
localize_reductions (OMP_FOR_CLAUSES (for_stmt),
OMP_FOR_BODY (for_stmt));
}
@@ -14472,8 +14470,9 @@ gimplify_omp_workshare (tree *expr_p, gimple_seq *pre_p)
{
push_gimplify_context ();
- /* FIXME: Reductions are not supported in kernels regions yet. */
- if (/*ort == ORT_ACC_KERNELS ||*/ ort == ORT_ACC_PARALLEL)
+ if (ort == ORT_ACC_PARALLEL
+ || (ort == ORT_ACC_KERNELS
+ && param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE))
localize_reductions (OMP_CLAUSES (expr), OMP_BODY (expr));
gimple *g = gimplify_and_return_first (OMP_BODY (expr), &body);
@@ -38,6 +38,9 @@ along with GCC; see the file COPYING3. If not see
#include "cfgloop.h"
#include "tree-data-ref.h"
#include "graphite.h"
+#include "graphite-oacc.h"
+#include "gimple-pretty-print.h"
+
/* Add the constraints from the set S to the domain of MAP. */
@@ -63,71 +66,108 @@ add_pdr_constraints (poly_dr_p pdr, poly_bb_p pbb)
return constrain_domain (x, isl_set_copy (pbb->domain));
}
-/* Returns an isl description of all memory operations in SCOP. The memory
- reads are returned in READS and writes in MUST_WRITES and MAY_WRITES. */
+/* Returns an isl description of all memory operations in SCOP. The
+ memory reads are returned in READS and writes in MUST_WRITES and
+ MAY_WRITES, kills go to KILLS. */
static void
scop_get_reads_and_writes (scop_p scop, isl_union_map *&reads,
isl_union_map *&must_writes,
- isl_union_map *&may_writes)
+ isl_union_map *&may_writes,
+ isl_union_map *&kills)
{
int i, j;
poly_bb_p pbb;
poly_dr_p pdr;
FOR_EACH_VEC_ELT (scop->pbbs, i, pbb)
+ {
+ FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr)
{
- FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr) {
- if (pdr_read_p (pdr))
- {
- if (dump_file)
- {
- fprintf (dump_file, "Adding read to depedence graph: ");
- print_pdr (dump_file, pdr);
- }
- isl_union_map *um
- = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
- reads = isl_union_map_union (reads, um);
- if (dump_file)
- {
- fprintf (dump_file, "Reads depedence graph: ");
- print_isl_union_map (dump_file, reads);
- }
- }
- else if (pdr_write_p (pdr))
- {
- if (dump_file)
- {
- fprintf (dump_file, "Adding must write to depedence graph: ");
- print_pdr (dump_file, pdr);
- }
- isl_union_map *um
- = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
- must_writes = isl_union_map_union (must_writes, um);
- if (dump_file)
- {
- fprintf (dump_file, "Must writes depedence graph: ");
- print_isl_union_map (dump_file, must_writes);
- }
- }
- else if (pdr_may_write_p (pdr))
- {
- if (dump_file)
- {
- fprintf (dump_file, "Adding may write to depedence graph: ");
- print_pdr (dump_file, pdr);
- }
- isl_union_map *um
- = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
- may_writes = isl_union_map_union (may_writes, um);
- if (dump_file)
- {
- fprintf (dump_file, "May writes depedence graph: ");
- print_isl_union_map (dump_file, may_writes);
- }
- }
- }
+ isl_union_map *um = NULL;
+
+ if (pdr->is_reduction)
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "Skipped reduction variable %s in statement .\n",
+ pdr_write_p (pdr) ? "read" : "write");
+ print_gimple_stmt (dump_file, pdr->stmt, 0, dump_flags);
+ fprintf (dump_file, "\n");
+ }
+ continue;
+ }
+
+ if (pdr_read_p (pdr))
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Adding %sread to dependence graph: ",
+ pdr->is_reduction ? "reduction " : "");
+ print_pdr (dump_file, pdr);
+ isl_map* tmp = add_pdr_constraints (pdr, pbb);
+ print_isl_map (dump_file, tmp);
+ isl_map_free (tmp);
+ }
+ um = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
+
+ reads = isl_union_map_union (reads, um);
+ if (dump_file)
+ {
+ fprintf (dump_file, "Reads dependence graph: ");
+ print_isl_union_map (dump_file, reads);
+ }
+ }
+ else if (pdr_write_p (pdr))
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Adding %smust write to dependence graph: ",
+ pdr->is_reduction ? "reduction " : "");
+ print_pdr (dump_file, pdr);
+ }
+
+
+ um = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
+
+ must_writes = isl_union_map_union (must_writes, um);
+ }
+ else if (pdr_may_write_p (pdr))
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Adding %smay write to dependence graph: ",
+ pdr->is_reduction ? "reduction " : "");
+ print_pdr (dump_file, pdr);
+ }
+ um = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
+
+ may_writes = isl_union_map_union (may_writes, um);
+ if (dump_file)
+ {
+ fprintf (dump_file, "May writes dependence graph: ");
+ print_isl_union_map (dump_file, may_writes);
+ }
+ }
+ else if (pdr_kill_p (pdr))
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Adding kill to dependence graph: ");
+ print_pdr (dump_file, pdr);
+ }
+ um = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
+
+ kills = isl_union_map_union (kills, um);
+ if (dump_file)
+ {
+ fprintf (dump_file, "Kills: ");
+ print_isl_union_map (dump_file, kills);
+ }
+ }
}
+ }
}
/* Helper function used on each MAP of a isl_union_map. Computes the
@@ -203,7 +243,19 @@ apply_schedule_on_deps (__isl_keep isl_union_map *schedule,
isl_union_map *trans = extend_schedule (isl_union_map_copy (schedule));
isl_union_map *ux = isl_union_map_copy (deps);
ux = isl_union_map_apply_domain (ux, isl_union_map_copy (trans));
+ if (dump_file && dump_flags & TDF_DETAILS)
+ {
+ fprintf (dump_file, "Applied domain map to dependences:\n");
+ print_isl_union_map (dump_file, ux);
+ }
ux = isl_union_map_apply_range (ux, trans);
+
+ if (dump_file && dump_flags & TDF_DETAILS)
+ {
+ fprintf (dump_file, "Applied range map:\n");
+ print_isl_union_map (dump_file, ux);
+ }
+
ux = isl_union_map_coalesce (ux);
if (!isl_union_map_is_empty (ux))
@@ -230,6 +282,12 @@ carries_deps (__isl_keep isl_union_map *schedule,
if (x == NULL)
return false;
+ if (dump_file && dump_flags & TDF_DETAILS)
+ {
+ fprintf (dump_file, "Applied schedule on dependences:\n");
+ print_isl_map (dump_file, x);
+ }
+
isl_space *space = isl_map_get_space (x);
isl_map *lex = isl_map_lex_le (isl_space_range (space));
isl_constraint *ineq = isl_inequality_alloc
@@ -244,7 +302,22 @@ carries_deps (__isl_keep isl_union_map *schedule,
ineq = isl_constraint_set_constant_si (ineq, -1);
lex = isl_map_add_constraint (lex, ineq);
lex = isl_map_coalesce (lex);
+
+
+ if (dump_file && dump_flags & TDF_DETAILS)
+ {
+ fprintf (dump_file, "Lex: \n");
+ print_isl_map (dump_file, lex);
+ }
+
x = isl_map_intersect (x, lex);
+
+ if (dump_file && dump_flags & TDF_DETAILS)
+ {
+ fprintf (dump_file, "Intersect: \n");
+ print_isl_map (dump_file, x);
+ }
+
bool res = !isl_map_is_empty (x);
isl_map_free (x);
@@ -265,8 +338,9 @@ scop_get_dependences (scop_p scop)
isl_space *space = isl_set_get_space (scop->param_context);
isl_union_map *reads = isl_union_map_empty (isl_space_copy (space));
isl_union_map *must_writes = isl_union_map_empty (isl_space_copy (space));
- isl_union_map *may_writes = isl_union_map_empty (space);
- scop_get_reads_and_writes (scop, reads, must_writes, may_writes);
+ isl_union_map *may_writes = isl_union_map_empty (isl_space_copy (space));
+ isl_union_map *kills = isl_union_map_empty (space);
+ scop_get_reads_and_writes (scop, reads, must_writes, may_writes, kills);
if (dump_file)
{
@@ -282,10 +356,11 @@ scop_get_dependences (scop_p scop)
fprintf (dump_file, " [1, i0] is a 'memref' with alias set 1"
" and first subscript access i0.\n");
fprintf (dump_file, " [106] is a 'scalar reference' which is the sum of"
- " SSA_NAME_VERSION 6"
- " and --param graphite-max-arrays-per-scop=100\n");
+ " SSA_NAME_VERSION 6 and scop->max_alias_set whose value\n is 100"
+ " in this example.\n");
fprintf (dump_file, "-----------------------\n\n");
+ fprintf (dump_file, "max_alias_set: %d\n", scop->max_alias_set);
fprintf (dump_file, "data references (\n");
fprintf (dump_file, " reads: ");
print_isl_union_map (dump_file, reads);
@@ -293,31 +368,59 @@ scop_get_dependences (scop_p scop)
print_isl_union_map (dump_file, must_writes);
fprintf (dump_file, " may_writes: ");
print_isl_union_map (dump_file, may_writes);
+ fprintf (dump_file, " kills: ");
+ print_isl_union_map (dump_file, kills);
fprintf (dump_file, ")\n");
}
gcc_assert (scop->original_schedule);
+
isl_union_access_info *ai;
ai = isl_union_access_info_from_sink (isl_union_map_copy (reads));
ai = isl_union_access_info_set_must_source (ai, isl_union_map_copy (must_writes));
ai = isl_union_access_info_set_may_source (ai, may_writes);
+ ai = isl_union_access_info_set_kill (ai, isl_union_map_copy (kills));
ai = isl_union_access_info_set_schedule
(ai, isl_schedule_copy (scop->original_schedule));
isl_union_flow *flow = isl_union_access_info_compute_flow (ai);
isl_union_map *raw = isl_union_flow_get_must_dependence (flow);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "raw dependences (\n");
+ print_isl_union_map (dump_file, raw);
+ fprintf (dump_file, ")\n");
+ }
+
isl_union_flow_free (flow);
ai = isl_union_access_info_from_sink (isl_union_map_copy (must_writes));
ai = isl_union_access_info_set_must_source (ai, must_writes);
ai = isl_union_access_info_set_may_source (ai, reads);
+ ai = isl_union_access_info_set_kill (ai, kills);
ai = isl_union_access_info_set_schedule
(ai, isl_schedule_copy (scop->original_schedule));
flow = isl_union_access_info_compute_flow (ai);
isl_union_map *waw = isl_union_flow_get_must_dependence (flow);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "waw dependences (\n");
+ print_isl_union_map (dump_file, waw);
+ fprintf (dump_file, ")\n");
+ }
isl_union_map *war = isl_union_flow_get_may_dependence (flow);
war = isl_union_map_subtract (war, isl_union_map_copy (waw));
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "war dependences (\n");
+ print_isl_union_map (dump_file, war);
+ fprintf (dump_file, ")\n");
+ }
+
isl_union_flow_free (flow);
raw = isl_union_map_coalesce (raw);
@@ -331,6 +434,9 @@ scop_get_dependences (scop_p scop)
if (dump_file)
{
+ fprintf (dump_file, "(space: " );
+ print_isl_space (dump_file, space);
+ fprintf (dump_file, ")\n");
fprintf (dump_file, "data dependences (\n");
print_isl_union_map (dump_file, dependences);
fprintf (dump_file, ")\n");
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3. If not see
#include "tree-ssa.h"
#include "tree-vectorizer.h"
#include "graphite.h"
+#include "graphite-oacc.h"
+#include "stdlib.h"
struct ast_build_info
{
@@ -1456,8 +1458,8 @@ generate_entry_out_of_ssa_copies (edge false_entry,
}
}
-/* Create a condition that evaluates to TRUE if all ALIAS_DDRS are free of
- aliasing. */
+/* Create a condition that evaluates to TRUE if all ALIAS_DDRS
+ are free of aliasing. */
static tree
generate_alias_cond (vec<ddr_p> &alias_ddrs, loop_p context_loop)
@@ -1618,4 +1620,91 @@ graphite_regenerate_ast_isl (scop_p scop)
return !t.codegen_error_p ();
}
+/* A callback for traversing a schedule tree that visits the band
+ nodes of a schedule which correspond to loops. Checks if the local
+ schedule carries any dependencies and marks the corresponding CFG
+ loops as being parallelizable accordingly. */
+
+static isl_bool
+visit_schedule_loop_node (__isl_keep isl_schedule_node *node, void *user)
+{
+ isl_bool visit_children = isl_bool_true;
+
+ if (isl_schedule_node_get_type (node) != isl_schedule_node_band)
+ return visit_children;
+
+ isl_union_map *dependences = (isl_union_map *)user;
+ isl_union_map *schedule
+ = isl_schedule_node_band_get_partial_schedule_union_map (node);
+ isl_space *space = isl_schedule_node_band_get_space (node);
+
+ isl_id *id = isl_space_get_tuple_id (space, isl_dim_out);
+ const char *name = isl_id_get_name (id);
+ /* Expect format set by add_loop_schedule, i.e. "L_n" */
+ gcc_checking_assert (name[0] == 'L' && name[1] == '_');
+ int loop_num = atoi (name + 2);
+ isl_id_free (id);
+
+ int dimension = isl_space_dim (space, isl_dim_out);
+ loop_p loop = get_loop (cfun, loop_num);
+
+ if (dump_file && dump_flags & TDF_DETAILS)
+ {
+ fprintf (dump_file, "CFG loop %d:\n", loop_num);
+ print_isl_union_map (dump_file, schedule);
+ fprintf (dump_file, "Schedule dimension: %d\n", dimension);
+
+ fprintf (dump_file, "Schedule node space:\n");
+ print_isl_space (dump_file, space);
+ fprintf (dump_file, "data dependences (\n");
+ print_isl_union_map (dump_file, dependences);
+ fprintf (dump_file, ")\n");
+ }
+
+ bool has_deps = carries_deps (schedule, dependences, dimension);
+
+ loop->can_be_parallel = !has_deps;
+ loop->can_be_parallel_valid_p = true;
+
+ if (dump_file && dump_flags & TDF_DETAILS)
+ {
+ dump_user_location_t loc = find_loop_location (loop);
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
+ "loop %s data-dependences.\n",
+ has_deps ? "has" : "has no");
+
+ fprintf (dump_file, ")\n");
+ }
+
+ isl_union_map_free (schedule);
+ isl_space_free (space);
+
+
+ return visit_children;
+}
+
+/* This function performs data-dependence analysis on the SCoP without using
+ Graphite's code generation. This is meant for OpenACC use since the code
+ generator is unable to reconstruct the OpenACC loop structure. */
+
+bool
+graphite_oacc_analyze_scop (scop_p scop)
+{
+ timevar_push (TV_GRAPHITE_CODE_GEN);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "[graphite_oacc_analyze_scop] schedule:\n");
+ print_isl_schedule (dump_file, scop->original_schedule);
+ }
+
+ /* Analyze dependences in SCoP and mark loops as parallelizable accordingly. */
+ isl_schedule_foreach_schedule_node_top_down (
+ scop->original_schedule, visit_schedule_loop_node, scop->dependence);
+
+ timevar_pop (TV_GRAPHITE_CODE_GEN);
+
+ return true;
+}
+
#endif /* HAVE_isl */
new file mode 100644
@@ -0,0 +1,689 @@
+/* Functions for analyzing the OpenACC loop structure from Graphite.
+
+ Copyright (C) 2021 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "cfghooks.h"
+#include "tree.h"
+#include "gimple.h"
+#include "cfgloop.h"
+
+#include "internal-fn.h"
+#include "gimple.h"
+#include "tree-cfg.h"
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "print-tree.h"
+
+#include "gimple-ssa.h"
+#include "gimple-iterator.h"
+#include "tree-phinodes.h"
+#include "tree-ssa-operands.h"
+#include "ssa-iterators.h"
+#include "omp-general.h"
+#include "graphite-oacc.h"
+
+unsigned
+gimple_call_internal_kind (gimple *call)
+{
+ return TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+}
+
+static bool inline gimple_call_ifn_unique_p (gimple *call,
+ enum ifn_unique_kind kind)
+{
+ if (!gimple_call_internal_p (call, IFN_UNIQUE))
+ return false;
+
+ return kind == gimple_call_internal_kind (call);
+}
+
+static bool inline goacc_reduction_call_p (gimple *call)
+{
+ return gimple_call_internal_p (call, IFN_GOACC_REDUCTION);
+}
+
+static bool inline goacc_reduction_call_p (gimple *call,
+ enum ifn_goacc_reduction_kind kind)
+{
+ return gimple_call_internal_p (call, IFN_GOACC_REDUCTION)
+ && gimple_call_internal_kind (call) == kind;
+}
+
+/* Check if VAR is private in the OpenACC loop that encloses the cfg LOOP. The
+ function returns TRUE if there is an IFN_UNIQUE_OACC_PRIVATE call in the
+ head sequence that precedes the CFG loop. */
+
+bool
+is_oacc_private (tree var, loop_p loop)
+{
+ return false;
+
+ if (TREE_CODE (var) == SSA_NAME)
+ {
+ if (!SSA_NAME_VAR (var))
+ return false;
+
+ var = SSA_NAME_VAR (var);
+ }
+
+ gcc_checking_assert (TREE_CODE (var) == VAR_DECL);
+
+ if (!loop)
+ return false;
+
+ basic_block bb = loop->header;
+ basic_block entry_bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+
+ while (bb != entry_bb)
+ {
+ bb = get_immediate_dominator (CDI_DOMINATORS, bb);
+ gimple *stmt = last_stmt (bb);
+ if (!stmt)
+ continue;
+
+ /* We are looking for the sequence of IFN_UNIQUE calls at the
+ head of the current OpenACC loop. */
+ if (!gimple_call_internal_p (stmt, IFN_UNIQUE))
+ continue;
+
+ enum ifn_unique_kind kind
+ = (enum ifn_unique_kind)TREE_INT_CST_LOW (gimple_call_arg (stmt, 0));
+
+ /* The head mark that starts the current OpenACC loop.
+ Private calls above here are irrelevant. Stop. */
+ if (kind == IFN_UNIQUE_OACC_HEAD_MARK && gimple_call_num_args (stmt) > 2)
+ break;
+
+ if (kind != IFN_UNIQUE_OACC_PRIVATE)
+ continue;
+
+ tree private_var = gimple_call_arg (stmt, 3);
+
+ if (TREE_CODE (private_var) == ADDR_EXPR)
+ private_var = TREE_OPERAND (private_var, 0);
+
+ if (var == private_var)
+ return true;
+ }
+
+ return false;
+}
+
+void
+oacc_add_private_var_kills (loop_p loop, vec<tree> *kills)
+{
+ gcc_checking_assert (loop);
+
+ basic_block bb = loop->header;
+ basic_block entry_bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+
+ while (bb != entry_bb)
+ {
+ bb = get_immediate_dominator (CDI_DOMINATORS, bb);
+
+ gimple *stmt = last_stmt (bb);
+ if (!stmt)
+ continue;
+
+ /* We are looking for the sequence of IFN_UNIQUE calls at the head of the
+ current OpenACC loop. */
+
+ if (!gimple_call_ifn_unique_p (stmt, IFN_UNIQUE_OACC_HEAD_MARK))
+ continue;
+
+ /* The head mark that starts the current OpenACC loop.
+ Private calls above here are irrelevant. Stop. */
+ if (gimple_call_num_args (stmt) > 2)
+ break;
+
+ if (!gimple_call_ifn_unique_p (stmt, IFN_UNIQUE_OACC_PRIVATE))
+ continue;
+
+ tree private_var = gimple_call_arg (stmt, 3);
+
+ gcc_checking_assert (TREE_CODE (private_var) == ADDR_EXPR);
+ private_var = TREE_OPERAND (private_var, 0);
+ kills->safe_push (private_var);
+ }
+}
+
+typedef std::pair<gcall *, gcall *> gcall_pair;
+
+/* Returns a pair that contains the internal function calls that start
+ and end the head sequence of the OpenACC loop enclosing the cfg
+ loop LOOP or a pair of NULL pointers if LOOP is not enclosed in a
+ OpenACC LOOP. */
+
+gcall_pair
+find_oacc_head_marks (loop_p loop)
+{
+ basic_block bb = loop->header;
+ basic_block entry_bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+
+ gcall *top_head_mark = NULL;
+ gcall *bottom_head_mark = NULL;
+
+ while (bb != entry_bb)
+ {
+ bb = get_immediate_dominator (CDI_DOMINATORS, bb);
+
+ gimple *stmt = last_stmt (bb);
+ if (!stmt)
+ continue;
+
+ /* Look for IFN_UNIQUE calls in the head of OpenACC loop. */
+ if (!gimple_call_ifn_unique_p (stmt, IFN_UNIQUE_OACC_HEAD_MARK))
+ continue;
+
+ if (!bottom_head_mark)
+ {
+ bottom_head_mark = as_a<gcall *> (stmt);
+ continue;
+ }
+
+ /* The head mark that starts the current OpenACC loop can be
+ recognized by the number of call arguments, cf. omp-low.c. */
+ if (gimple_call_num_args (stmt) > 3)
+ {
+ top_head_mark = as_a<gcall *> (stmt);
+ break;
+ }
+ }
+
+ gcc_checking_assert ((top_head_mark && bottom_head_mark)
+ || (!top_head_mark && !bottom_head_mark));
+
+ return gcall_pair (top_head_mark, bottom_head_mark);
+}
+
+/* Returns the internal function call that starts the tail sequence of the
+ OpenACC loop that encloses the CFG loop LOOP or NULL if LOOP is not
+ contained in an OpenACC loop. */
+
+gcall *
+find_oacc_top_tail_mark (loop_p loop)
+{
+ gcall_pair head_marks = find_oacc_head_marks (loop);
+
+ if (!head_marks.first || !head_marks.second)
+ return NULL;
+
+ tree data_dep = gimple_call_lhs (head_marks.second);
+ gcc_checking_assert (has_single_use (data_dep));
+
+ gimple *tail_mark;
+ use_operand_p use_p;
+ single_imm_use (data_dep, &use_p, &tail_mark);
+
+ return as_a<gcall *> (tail_mark);
+}
+
+/* Returns a pair containing the internal function calls that start and end the
+ tail sequence of the OpenACC loop that encloses the cfg loop LOOP or a pair
+ of NULL pointers if LOOP does not belong to an OpenACC loop. */
+
+gcall_pair
+find_oacc_tail_marks (loop_p loop)
+{
+ gcall *top_tail_mark = find_oacc_top_tail_mark (loop);
+
+ if (!top_tail_mark)
+ return gcall_pair (NULL, NULL);
+
+ tree data_dep = gimple_call_lhs (top_tail_mark);
+ gimple *stmt = top_tail_mark;
+
+ while (has_single_use (data_dep))
+ {
+ use_operand_p use_p;
+ single_imm_use (data_dep, &use_p, &stmt);
+ data_dep = gimple_call_lhs (stmt);
+
+ gcc_checking_assert (gimple_call_internal_p (stmt));
+ }
+
+ gcall *end_tail_mark = as_a<gcall *> (stmt);
+
+ gcc_checking_assert (
+ gimple_call_ifn_unique_p (end_tail_mark, IFN_UNIQUE_OACC_TAIL_MARK));
+
+ return gcall_pair (top_tail_mark, end_tail_mark);
+}
+
+/* Add all ssa names to VARS that can be reached from PHI by a
+ phi node walk. */
+
+static void
+collect_oacc_reduction_vars_phi_walk (gphi *phi, hash_set<tree> &vars)
+{
+ use_operand_p use_p;
+ ssa_op_iter iter;
+ FOR_EACH_PHI_ARG (use_p, phi, iter, SSA_OP_ALL_USES)
+ {
+ tree use = USE_FROM_PTR (use_p);
+ if (TREE_CODE (use) != SSA_NAME)
+ continue;
+
+ if (vars.contains (use))
+ continue;
+
+ gimple *def_stmt = SSA_NAME_DEF_STMT (use);
+ vars.add (use);
+
+ gphi *use_phi = dyn_cast<gphi *> (def_stmt);
+ if (use_phi)
+ {
+ collect_oacc_reduction_vars_phi_walk (use_phi, vars);
+
+ continue;
+ }
+ }
+}
+
+/* Returns true iff following the immediate use chain from the
+ IFN_GOACC_REDUCTION call CALL leads out of loop that contains CALL. */
+
+static bool
+reduction_use_in_outer_loop_p (gcall *call)
+{
+ gcc_checking_assert (goacc_reduction_call_p (call));
+
+ tree data_dep = gimple_call_lhs (call);
+
+ /* The IFN_GOACC_REDUCTION_CALLS are linked in a chain through
+ immediate uses. Move to the end of this chain. */
+ gimple *stmt = call;
+ while (has_single_use (data_dep))
+ {
+ use_operand_p use_p;
+ single_imm_use (data_dep, &use_p, &stmt);
+
+ if (!goacc_reduction_call_p (stmt))
+ return true;
+
+ data_dep = gimple_call_lhs (stmt);
+ }
+
+ gcc_checking_assert (goacc_reduction_call_p (stmt));
+
+ /* Call starting further reduction use in outer loop. */
+ if (goacc_reduction_call_p (stmt, IFN_GOACC_REDUCTION_SETUP))
+ return true;
+
+ /* Reduction use ends with last internal call in present loop. */
+ if (goacc_reduction_call_p (stmt, IFN_GOACC_REDUCTION_TEARDOWN))
+ return false;
+ gcc_unreachable ();
+}
+
+/* Add all ssa names to VARS that can be reached from BB by walking
+ through the phi nodes which start at the result of an OpenACC
+ reduction computation in BB. */
+
+static void
+collect_oacc_reduction_vars_in_bb (basic_block bb, hash_set<tree> &vars)
+{
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ if (!goacc_reduction_call_p (stmt, IFN_GOACC_REDUCTION_FINI))
+ continue;
+
+ tree var = gimple_call_arg (stmt, 2);
+ gcc_checking_assert (TREE_CODE (var) == SSA_NAME);
+
+ if (vars.contains (var))
+ continue;
+
+ gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+ if (gimple_code (def_stmt) != GIMPLE_PHI)
+ {
+ gcc_checking_assert (goacc_reduction_call_p (def_stmt));
+
+ continue;
+ }
+
+ gcc_checking_assert (
+ goacc_reduction_call_p (stmt, IFN_GOACC_REDUCTION_FINI));
+ gcc_checking_assert (gimple_code (def_stmt) == GIMPLE_PHI);
+
+ if (reduction_use_in_outer_loop_p (as_a<gcall *> (stmt)))
+ vars.add (var);
+
+ collect_oacc_reduction_vars_phi_walk (static_cast<gphi *> (def_stmt),
+ vars);
+ }
+}
+
+/* Add all ssa names to VARS that are defined by phi nodes in the header of LOOP
+ such that at least one argument of the phi belongs to VARS. */
+
+static void
+collect_oacc_reduction_vars_in_loop_header (loop_p loop, hash_set<tree> &vars)
+{
+ for (gphi_iterator gpi = gsi_start_phis (loop->header); !gsi_end_p (gpi);
+ gsi_next (&gpi))
+ {
+ gphi *phi = const_cast<gphi *> (gpi.phi ());
+
+ use_operand_p use_p;
+ ssa_op_iter iter;
+ FOR_EACH_PHI_ARG (use_p, phi, iter, SSA_OP_ALL_USES)
+ {
+ tree use = USE_FROM_PTR (use_p);
+ if (vars.contains (use))
+ vars.add (gimple_phi_result (phi));
+ }
+ }
+}
+
+/* Find the ssa names that belong to an OpenACC reduction in the OpenACC loop
+ that surrounds the cfg loop LOOP and add them to VARS. LOOP must be
+ contained in an OpenACC loop.
+
+ Since the reductions have not and cannot be lowered before execution of the
+ Graphite pass because their lowering is device dependent, Graphite needs to
+ simulate the privatization of the reduction variables by removing
+ dependences between the iteration instances of the loop and the dependences
+ arising from copying the initial value of the reduction variable in and the
+ result out.
+
+ The OpenACC lowering will copy the results of reduction computations at the
+ IFN_GOACC_REDUCTION_FINI calls. The main reduction statement can thus be
+ identified by walking from those calls through all encountered phi nodes
+ until we reach a gimple assignment statement. The ssa name defined by this
+ statement as well as the ssa_names encountered in the phis along the way are
+ recorded in VARS. In addition, the ssa name defined by each phi which uses a
+ previously identified reduction variable in LOOP's header will also be added
+ to VARS. */
+
+void
+collect_oacc_reduction_vars (loop_p loop, hash_set<tree> &vars)
+{
+ gcall_pair tail = find_oacc_tail_marks (loop);
+ bool in_openacc_loop = tail.first != NULL;
+
+ if (!in_openacc_loop)
+ return;
+
+ const gcall *top_mark = tail.first;
+ const gcall *bottom_mark = tail.second;
+
+ basic_block bb = top_mark->bb;
+ gcc_checking_assert (single_succ_p (bb));
+
+ do
+ {
+ bb = single_succ (bb);
+ collect_oacc_reduction_vars_in_bb (bb, vars);
+ }
+ while (bb != bottom_mark->bb && single_succ_p (bb));
+
+ collect_oacc_reduction_vars_in_loop_header (loop, vars);
+}
+
+static void collect_oacc_privatized_vars_phi_walk_visit_phi_uses (
+ tree var, hash_set<tree> &vars, hash_set<tree> &visited);
+
+/* Add all ssa names to VARS that can be reached from PHI by a phi node walk. */
+
+static void
+collect_oacc_privatized_vars_phi_walk (gphi *phi, hash_set<tree> &vars,
+ hash_set<tree> &visited)
+{
+ tree var = PHI_RESULT (phi);
+ bool existed = vars.add (var);
+ if (existed)
+ return;
+
+ use_operand_p use_p;
+ ssa_op_iter iter;
+ FOR_EACH_PHI_ARG (use_p, phi, iter, SSA_OP_ALL_USES)
+ {
+ tree use = USE_FROM_PTR (use_p);
+ if (TREE_CODE (use) != SSA_NAME)
+ continue;
+
+ if (visited.contains (use))
+ continue;
+
+ gimple *def_stmt = SSA_NAME_DEF_STMT (use);
+ gphi *use_phi = dyn_cast<gphi *> (def_stmt);
+ if (use_phi)
+ {
+ collect_oacc_privatized_vars_phi_walk (use_phi, vars, visited);
+ visited.add (use);
+ continue;
+ }
+
+ vars.add (use);
+
+ /* Visit the uses of USE in other phi nodes. This is used to get from loop
+ exit phis in inner loops to the loop entry phis. */
+
+ collect_oacc_privatized_vars_phi_walk_visit_phi_uses (use, vars, visited);
+ visited.add (use);
+ }
+}
+
+/* Records all uses of VAR in phis in VARS and continues the phi walk on each
+ such use. */
+
+static void
+collect_oacc_privatized_vars_phi_walk_visit_phi_uses (tree var,
+ hash_set<tree> &vars,
+ hash_set<tree> &visited)
+{
+ imm_use_iterator iter;
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_FAST (use_p, iter, var)
+ {
+ tree use = USE_FROM_PTR (use_p);
+ if (TREE_CODE (use) != SSA_NAME)
+ continue;
+
+ if (visited.contains (use))
+ continue;
+
+ gimple *use_stmt = USE_STMT (use_p);
+ gphi *use_phi = dyn_cast<gphi *> (use_stmt);
+
+ if (use_phi)
+ {
+ visited.add (PHI_RESULT (use_phi));
+ collect_oacc_privatized_vars_phi_walk (use_phi, vars, visited);
+ continue;
+ }
+
+ if (TREE_CODE (use) == SSA_NAME
+ && SSA_NAME_VAR (use) == SSA_NAME_VAR (var))
+ {
+ if (!vars.add (use))
+ collect_oacc_privatized_vars_phi_walk_visit_phi_uses (use, vars,
+ visited);
+ continue;
+ }
+ }
+
+ return;
+}
+
+/* Return the first IFN_UNIQUE call with the given KIND that follows the tail
+ sequence of the OpenACC loop surrounding LOOP. */
+
+static gcall *
+find_ifn_unique_call_below (loop_p loop, enum ifn_unique_kind kind)
+{
+ gcall_pair tail = find_oacc_tail_marks (loop);
+ bool in_openacc_loop = tail.first != NULL;
+
+ if (!in_openacc_loop)
+ return NULL;
+
+ edge exit = single_exit (loop);
+ basic_block bb = exit->dest;
+ while ((bb = get_immediate_dominator (CDI_POST_DOMINATORS, bb)))
+ {
+ gimple *stmt = last_stmt (bb);
+
+ if (!stmt)
+ continue;
+
+ if (gimple_call_ifn_unique_p (stmt, kind))
+ return static_cast<gcall *> (stmt);
+ }
+
+ return NULL;
+}
+
+/* Return the IFN_UNIQUE_OACC_PRIVATE_SCALAR call which follows the tail
+ sequence of the OpenACC loop surrounding LOOP. */
+
+gcall *
+get_oacc_private_scalars_call (loop_p loop)
+{
+ return find_ifn_unique_call_below (loop, IFN_UNIQUE_OACC_PRIVATE_SCALAR);
+}
+
+/* Return the IFN_UNIQUE_OACC_FIRSTPRIVATE call which follows the tail
+ sequence of the OpenACC loop surrounding LOOP. */
+
+gcall *
+get_oacc_firstprivate_call (loop_p loop)
+{
+ return find_ifn_unique_call_below (loop, IFN_UNIQUE_OACC_FIRSTPRIVATE);
+}
+
+/* Find the ssa names that belong to the computation of variables that are
+ "private" in the OpenACC loop that surrounds the CFG loop LOOP and add them
+ to VARS. LOOP must be contained in an OpenACC loop.
+
+ The CFG loop structure of OpenACC loops does not directly reflect the
+ privatization of the variable since the original loop has been enclosed in a
+ "chunking" loop. The "private" scalars variables are alive in those two
+ outermost CFG loops and the corresponding phis must be ignored by Graphite in
+ order to recognize the parallelizability of the loop. Omp-low.c places a
+ special internal function call after the outermost loop of a parallel region
+ whose arguments list the "private" variables that are considered here */
+
+void
+collect_oacc_privatized_vars (gcall *marker, hash_set<tree> &vars)
+{
+ if (!marker)
+ return;
+
+ gcc_checking_assert (marker->bb->loop_father->num == 0);
+
+ /* Search for phis that can be reached from the vars listed in the
+ PRIVATE_SCALARS_CALL's arguments. */
+
+ const unsigned n = gimple_call_num_args (marker);
+ for (unsigned i = 1; i < n; ++i)
+ {
+ tree arg = gimple_call_arg (marker, i);
+
+ if (TREE_CODE (arg) != SSA_NAME)
+ continue;
+
+ gimple *def_stmt = SSA_NAME_DEF_STMT (arg);
+ gphi *phi = dyn_cast<gphi *> (def_stmt);
+ if (!phi)
+ {
+ /* If the argument does not point to a phi, then it must be some value
+ defined outside of any OpenACC loop nest, i.e. a parameter of the
+ loop-nest. */
+ gcc_checking_assert (!def_stmt->bb
+ || def_stmt->bb->loop_father->num == 0);
+ continue;
+ }
+
+ hash_set<tree> visited;
+ collect_oacc_privatized_vars_phi_walk (phi, vars, visited);
+ }
+}
+
+/* Return true if LOOP is an OpenACC loop with an "auto" clause, false otherwise. */
+
+static bool
+oacc_loop_with_auto_clause_p (loop_p loop)
+{
+ gcall_pair head_marks = find_oacc_head_marks (loop);
+
+ if (!head_marks.first)
+ return false;
+
+ unsigned flags = TREE_INT_CST_LOW (gimple_call_arg (head_marks.first, 3));
+ return flags & OLF_AUTO;
+}
+
+/* Return true if FUN is an outlined OpenACC function that contains loops with
+ "auto" clauses. */
+
+static bool
+function_has_auto_loops_p (function *fun)
+{
+ gcc_checking_assert (oacc_function_p (fun));
+
+ loop_p loop;
+ FOR_EACH_LOOP_FN (fun, loop, 0)
+ if (oacc_loop_with_auto_clause_p (loop))
+ return true;
+
+ return false;
+}
+
+/* Return true if Graphite might analyze outlined OpenACC functions for the kind
+ of target region for which FUN was created. The actual decision whether
+ Graphite runs on FUN may be subject to further restrictions. */
+
+bool
+graphite_analyze_oacc_target_region_type_p (function *fun)
+{
+ gcc_checking_assert (oacc_function_p (fun));
+
+ bool is_oacc_parallel
+ = lookup_attribute ("oacc parallel",
+ DECL_ATTRIBUTES (current_function_decl))
+ != NULL;
+
+ bool is_oacc_parallel_kernels_graphite
+ = lookup_attribute ("oacc parallel_kernels_graphite",
+ DECL_ATTRIBUTES (current_function_decl))
+ != NULL;
+
+ return is_oacc_parallel || is_oacc_parallel_kernels_graphite;
+}
+
+/* Return true if FUN is an outlined OpenACC function that is going to be
+ analyzed by Graphite. */
+
+bool
+graphite_analyze_oacc_function_p (function *fun)
+{
+ gcc_checking_assert (oacc_function_p (fun));
+
+ return graphite_analyze_oacc_target_region_type_p (cfun)
+ && function_has_auto_loops_p (cfun);
+}
new file mode 100644
@@ -0,0 +1,55 @@
+/* Functions for analyzing the OpenACC loop structure from Graphite.
+
+ Copyright (C) 2021 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_GRAPHITE_OACC_H
+#define GCC_GRAPHITE_OACC_H
+
+#include "stringpool.h"
+#include "omp-general.h"
+#include "attribs.h"
+#include "cfgloop.h"
+#include "tree-pretty-print.h"
+#include "print-tree.h"
+
+static inline bool oacc_function_p (function *fun)
+{
+ return oacc_get_fn_attrib (fun->decl);
+}
+
+extern bool is_oacc_private (tree var, loop_p loop);
+extern void oacc_add_private_var_kills (loop_p loop, vec<tree> *kills);
+
+extern const gcall* find_oacc_head_mark (loop_p loop, bool last = false);
+
+extern void collect_oacc_reduction_vars (loop_p loop, hash_set<tree> &vars);
+extern void collect_oacc_firstprivate_vars (loop_p loop, hash_set<tree> &vars);
+extern void collect_oacc_private_scalars (loop_p loop, hash_set<tree> &vars);
+extern void collect_oacc_privatized_vars (gcall *marker, hash_set<tree> &vars);
+
+extern gcall* get_oacc_firstprivate_call (loop_p loop);
+extern gcall* get_oacc_private_scalars_call (loop_p loop);
+
+extern bool graphite_analyze_oacc_function_p (function *fun);
+extern bool graphite_analyze_oacc_target_region_type_p (function *fun);
+
+extern gcall* get_oacc_firstprivate_call (loop_p loop);
+extern gcall* get_oacc_private_scalars_call (loop_p loop);
+
+#endif /* GCC_GRAPHITE_OACC_H */
@@ -109,8 +109,8 @@ scop_get_domains (scop_p scop)
/* Compute the schedule for SCOP based on its parameters, domain and set of
constraints. Then apply the schedule to SCOP. */
-static bool
-optimize_isl (scop_p scop)
+bool
+optimize_isl (scop_p scop, bool oacc_enabled_graphite)
{
int old_err = isl_options_get_on_error (scop->isl_context);
int old_max_operations = isl_ctx_get_max_operations (scop->isl_context);
@@ -196,7 +196,8 @@ optimize_isl (scop_p scop)
print_schedule_ast (dump_file, scop->original_schedule, scop);
isl_schedule_free (scop->transformed_schedule);
scop->transformed_schedule = isl_schedule_copy (scop->original_schedule);
- return flag_graphite_identity || flag_loop_parallelize_all;
+ return flag_graphite_identity || flag_loop_parallelize_all
+ || oacc_enabled_graphite;
}
return true;
@@ -92,7 +92,8 @@ debug_iteration_domains (scop_p scop)
void
new_poly_dr (poly_bb_p pbb, gimple *stmt, enum poly_dr_type type,
- isl_map *acc, isl_set *subscript_sizes)
+ isl_map *acc, isl_set *subscript_sizes,
+ bool is_reduction)
{
static int id = 0;
poly_dr_p pdr = XNEW (struct poly_dr);
@@ -105,10 +106,12 @@ new_poly_dr (poly_bb_p pbb, gimple *stmt, enum poly_dr_type type,
pdr->subscript_sizes = subscript_sizes;
PDR_TYPE (pdr) = type;
PBB_DRS (pbb).safe_push (pdr);
+ pdr->is_reduction = is_reduction;
if (dump_file)
{
- fprintf (dump_file, "Converting dr: ");
+ fprintf (dump_file, "Converting%sdr: ",
+ is_reduction ? " reduction " : " ");
print_pdr (dump_file, pdr);
fprintf (dump_file, "To polyhedral representation:\n");
fprintf (dump_file, " - access functions: ");
@@ -187,6 +190,10 @@ print_pdr (FILE *file, poly_dr_p pdr)
fprintf (file, "may_write \n");
break;
+ case PDR_KILL:
+ fprintf (file, "kill \n");
+ break;
+
default:
gcc_unreachable ();
}
@@ -212,13 +219,15 @@ debug_pdr (poly_dr_p pdr)
gimple_poly_bb_p
new_gimple_poly_bb (basic_block bb, vec<data_reference_p> drs,
- vec<scalar_use> reads, vec<tree> writes)
+ vec<scalar_use> reads, vec<tree> writes,
+ vec<tree> kills)
{
gimple_poly_bb_p gbb = XNEW (struct gimple_poly_bb);
GBB_BB (gbb) = bb;
GBB_DATA_REFS (gbb) = drs;
gbb->read_scalar_refs = reads;
gbb->write_scalar_refs = writes;
+ gbb->kill_scalar_refs = kills;
GBB_CONDITIONS (gbb).create (0);
GBB_CONDITION_CASES (gbb).create (0);
@@ -235,6 +244,7 @@ free_gimple_poly_bb (gimple_poly_bb_p gbb)
GBB_CONDITION_CASES (gbb).release ();
gbb->read_scalar_refs.release ();
gbb->write_scalar_refs.release ();
+ gbb->kill_scalar_refs.release ();
XDELETE (gbb);
}
@@ -264,6 +274,9 @@ new_scop (edge entry, edge exit)
scop_set_region (s, region);
s->pbbs.create (3);
s->drs.create (3);
+ s->reduction_vars = new hash_set<tree>(1);
+ s->oacc_firstprivate_vars = new hash_set<tree>(1);
+ s->oacc_private_scalars = new hash_set<tree>(1);
s->unhandled_alias_ddrs.create (1);
s->dependence = NULL;
return s;
@@ -285,6 +298,9 @@ free_scop (scop_p scop)
scop->pbbs.release ();
scop->drs.release ();
+ delete scop->reduction_vars;
+ delete scop->oacc_firstprivate_vars;
+ delete scop->oacc_private_scalars;
scop->unhandled_alias_ddrs.release ();
isl_set_free (scop->param_context);
@@ -550,6 +566,23 @@ debug_isl_map (__isl_keep isl_map *map)
print_isl_map (stderr, map);
}
+
+void
+print_isl_space (FILE *f, __isl_keep isl_space *space)
+{
+ isl_printer *p = isl_printer_to_file (the_isl_ctx, f);
+ p = isl_printer_set_yaml_style (p, ISL_YAML_STYLE_BLOCK);
+ p = isl_printer_print_space (p, space);
+ p = isl_printer_print_str (p, "\n");
+ isl_printer_free (p);
+}
+
+DEBUG_FUNCTION void
+debug_isl_space (__isl_keep isl_space *space)
+{
+ print_isl_space (stderr, space);
+}
+
void
print_isl_union_map (FILE *f, __isl_keep isl_union_map *map)
{
@@ -49,6 +49,10 @@ along with GCC; see the file COPYING3. If not see
#include "gimple-pretty-print.h"
#include "cfganal.h"
#include "graphite.h"
+#include "omp-general.h"
+#include "graphite-oacc.h"
+#include "print-tree.h"
+#include "internal-fn.h"
class debug_printer
{
@@ -630,7 +634,9 @@ scop_detection::can_represent_loop (loop_p loop, sese_l scop)
DEBUG_PRINT (dp << "[can_represent_loop-fail] Loop niter unknown.\n");
return false;
}
- if (!niter_desc.control.no_overflow)
+ /* TODO The zero niter can probably be allowed in general */
+ if (!niter_desc.control.no_overflow
+ && !(oacc_function_p (cfun) && integer_zerop (niter)))
{
DEBUG_PRINT (dp << "[can_represent_loop-fail] Loop niter can overflow.\n");
return false;
@@ -701,8 +707,7 @@ scop_detection::add_scop (sese_l s)
s.exit = single_succ_edge (s.exit->dest);
}
- /* Do not add scops with only one loop. */
- if (region_has_one_loop (s))
+ if (!oacc_function_p (cfun) && region_has_one_loop (s))
{
DEBUG_PRINT (dp << "[scop-detection-fail] Discarding one loop SCoP: ";
print_sese (dump_file, s));
@@ -1084,6 +1089,17 @@ scop_detection::stmt_has_simple_data_refs_p (sese_l scop, gimple *stmt)
return true;
}
+/* Check if STMT is a internal OpenACC function call that should be ignored when
+ Graphite checks side effects. */
+
+static inline bool
+ignored_oacc_internal_call_p (gimple *stmt)
+{
+ return is_gimple_call (stmt)
+ && (gimple_call_internal_p (stmt, IFN_UNIQUE)
+ || gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION));
+}
+
/* GIMPLE_ASM and GIMPLE_CALL may embed arbitrary side effects.
Calls have side-effects, except those to const or pure
functions. */
@@ -1091,6 +1107,9 @@ scop_detection::stmt_has_simple_data_refs_p (sese_l scop, gimple *stmt)
static bool
stmt_has_side_effects (gimple *stmt)
{
+ if (ignored_oacc_internal_call_p (stmt))
+ return false;
+
if (gimple_has_volatile_ops (stmt)
|| (gimple_code (stmt) == GIMPLE_CALL
&& !(gimple_call_flags (stmt) & (ECF_CONST | ECF_PURE)))
@@ -1288,6 +1307,7 @@ scan_tree_for_params (sese_info_p s, tree e)
case NEGATE_EXPR:
case BIT_NOT_EXPR:
CASE_CONVERT:
+ case VIEW_CONVERT_EXPR:
case NON_LVALUE_EXPR:
scan_tree_for_params (s, TREE_OPERAND (e, 0));
break;
@@ -1362,6 +1382,9 @@ find_scop_parameters (scop_p scop)
static void
add_write (vec<tree> *writes, tree def)
{
+ if (ignored_oacc_internal_call_p (SSA_NAME_DEF_STMT (def)))
+ return;
+
writes->safe_push (def);
DEBUG_PRINT (dp << "Adding scalar write: ";
print_generic_expr (dump_file, def);
@@ -1370,9 +1393,27 @@ add_write (vec<tree> *writes, tree def)
SSA_NAME_DEF_STMT (def), 0));
}
+static void
+add_kill (vec<tree> *kills, tree def)
+{
+ if (ignored_oacc_internal_call_p (SSA_NAME_DEF_STMT (def)))
+ return;
+
+ kills->safe_push (def);
+ DEBUG_PRINT (dp << "Adding scalar kill: ";
+ print_generic_expr (dump_file, def);
+ dp << "\n");
+}
+
static void
add_read (vec<scalar_use> *reads, tree use, gimple *use_stmt)
{
+ gcc_assert (TREE_CODE (use) == SSA_NAME);
+
+ if ((use_stmt && ignored_oacc_internal_call_p (use_stmt))
+ || ignored_oacc_internal_call_p (SSA_NAME_DEF_STMT (use)))
+ return;
+
DEBUG_PRINT (dp << "Adding scalar read: ";
print_generic_expr (dump_file, use);
dp << "\nFrom stmt: ";
@@ -1428,6 +1469,58 @@ build_cross_bb_scalars_use (scop_p scop, tree use, gimple *use_stmt,
add_read (reads, use, use_stmt);
}
+/* Add kills for all ssa names in vector FROM to vector KILLS. */
+
+static void add_kills (hash_set<tree>* from, vec<tree> &kills)
+{
+ hash_set<tree>::iterator end = from->end();
+ hash_set<tree>::iterator it = from->begin ();
+ for (; it != end; ++it)
+ {
+ tree var = *it;
+ add_kill (&kills, var);
+ }
+}
+
+/* Add kill operations for the privatized OpenACC variables that have been
+ recorded for SCOP for the basic block BB into the vector KILLS. */
+
+static void
+add_oacc_kills (scop_p scop, basic_block bb, vec<tree> &kills)
+{
+
+ loop_p loop = bb->loop_father;
+
+ /* Right now we only handle "firstprivate" and "private" variables that occur
+ on an OpenACC computer region. Those affect only the outermost and hence -
+ because of the "chunking" loop created in omp-expand.c around the original
+ loop - the two outermost CFG loops. */
+ if (loop_depth (loop) > 2)
+ return;
+
+ edge_iterator ei;
+ edge e;
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ {
+ if (e->src == loop->header)
+ {
+ add_kills (scop->oacc_private_scalars, kills);
+ add_kills (scop->oacc_firstprivate_vars, kills);
+ break;
+ }
+ }
+
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ if (e->dest == loop->header)
+ {
+ add_kills (scop->oacc_private_scalars, kills);
+ add_kills (scop->oacc_firstprivate_vars, kills);
+ break;
+ }
+ }
+}
+
/* Generates a polyhedral black box only if the bb contains interesting
information. */
@@ -1436,6 +1529,7 @@ try_generate_gimple_bb (scop_p scop, basic_block bb)
{
vec<data_reference_p> drs = vNULL;
vec<tree> writes = vNULL;
+ vec<tree> kills = vNULL;
vec<scalar_use> reads = vNULL;
sese_l region = scop->scop_info->region;
@@ -1497,10 +1591,15 @@ try_generate_gimple_bb (scop_p scop, basic_block bb)
gsi_next (&psi))
{
gphi *phi = psi.phi ();
- tree res = gimple_phi_result (phi);
- if (virtual_operand_p (res))
- continue;
- /* To simulate out-of-SSA the predecessor of edges into PHI nodes
+ tree res = gimple_phi_result (phi);
+ if (virtual_operand_p (res))
+ continue;
+
+ if (scop->oacc_private_scalars->contains (res)
+ || scop->oacc_firstprivate_vars->contains (res))
+ continue;
+
+ /* To simulate out-of-SSA the predecessor of edges into PHI nodes
has a copy from the PHI argument to the PHI destination. */
if (! scev_analyzable_p (res, scop->scop_info->region))
add_write (&writes, res);
@@ -1536,10 +1635,15 @@ try_generate_gimple_bb (scop_p scop, basic_block bb)
}
}
- if (drs.is_empty () && writes.is_empty () && reads.is_empty ())
+ if (loop && /* i.e. BB belongs to SCOP. */
+ oacc_function_p (cfun))
+ add_oacc_kills (scop, bb, kills);
+
+ if (drs.is_empty () && writes.is_empty () && reads.is_empty ()
+ && kills.is_empty ())
return NULL;
- return new_gimple_poly_bb (bb, drs, reads, writes);
+ return new_gimple_poly_bb (bb, drs, reads, writes, kills);
}
/* Checks if all parts of DR are defined outside of REGION. This allows an
@@ -1800,10 +1904,21 @@ private:
auto_vec<gimple *, 3> conditions, cases;
scop_p scop;
};
-}
+
gather_bbs::gather_bbs (cdi_direction direction, scop_p scop, int *bb_to_rpo)
- : dom_walker (direction, ALL_BLOCKS, bb_to_rpo), scop (scop)
+ : dom_walker (direction, ALL_BLOCKS, bb_to_rpo), scop (scop)
{
+ if (oacc_function_p (cfun))
+ {
+ edge scop_entry = scop->scop_info->region.entry;
+ loop_p loop = scop_entry->dest->loop_father;
+ gcall *firstprivate_call = get_oacc_firstprivate_call (loop);
+ collect_oacc_privatized_vars (firstprivate_call,
+ *scop->oacc_firstprivate_vars);
+
+ gcall *private_call = get_oacc_private_scalars_call (loop);
+ collect_oacc_privatized_vars (private_call, *scop->oacc_private_scalars);
+ }
}
/* Call-back for dom_walk executed before visiting the dominated
@@ -1862,6 +1977,8 @@ gather_bbs::before_dom_children (basic_block bb)
data_reference_p dr;
FOR_EACH_VEC_ELT (gbb->data_refs, i, dr)
{
+ gcc_checking_assert (! ignored_oacc_internal_call_p (DR_STMT (dr)));
+
DEBUG_PRINT (dp << "Adding memory ";
if (dr->is_read)
dp << "read: ";
@@ -1897,6 +2014,8 @@ gather_bbs::after_dom_children (basic_block bb)
}
}
+}
+
/* Compute sth like an execution order, dominator order with first executing
edges that stay inside the current loop, delaying processing exit edges. */
@@ -1919,6 +2038,22 @@ cmp_pbbs (const void *pa, const void *pb)
return 0;
}
+/* Analyze the OpenACC loop structure surrounding SCOP to determine the ssa
+ names that belong to OpenACC reduction computations. */
+
+static void
+determine_openacc_reductions (scop_p scop)
+{
+ loop_p loop;
+ FOR_EACH_LOOP (loop, 0)
+ {
+ if (!loop_in_sese_p (loop, scop->scop_info->region))
+ continue;
+
+ collect_oacc_reduction_vars (loop, *scop->reduction_vars);
+ }
+}
+
/* Find Static Control Parts (SCoP) in the current function and pushes
them to SCOPS. */
@@ -1954,11 +2089,12 @@ build_scops (vec<scop_p> *scops)
/* Sort pbbs after execution order for initial schedule generation. */
scop->pbbs.qsort (cmp_pbbs);
- if (! build_alias_set (scop))
- {
- DEBUG_PRINT (dp << "[scop-detection-fail] cannot handle dependences\n");
- free_scop (scop);
- continue;
+ if (!build_alias_set (scop))
+ {
+ DEBUG_PRINT (dp
+ << "[scop-detection-fail] cannot handle dependences\n");
+ free_scop (scop);
+ continue;
}
/* Do not optimize a scop containing only PBBs that do not belong
@@ -1995,6 +2131,9 @@ build_scops (vec<scop_p> *scops)
continue;
}
+ if (oacc_function_p (cfun))
+ determine_openacc_reductions (scop);
+
scops->safe_push (scop);
}
@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see
#include "gimplify.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
+#include "graphite-oacc.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
@@ -46,6 +47,9 @@ along with GCC; see the file COPYING3. If not see
#include "tree-scalar-evolution.h"
#include "domwalk.h"
#include "tree-ssa-propagate.h"
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "internal-fn.h"
#include "graphite.h"
/* Return an isl identifier for the polyhedral basic block PBB. */
@@ -201,6 +205,8 @@ parameter_index_in_region (tree name, sese_info_p region)
return -1;
}
+tree oacc_ifn_call_extract (gimple*);
+
/* Extract an affine expression from the tree E in the scop S. */
static isl_pw_aff *
@@ -599,6 +605,21 @@ pdr_add_data_dimensions (isl_set *subscript_sizes, scop_p scop,
return isl_set_coalesce (subscript_sizes);
}
+static inline bool
+oacc_internal_call_p (gimple *stmt)
+{
+ if (!stmt || !is_gimple_call (stmt))
+ return false;
+
+ /* graphite-scop-detection.c should filter out those calls. */
+ gcc_assert (!gimple_call_internal_p (stmt, IFN_UNIQUE));
+
+ /* Should be handled by scalar evolution analysis. */
+ gcc_assert (!gimple_call_internal_p (stmt, IFN_GOACC_LOOP));
+
+ return false;
+}
+
/* Build data accesses for DRI. */
static void
@@ -635,13 +656,18 @@ build_poly_dr (dr_info &dri)
subscript_sizes = pdr_add_data_dimensions (subscript_sizes, scop, dr);
}
- new_poly_dr (pbb, DR_STMT (dr), DR_IS_READ (dr) ? PDR_READ : PDR_WRITE,
- acc, subscript_sizes);
+ if (oacc_internal_call_p (DR_STMT (dr)))
+ return;
+
+ bool is_reduction = scop->reduction_vars->contains (DR_BASE_ADDRESS (dr));
+ enum poly_dr_type dr_type = DR_IS_READ (dr) ? PDR_READ : PDR_WRITE;
+
+ new_poly_dr (pbb, DR_STMT (dr), dr_type, acc, subscript_sizes, is_reduction);
}
static void
build_poly_sr_1 (poly_bb_p pbb, gimple *stmt, tree var, enum poly_dr_type kind,
- isl_map *acc, isl_set *subscript_sizes)
+ isl_map *acc, isl_set *subscript_sizes, bool is_reduction)
{
scop_p scop = PBB_SCOP (pbb);
/* Each scalar variable has a unique alias set number starting from
@@ -658,7 +684,7 @@ build_poly_sr_1 (poly_bb_p pbb, gimple *stmt, tree var, enum poly_dr_type kind,
c = isl_constraint_set_coefficient_si (c, isl_dim_out, 0, 1);
new_poly_dr (pbb, stmt, kind, isl_map_add_constraint (acc, c),
- subscript_sizes);
+ subscript_sizes, is_reduction);
}
/* Record all cross basic block scalar variables in PBB. */
@@ -670,6 +696,7 @@ build_poly_sr (poly_bb_p pbb)
gimple_poly_bb_p gbb = PBB_BLACK_BOX (pbb);
vec<scalar_use> &reads = gbb->read_scalar_refs;
vec<tree> &writes = gbb->write_scalar_refs;
+ vec<tree> &kills = gbb->kill_scalar_refs;
isl_space *dc = isl_set_get_space (pbb->domain);
int nb_out = 1;
@@ -684,13 +711,39 @@ build_poly_sr (poly_bb_p pbb)
int i;
tree var;
FOR_EACH_VEC_ELT (writes, i, var)
+ {
+ if (oacc_internal_call_p (SSA_NAME_DEF_STMT (var)))
+ continue;
+
+ bool is_reduction = scop->reduction_vars->contains (var);
+
build_poly_sr_1 (pbb, SSA_NAME_DEF_STMT (var), var, PDR_WRITE,
- isl_map_copy (acc), isl_set_copy (subscript_sizes));
+ isl_map_copy (acc), isl_set_copy (subscript_sizes),
+ is_reduction);
+ }
+
+ FOR_EACH_VEC_ELT (kills, i, var)
+ {
+ build_poly_sr_1 (pbb, NULL, var, PDR_KILL,
+ isl_map_copy (acc), isl_set_copy (subscript_sizes),
+ false);
+ }
scalar_use *use;
FOR_EACH_VEC_ELT (reads, i, use)
+ {
+ tree use_var = use->second;
+ gcc_checking_assert (TREE_CODE (use_var) == SSA_NAME);
+
+ if (oacc_internal_call_p (use->first)
+ || oacc_internal_call_p (SSA_NAME_DEF_STMT (use->second)))
+ continue;
+
+ bool is_reduction = scop->reduction_vars->contains (use->second);
+
build_poly_sr_1 (pbb, use->first, use->second, PDR_READ, isl_map_copy (acc),
- isl_set_copy (subscript_sizes));
+ isl_set_copy (subscript_sizes), is_reduction);
+ }
isl_map_free (acc);
isl_set_free (subscript_sizes);
@@ -43,6 +43,8 @@ along with GCC; see the file COPYING3. If not see
#include "cfghooks.h"
#include "tree.h"
#include "gimple.h"
+#include "gimple-iterator.h"
+#include "gimplify-me.h"
#include "ssa.h"
#include "fold-const.h"
#include "gimple-iterator.h"
@@ -58,6 +60,14 @@ along with GCC; see the file COPYING3. If not see
#include "tree-ssa.h"
#include "tree-into-ssa.h"
#include "graphite.h"
+#include "graphite-oacc.h"
+#include "cgraph.h"
+#include "gimple-pretty-print.h"
+#include "print-tree.h"
+#include "tree-pretty-print.h"
+#include "internal-fn.h"
+
+static bool have_isl = true;
/* Print global statistics to FILE. */
@@ -417,9 +427,12 @@ graphite_transform_loops (void)
vec<scop_p> scops = vNULL;
isl_ctx *ctx;
- /* If a function is parallel it was most probably already run through graphite
- once. No need to run again. */
- if (parallelized_function_p (cfun->decl))
+ /* If a function is parallel it was most probably already run through
+ graphite once. No need to run again. This is not true for OpenACC
+ functions. The function was created for offloading, bu we still might have
+ to figure out which loops may be parallelized. */
+
+ if (parallelized_function_p (cfun->decl) && !oacc_function_p (cfun))
return;
calculate_dominance_info (CDI_DOMINATORS);
@@ -445,6 +458,7 @@ graphite_transform_loops (void)
seir_cache = new hash_map<sese_scev_hash, tree>;
calculate_dominance_info (CDI_POST_DOMINATORS);
+ set_scev_analyze_openacc_calls (oacc_function_p (cfun));
build_scops (&scops);
free_dominance_info (CDI_POST_DOMINATORS);
@@ -458,26 +472,50 @@ graphite_transform_loops (void)
print_global_statistics (dump_file);
}
- FOR_EACH_VEC_ELT (scops, i, scop)
- if (dbg_cnt (graphite_scop))
- {
- scop->isl_context = ctx;
- if (!build_poly_scop (scop))
- continue;
-
- if (!apply_poly_transforms (scop))
- continue;
-
- changed = true;
- if (graphite_regenerate_ast_isl (scop)
- && dump_enabled_p ())
- {
- dump_user_location_t loc = find_loop_location
- (scops[i]->scop_info->region.entry->dest->loop_father);
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
- "loop nest optimized\n");
- }
- }
+ if (oacc_function_p (cfun))
+ {
+ /* OpenACC uses Graphite for dependence analysis only.
+ Code generation would need not to understand the
+ OpenACC internal function calls before it could be
+ enabled. */
+
+ FOR_EACH_VEC_ELT (scops, i, scop)
+ if (dbg_cnt (graphite_scop))
+ {
+ scop->isl_context = ctx;
+ if (!build_poly_scop (scop))
+ continue;
+
+ if (!optimize_isl (scop, true))
+ continue;
+
+ graphite_oacc_analyze_scop (scop);
+ changed = true;
+ }
+ set_scev_analyze_openacc_calls (false);
+ }
+ else // Non-OpenACC-functions
+ {
+ FOR_EACH_VEC_ELT (scops, i, scop)
+ if (dbg_cnt (graphite_scop))
+ {
+ scop->isl_context = ctx;
+ if (!build_poly_scop (scop))
+ continue;
+
+ if (!apply_poly_transforms (scop))
+ continue;
+
+ changed = true;
+ if (graphite_regenerate_ast_isl (scop) && dump_enabled_p ())
+ {
+ dump_user_location_t loc = find_loop_location (
+ scops[i]->scop_info->region.entry->dest->loop_father);
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
+ "loop nest optimized\n");
+ }
+ }
+ }
delete seir_cache;
seir_cache = NULL;
@@ -520,6 +558,8 @@ graphite_transform_loops (void)
#else /* If isl is not available: #ifndef HAVE_isl. */
+static bool have_isl = false;
+
static void
graphite_transform_loops (void)
{
@@ -532,7 +572,10 @@ graphite_transform_loops (void)
static unsigned int
graphite_transforms (struct function *fun)
{
- if (number_of_loops (fun) <= 1)
+
+ unsigned num_loops = number_of_loops (fun);
+ if (num_loops == 0
+ || (num_loops == 1 && !oacc_function_p (cfun)))
return 0;
graphite_transform_loops ();
@@ -540,14 +583,35 @@ graphite_transforms (struct function *fun)
return 0;
}
+/* Return TRUE if fun is an OpenACC outlined function that should be analyzed
+ by Graphite. */
+
+static inline bool oacc_enable_graphite_p (function *fun)
+{
+ if (!flag_openacc || !oacc_get_fn_attrib (fun->decl))
+ return false;
+
+ if (!graphite_analyze_oacc_target_region_type_p (fun))
+ return false;
+
+ bool optimizing = global_options.x_optimize <= 0;
+ /* Enabling Graphite if isl is not available aborts compilation. Prefer to
+ skip it and emit a warning, unless optimizations are enabled. */
+ if (!have_isl && !optimizing)
+ warning (OPT_Wall, "Unable to analyze OpenACC regions with Graphite; isl "
+ "is not available.");
+ return true;
+}
+
static bool
-gate_graphite_transforms (void)
+gate_graphite_transforms (function *fun)
{
/* Enable -fgraphite pass if any one of the graphite optimization flags
is turned on. */
if (flag_graphite_identity
|| flag_loop_parallelize_all
- || flag_loop_nest_optimize)
+ || flag_loop_nest_optimize
+ || oacc_enable_graphite_p (fun))
flag_graphite = 1;
return flag_graphite != 0;
@@ -576,7 +640,7 @@ public:
{}
/* opt_pass methods: */
- virtual bool gate (function *) { return gate_graphite_transforms (); }
+ virtual bool gate (function *fun) { return gate_graphite_transforms (fun); }
}; // class pass_graphite
@@ -611,7 +675,7 @@ public:
{}
/* opt_pass methods: */
- virtual bool gate (function *) { return gate_graphite_transforms (); }
+ virtual bool gate (function *fun) { return gate_graphite_transforms (fun); }
virtual unsigned int execute (function *fun) { return graphite_transforms (fun); }
}; // class pass_graphite_transforms
@@ -42,7 +42,8 @@ enum poly_dr_type
/* PDR_MAY_READs are represented using PDR_READS. This does not
limit the expressiveness. */
PDR_WRITE,
- PDR_MAY_WRITE
+ PDR_MAY_WRITE,
+ PDR_KILL
};
struct poly_dr
@@ -61,6 +62,9 @@ struct poly_dr
enum poly_dr_type type;
+ /* Indicates that this PDR is part of an OpenACC "reduction" computation. */
+ bool is_reduction;
+
/* The access polyhedron contains the polyhedral space this data
reference will access.
@@ -185,7 +189,7 @@ struct poly_dr
#define PDR_ACCESSES(PDR) (NULL)
void new_poly_dr (poly_bb_p, gimple *, enum poly_dr_type,
- isl_map *, isl_set *);
+ isl_map *, isl_set *, bool);
void debug_pdr (poly_dr_p);
void print_pdr (FILE *, poly_dr_p);
@@ -211,6 +215,14 @@ pdr_may_write_p (poly_dr_p pdr)
return PDR_TYPE (pdr) == PDR_MAY_WRITE;
}
+/* Returns true when PDR is a "kill". */
+
+static inline bool
+pdr_kill_p (poly_dr_p pdr)
+{
+ return PDR_TYPE (pdr) == PDR_KILL;
+}
+
/* POLY_BB represents a blackbox in the polyhedral model. */
struct poly_bb
@@ -281,6 +293,8 @@ extern void print_isl_aff (FILE *, isl_aff *);
extern void print_isl_constraint (FILE *, isl_constraint *);
extern void print_isl_schedule (FILE *, isl_schedule *);
extern void debug_isl_schedule (isl_schedule *);
+extern void print_isl_space (FILE *, isl_space *);
+extern void debug_isl_space (isl_space *);
extern void print_isl_ast (FILE *, isl_ast_node *);
extern void debug_isl_ast (isl_ast_node *);
extern void debug_isl_set (isl_set *);
@@ -380,6 +394,18 @@ struct scop
/* All the data references in this scop. */
vec<dr_info> drs;
+ /* This set contains the ssa names that are OpenACC "reduction" variables
+ in the loops from SCOP using them. */
+ hash_set<tree> *reduction_vars;
+
+ /* If SCOP is contained in an OpenACC compute region, this is the set of
+ ssa names that are "firstprivate" in this region. */
+ hash_set<tree> *oacc_firstprivate_vars;
+
+ /* If SCOP is contained in an OpenACC compute region, this is the set of
+ ssa names that are "private" in this region. */
+ hash_set<tree> *oacc_private_scalars;
+
/* The context describes known restrictions concerning the parameters
and relations in between the parameters.
@@ -411,7 +437,8 @@ struct scop
extern scop_p new_scop (edge, edge);
extern void free_scop (scop_p);
extern gimple_poly_bb_p new_gimple_poly_bb (basic_block, vec<data_reference_p>,
- vec<scalar_use>, vec<tree>);
+ vec<scalar_use>, vec<tree>, vec<tree>);
+extern bool optimize_isl (scop_p, bool = false);
extern bool apply_poly_transforms (scop_p);
/* Set the region of SCOP to REGION. */
@@ -447,10 +474,10 @@ carries_deps (__isl_keep isl_union_map *schedule,
extern bool build_poly_scop (scop_p);
extern bool graphite_regenerate_ast_isl (scop_p);
+extern bool graphite_oacc_analyze_scop (scop_p);
extern void build_scops (vec<scop_p> *);
extern tree cached_scalar_evolution_in_region (const sese_l &, loop_p, tree);
extern void dot_all_sese (FILE *, vec<sese_l> &);
extern void dot_sese (sese_l &);
extern void dot_cfg ();
-
#endif
@@ -2970,6 +2970,8 @@ expand_UNIQUE (internal_fn, gcall *stmt)
gcc_unreachable ();
break;
case IFN_UNIQUE_OACC_PRIVATE:
+ case IFN_UNIQUE_OACC_PRIVATE_SCALAR:
+ case IFN_UNIQUE_OACC_FIRSTPRIVATE:
break;
}
@@ -37,7 +37,9 @@ along with GCC; see the file COPYING3. If not see
DEF(UNSPEC), \
DEF(OACC_FORK), DEF(OACC_JOIN), \
DEF(OACC_HEAD_MARK), DEF(OACC_TAIL_MARK), \
- DEF(OACC_PRIVATE)
+ DEF(OACC_PRIVATE), \
+ DEF(OACC_PRIVATE_SCALAR), \
+ DEF(OACC_FIRSTPRIVATE)
enum ifn_unique_kind {
#define DEF(X) IFN_UNIQUE_##X
@@ -108,7 +108,8 @@ struct omp_region
a depend clause. */
gomp_ordered *ord_stmt;
- /* True if this is nested inside an OpenACC kernels construct. */
+ /* True if this is nested inside an OpenACC kernels construct that
+ will be handled by the "parloops" pass. */
bool inside_kernels_p;
};
@@ -8153,13 +8154,35 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
loops_state_set (LOOPS_NEED_FIXUP);
if (region->inside_kernels_p)
- expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
- NULL_TREE, inner_stmt);
+ {
+ gcc_checking_assert (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+ expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+ NULL_TREE, inner_stmt);
+ }
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_SIMD)
expand_omp_simd (region, &fd);
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
{
- gcc_assert (!inner_stmt && !fd.non_rect);
+ struct omp_region *target_region;
+ for (target_region = region->outer; target_region;
+ target_region = target_region->outer)
+ {
+ if (region->type == GIMPLE_OMP_TARGET)
+ {
+ gomp_target *entry_stmt
+ = as_a<gomp_target *> (last_stmt (target_region->entry));
+
+ if (gimple_omp_target_kind (entry_stmt)
+ == GF_OMP_TARGET_KIND_OACC_KERNELS)
+ gcc_checking_assert (
+ param_openacc_kernels != OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ && param_openacc_kernels != OPENACC_KERNELS_PARLOOPS);
+ }
+ }
+
+ gcc_assert (!inner_stmt);
expand_oacc_for (region, &fd);
}
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_TASKLOOP)
@@ -9564,6 +9587,10 @@ static void
mark_loops_in_oacc_kernels_region (basic_block region_entry,
basic_block region_exit)
{
+ gcc_checking_assert (param_openacc_kernels
+ == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
class loop *outer = region_entry->loop_father;
gcc_assert (region_exit == NULL || outer == region_exit->loop_father);
@@ -9728,23 +9755,28 @@ expand_omp_target (struct omp_region *region)
entry_stmt = as_a <gomp_target *> (last_stmt (region->entry));
target_kind = gimple_omp_target_kind (entry_stmt);
+ if (!(param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS))
+ gcc_checking_assert (target_kind != GF_OMP_TARGET_KIND_OACC_KERNELS);
+
new_bb = region->entry;
offloaded = is_gimple_omp_offloaded (entry_stmt);
switch (target_kind)
{
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+ case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_REGION:
case GF_OMP_TARGET_KIND_UPDATE:
case GF_OMP_TARGET_KIND_ENTER_DATA:
case GF_OMP_TARGET_KIND_EXIT_DATA:
- case GF_OMP_TARGET_KIND_OACC_PARALLEL:
case GF_OMP_TARGET_KIND_OACC_KERNELS:
- case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_UPDATE:
case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
case GF_OMP_TARGET_KIND_OACC_DECLARE:
- case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
- case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
case GF_OMP_TARGET_KIND_DATA:
case GF_OMP_TARGET_KIND_OACC_DATA:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
@@ -9784,6 +9816,12 @@ expand_omp_target (struct omp_region *region)
NULL_TREE, DECL_ATTRIBUTES (child_fn));
break;
case GF_OMP_TARGET_KIND_OACC_KERNELS:
+ gcc_checking_assert (
+ param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+ mark_loops_in_oacc_kernels_region (region->entry, region->exit);
+
DECL_ATTRIBUTES (child_fn)
= tree_cons (get_identifier ("oacc kernels"),
NULL_TREE, DECL_ATTRIBUTES (child_fn));
@@ -9803,6 +9841,11 @@ expand_omp_target (struct omp_region *region)
= tree_cons (get_identifier ("oacc parallel_kernels_gang_single"),
NULL_TREE, DECL_ATTRIBUTES (child_fn));
break;
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+ DECL_ATTRIBUTES (child_fn)
+ = tree_cons (get_identifier ("oacc parallel_kernels_graphite"),
+ NULL_TREE, DECL_ATTRIBUTES (child_fn));
+ break;
default:
/* Make sure we don't miss any. */
gcc_checking_assert (!(is_gimple_omp_oacc (entry_stmt)
@@ -10015,6 +10058,7 @@ expand_omp_target (struct omp_region *region)
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
start_ix = BUILT_IN_GOACC_PARALLEL;
break;
case GF_OMP_TARGET_KIND_OACC_DATA:
@@ -10517,14 +10561,15 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
- break;
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+ case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
+ break;
case GF_OMP_TARGET_KIND_UPDATE:
case GF_OMP_TARGET_KIND_ENTER_DATA:
case GF_OMP_TARGET_KIND_EXIT_DATA:
case GF_OMP_TARGET_KIND_DATA:
case GF_OMP_TARGET_KIND_OACC_DATA:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
- case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
case GF_OMP_TARGET_KIND_OACC_UPDATE:
case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
case GF_OMP_TARGET_KIND_OACC_DECLARE:
@@ -10706,7 +10751,10 @@ public:
/* opt_pass methods: */
virtual bool gate (function *fun)
{
- return !(fun->curr_properties & PROP_gimple_eomp);
+ return !(fun->curr_properties & PROP_gimple_eomp)
+ && (!oacc_get_kernels_attrib (cfun->decl)
+ || param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
}
virtual unsigned int execute (function *) { return execute_expand_omp (); }
opt_pass * clone () { return new pass_expand_omp_ssa (m_ctxt); }
@@ -10776,6 +10824,8 @@ omp_make_gimple_edges (basic_block bb, struct omp_region **region,
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+ case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
break;
case GF_OMP_TARGET_KIND_UPDATE:
case GF_OMP_TARGET_KIND_ENTER_DATA:
@@ -10783,7 +10833,6 @@ omp_make_gimple_edges (basic_block bb, struct omp_region **region,
case GF_OMP_TARGET_KIND_DATA:
case GF_OMP_TARGET_KIND_OACC_DATA:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
- case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
case GF_OMP_TARGET_KIND_OACC_UPDATE:
case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
case GF_OMP_TARGET_KIND_OACC_DECLARE:
@@ -2929,6 +2929,15 @@ oacc_get_fn_attrib (tree fn)
return lookup_attribute (OACC_FN_ATTRIB, DECL_ATTRIBUTES (fn));
}
+/* Retrieve the oacc kernels attrib and return it. Non-oacc
+ functions will return NULL. */
+
+tree
+oacc_get_kernels_attrib (tree fn)
+{
+ return lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn));
+}
+
/* Return true if FN is an OpenMP or OpenACC offloading function. */
bool
@@ -2955,10 +2964,16 @@ oacc_get_fn_dim_size (tree fn, int axis)
dims = TREE_CHAIN (dims);
tree v = TREE_VALUE (dims);
- /* TODO With 'pass_oacc_device_lower' moved "later", this is necessary to
- avoid ICE for some OpenACC 'kernels' ("parloops") constructs. */
+ /* TODO-kernels With 'pass_oacc_device_lower' moved "later", this is necessary
+ to avoid ICE for some OpenACC 'kernels' ("parloops") constructs. */
if (v == NULL_TREE)
- return 0;
+ {
+ gcc_checking_assert (
+ param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+ return 0;
+ }
int size = TREE_INT_CST_LOW (v);
@@ -120,6 +120,7 @@ extern int oacc_verify_routine_clauses (tree, tree *, location_t,
const char *);
extern tree oacc_build_routine_dims (tree clauses);
extern tree oacc_get_fn_attrib (tree fn);
+extern tree oacc_get_kernels_attrib (tree fn);
extern bool offloading_function_p (tree fn);
extern int oacc_get_fn_dim_size (tree fn, int axis);
extern int oacc_get_ifn_dim_arg (const gimple *stmt);
@@ -157,6 +157,12 @@ struct omp_context
/* Addressable variable decls in this context. */
vec<tree> *oacc_addressable_var_decls;
+ /* "firstprivate" variables in this context */
+ hash_set<tree> *oacc_firstprivate_vars;
+
+ /* Scalar "private" variables in this context. */
+ hash_set<tree> *oacc_private_scalars;
+
/* True if lower_omp_1 should look up lastprivate conditional in parent
context. */
bool combined_into_simd_safelen1;
@@ -220,7 +226,27 @@ is_oacc_parallel_or_serial (omp_context *ctx)
|| (gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
|| (gimple_omp_target_kind (ctx->stmt)
- == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)));
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+ || (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)));
+}
+
+/* Return true if CTX corresponds to an oacc region that was generated from
+ an original kernels region that has been lowered to parallel regions. */
+
+static bool
+was_originally_oacc_kernels (omp_context *ctx)
+{
+ enum gimple_code outer_type = gimple_code (ctx->stmt);
+ return ((outer_type == GIMPLE_OMP_TARGET)
+ && ((gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+ || (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+ || (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
+ || (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_DATA_KERNELS)));
}
/* Return whether CTX represents an OpenACC 'kernels' construct.
@@ -246,10 +272,23 @@ is_oacc_kernels_decomposed_part (omp_context *ctx)
== GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
|| (gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+ || (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
|| (gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_DATA_KERNELS)));
}
+/* Return whether CTX represents an OpenACC 'kernels' decomposed part that will
+ be analyzed by Graphite. */
+
+static bool
+is_oacc_kernels_decomposed_graphite_part (omp_context *ctx)
+{
+ return gimple_code (ctx->stmt) == GIMPLE_OMP_TARGET
+ && gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE;
+}
+
/* Return true if STMT corresponds to an OpenMP target region. */
static bool
is_omp_target (gimple *stmt)
@@ -1139,6 +1178,8 @@ new_omp_context (gimple *stmt, omp_context *outer_ctx)
ctx->cb.decl_map = new hash_map<tree, tree>;
ctx->oacc_addressable_var_decls = new vec<tree> ();
+ ctx->oacc_firstprivate_vars = new hash_set<tree> ();
+ ctx->oacc_private_scalars = new hash_set<tree> ();
return ctx;
}
@@ -1224,6 +1265,8 @@ delete_omp_context (splay_tree_value value)
delete ctx->allocate_map;
delete ctx->oacc_addressable_var_decls;
+ delete ctx->oacc_firstprivate_vars;
+ delete ctx->oacc_private_scalars;
XDELETE (ctx);
}
@@ -1286,6 +1329,43 @@ fixup_child_record_type (omp_context *ctx)
= build_qualified_type (build_reference_type (type), TYPE_QUAL_RESTRICT);
}
+static void
+oacc_record_firstprivate_var_clauses (omp_context *ctx, tree clauses)
+{
+ tree c;
+
+ for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE)
+ {
+ tree decl = OMP_CLAUSE_DECL (c);
+
+ if (TREE_ADDRESSABLE (decl))
+ continue;
+
+ ctx->oacc_firstprivate_vars->add (decl);
+ }
+}
+
+static void
+oacc_record_private_scalars (omp_context *ctx, tree clauses)
+{
+ tree c;
+
+ for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_PRIVATE)
+ {
+ tree decl = OMP_CLAUSE_DECL (c);
+ if (!(VAR_P (decl)
+ && !(TREE_READONLY (decl)
+ && (TREE_STATIC (decl) || DECL_EXTERNAL (decl)))))
+ continue;
+
+ if (TREE_ADDRESSABLE (decl))
+ continue;
+ ctx->oacc_private_scalars->add (decl);
+ }
+}
+
/* Instantiate decls as necessary in CTX to satisfy the data sharing
specified by CLAUSES. If BASE_POINTERS_RESTRICT, install var field with
restrict. */
@@ -1901,9 +1981,15 @@ scan_sharing_clauses (tree clauses, omp_context *ctx,
break;
/* FALLTHRU */
- case OMP_CLAUSE_FIRSTPRIVATE:
- case OMP_CLAUSE_PRIVATE:
- case OMP_CLAUSE_LINEAR:
+ case OMP_CLAUSE_FIRSTPRIVATE:
+ if (is_oacc_kernels_decomposed_graphite_part (ctx))
+ oacc_record_firstprivate_var_clauses (ctx, c);
+ gcc_fallthrough ();
+ case OMP_CLAUSE_PRIVATE:
+ if (is_oacc_kernels_decomposed_graphite_part (ctx))
+ oacc_record_private_scalars (ctx, c);
+ gcc_fallthrough ();
+ case OMP_CLAUSE_LINEAR:
case OMP_CLAUSE_IS_DEVICE_PTR:
decl = OMP_CLAUSE_DECL (c);
if (is_variable_sized (decl))
@@ -2766,12 +2852,21 @@ enclosing_target_ctx (omp_context *ctx)
static bool
ctx_in_oacc_kernels_region (omp_context *ctx)
{
+ gcc_checking_assert (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE
+ || param_openacc_kernels
+ == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
for (;ctx != NULL; ctx = ctx->outer)
{
gimple *stmt = ctx->stmt;
- if (gimple_code (stmt) == GIMPLE_OMP_TARGET
- && gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
- return true;
+ if (gimple_code (stmt) != GIMPLE_OMP_TARGET)
+ continue;
+
+ int target_kind = gimple_omp_target_kind (stmt);
+ if (target_kind == GF_OMP_TARGET_KIND_OACC_KERNELS
+ || target_kind == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
+ return true;
}
return false;
@@ -2785,6 +2880,10 @@ ctx_in_oacc_kernels_region (omp_context *ctx)
static unsigned
check_oacc_kernel_gwv (gomp_for *stmt, omp_context *ctx)
{
+ gcc_checking_assert (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
bool checking = true;
unsigned outer_mask = 0;
unsigned this_mask = 0;
@@ -2856,9 +2955,11 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
{
omp_context *tgt = enclosing_target_ctx (outer_ctx);
- if (!(tgt && is_oacc_kernels (tgt)))
- for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
- {
+ if (!tgt
+ || (is_oacc_parallel_or_serial (tgt)
+ && !was_originally_oacc_kernels (tgt)))
+ for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ {
tree c_op0;
switch (OMP_CLAUSE_CODE (c))
{
@@ -3393,11 +3494,14 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
/* No nesting of non-OpenACC STMT (that is, an OpenMP one, or a GOMP builtin)
inside an OpenACC CTX. */
- if (!(is_gimple_omp (stmt)
- && is_gimple_omp_oacc (stmt))
+ if (!(is_gimple_omp (stmt) && is_gimple_omp_oacc (stmt))
/* Except for atomic codes that we share with OpenMP. */
&& !(gimple_code (stmt) == GIMPLE_OMP_ATOMIC_LOAD
- || gimple_code (stmt) == GIMPLE_OMP_ATOMIC_STORE))
+ || gimple_code (stmt) == GIMPLE_OMP_ATOMIC_STORE)
+ /* Except for target regions introduced for kernels. */
+ && (gimple_code (stmt) != GIMPLE_OMP_TARGET
+ || gimple_omp_target_kind (stmt)
+ != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE))
{
if (oacc_get_fn_attrib (cfun->decl) != NULL)
{
@@ -3568,6 +3672,7 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
ok = true;
break;
@@ -4065,6 +4170,7 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
/* OpenACC 'kernels' decomposed parts. */
stmt_name = "kernels"; break;
@@ -4085,6 +4191,7 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
ctx_stmt_name = "host_data"; break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
/* OpenACC 'kernels' decomposed parts. */
ctx_stmt_name = "kernels"; break;
@@ -4092,10 +4199,12 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
}
/* OpenACC/OpenMP mismatch? */
- if (is_gimple_omp_oacc (stmt)
- != is_gimple_omp_oacc (ctx->stmt))
- {
- error_at (gimple_location (stmt),
+ if (is_gimple_omp_oacc (stmt) != is_gimple_omp_oacc (ctx->stmt)
+ && (gimple_code (stmt) != GIMPLE_OMP_TARGET
+ || gimple_omp_target_kind (stmt)
+ != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE))
+ {
+ error_at (gimple_location (stmt),
"%s %qs construct inside of %s %qs region",
(is_gimple_omp_oacc (stmt)
? "OpenACC" : "OpenMP"), stmt_name,
@@ -7673,9 +7782,11 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *body_p,
static void
lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
- gcall *fork, gcall *private_marker, gcall *join,
- gimple_seq *fork_seq, gimple_seq *join_seq,
- omp_context *ctx)
+ gcall *fork, gcall *private_marker,
+ gcall *private_scalars_marker,
+ gcall *firstprivate_marker, gcall *join,
+ gimple_seq *fork_seq, gimple_seq *join_seq,
+ omp_context *ctx)
{
gimple_seq before_fork = NULL;
gimple_seq after_fork = NULL;
@@ -7691,9 +7802,11 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
/* No 'reduction' clauses on OpenACC 'kernels'. */
gcc_checking_assert (!is_oacc_kernels (ctx));
/* Likewise, on OpenACC 'kernels' decomposed parts. */
- gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
+ gcc_checking_assert (
+ !is_oacc_kernels_decomposed_part (ctx)
+ || is_oacc_kernels_decomposed_graphite_part (ctx));
- tree orig = OMP_CLAUSE_DECL (c);
+ tree orig = OMP_CLAUSE_DECL (c);
tree orig_clause;
tree var;
tree ref_to_res = NULL_TREE;
@@ -7896,7 +8009,12 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
gimple_seq_add_stmt (fork_seq, fork);
gimple_seq_add_seq (fork_seq, after_fork);
+ if (private_scalars_marker)
+ gimple_seq_add_stmt (join_seq, private_scalars_marker);
+ if (firstprivate_marker)
+ gimple_seq_add_stmt (join_seq, firstprivate_marker);
gimple_seq_add_seq (join_seq, before_join);
+
if (join)
gimple_seq_add_stmt (join_seq, join);
gimple_seq_add_seq (join_seq, after_join);
@@ -8609,16 +8727,27 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
/* In a parallel region, loops without auto and seq clauses are
implicitly INDEPENDENT. */
- if ((!tgt || is_oacc_parallel_or_serial (tgt))
+ if ((!tgt
+ || (is_oacc_parallel_or_serial (tgt)
+ && !is_oacc_kernels_decomposed_graphite_part (tgt)))
&& !(tag & (OLF_SEQ | OLF_AUTO)))
- tag |= OLF_INDEPENDENT;
+ {
+ tag |= OLF_INDEPENDENT;
+ }
/* Loops inside OpenACC 'kernels' decomposed parts' regions are expected to
have an explicit 'seq' or 'independent' clause, and no 'auto' clause. */
- if (tgt && is_oacc_kernels_decomposed_part (tgt))
+ if (tgt && is_oacc_kernels_decomposed_part (tgt)
+ && !is_oacc_kernels_decomposed_graphite_part (tgt))
{
- gcc_assert (tag & (OLF_SEQ | OLF_INDEPENDENT));
- gcc_assert (!(tag & OLF_AUTO));
+ tag |= OLF_INDEPENDENT;
+
+ gcc_checking_assert (
+ gimple_code (ctx->stmt) != GIMPLE_OMP_TARGET
+ /* Loops in kernels regions that will be handled by Graphite should
+ have been made 'auto' by "pass_convert_oacc_kernels". */
+ || gimple_omp_target_kind (ctx->stmt)
+ != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE);
}
if (tag & OLF_TILE)
@@ -8673,7 +8802,9 @@ lower_oacc_loop_marker (location_t loc, tree ddvar, bool head,
static void
lower_oacc_head_tail (location_t loc, tree clauses, gcall *private_marker,
- gimple_seq *head, gimple_seq *tail, omp_context *ctx)
+ gcall *private_scalars_marker,
+ gcall *firstprivate_marker, gimple_seq *head,
+ gimple_seq *tail, omp_context *ctx)
{
bool inner = false;
tree ddvar = create_tmp_var (integer_type_node, ".data_dep");
@@ -8688,6 +8819,20 @@ lower_oacc_head_tail (location_t loc, tree clauses, gcall *private_marker,
gimple_call_set_arg (private_marker, 1, ddvar);
}
+ if (private_scalars_marker)
+ {
+ gimple_set_location (private_scalars_marker, loc);
+ gimple_call_set_lhs (private_scalars_marker, ddvar);
+ gimple_call_set_arg (private_scalars_marker, 1, ddvar);
+ }
+
+ if (firstprivate_marker)
+ {
+ gimple_set_location (firstprivate_marker, loc);
+ gimple_call_set_lhs (firstprivate_marker, ddvar);
+ gimple_call_set_arg (firstprivate_marker, 1, ddvar);
+ }
+
tree fork_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_FORK);
tree join_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_JOIN);
@@ -8718,9 +8863,10 @@ lower_oacc_head_tail (location_t loc, tree clauses, gcall *private_marker,
build_int_cst (integer_type_node, done),
&join_seq);
- lower_oacc_reductions (loc, clauses, place, inner,
- fork, (count == 1) ? private_marker : NULL,
- join, &fork_seq, &join_seq, ctx);
+ lower_oacc_reductions (loc, clauses, place, inner, fork,
+ (count == 1) ? private_marker : NULL,
+ private_scalars_marker, firstprivate_marker, join,
+ &fork_seq, &join_seq, ctx);
/* Append this level to head. */
gimple_seq_add_seq (head, fork_seq);
@@ -11721,6 +11867,76 @@ make_oacc_private_marker (omp_context *ctx)
return gimple_build_call_internal_vec (IFN_UNIQUE, args);
}
+/* Return an internal function call that contains a list of variables which are
+ "firstprivate" in the compute region representend by CTX. This call is used
+ to help Graphite identify those static. */
+
+static gcall *
+make_oacc_firstprivate_vars_marker (omp_context *ctx)
+{
+ auto_vec<tree, 5> args;
+
+ args.quick_push (
+ build_int_cst (integer_type_node, IFN_UNIQUE_OACC_FIRSTPRIVATE));
+
+ /* TODO Change the data structure/iteration to ensure that the ordering of the
+ variables remains stable between GCC runs. */
+ hash_set<tree>::iterator end = ctx->oacc_firstprivate_vars->end();
+ hash_set<tree>::iterator it = ctx->oacc_firstprivate_vars->begin ();
+ for (; it != end; ++it)
+ {
+ tree decl = *it;
+ for (omp_context *thisctx = ctx; thisctx; thisctx = thisctx->outer)
+ {
+ tree inner_decl = maybe_lookup_decl (decl, thisctx);
+ if (inner_decl)
+ {
+ decl = inner_decl;
+ break;
+ }
+ }
+
+ args.safe_push (decl);
+ }
+
+ return gimple_build_call_internal_vec (IFN_UNIQUE, args);
+}
+
+/* Return an internal function call that contains a list of scalar variables
+ which are "private" in the compute region represented by CTX. This call is
+ used to help Graphite identify those variables. */
+
+static gcall *
+make_oacc_private_scalars_marker (omp_context *ctx)
+{
+ auto_vec<tree, 5> args;
+
+ args.quick_push (
+ build_int_cst (integer_type_node, IFN_UNIQUE_OACC_PRIVATE_SCALAR));
+
+ /* TODO Change the data structure/iteration to ensure that the ordering of
+ the variables remains stable between GCC runs. */
+ hash_set<tree>::iterator end = ctx->oacc_private_scalars->end ();
+ hash_set<tree>::iterator it = ctx->oacc_private_scalars->begin ();
+ for (; it != end; ++it)
+ {
+ tree decl = *it;
+ for (omp_context *thisctx = ctx; thisctx; thisctx = thisctx->outer)
+ {
+ tree inner_decl = maybe_lookup_decl (decl, thisctx);
+ if (inner_decl)
+ {
+ decl = inner_decl;
+ break;
+ }
+ }
+
+ args.safe_push (decl);
+ }
+
+ return gimple_build_call_internal_vec (IFN_UNIQUE, args);
+}
+
/* Lower code for an OMP loop directive. */
static void
@@ -11929,11 +12145,16 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
/* Once lowered, extract the bounds and clauses. */
omp_extract_for_data (stmt, &fd, NULL);
- if (is_gimple_omp_oacc (ctx->stmt)
- && !ctx_in_oacc_kernels_region (ctx))
- lower_oacc_head_tail (gimple_location (stmt),
- gimple_omp_for_clauses (stmt), private_marker,
- &oacc_head, &oacc_tail, ctx);
+ bool oacc_kernels_parloops = false;
+ if (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS)
+ oacc_kernels_parloops = ctx_in_oacc_kernels_region (ctx);
+ if (is_gimple_omp_oacc (ctx->stmt) && !oacc_kernels_parloops)
+ {
+ lower_oacc_head_tail (gimple_location (stmt),
+ gimple_omp_for_clauses (stmt), private_marker,
+ NULL, NULL, &oacc_head, &oacc_tail, ctx);
+ }
/* Add OpenACC partitioning and reduction markers just before the loop. */
if (oacc_head)
@@ -12833,6 +13054,7 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
case GF_OMP_TARGET_KIND_OACC_DECLARE:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
data_region = false;
break;
case GF_OMP_TARGET_KIND_DATA:
@@ -13073,8 +13295,6 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
{
/* No 'firstprivate' clauses on OpenACC 'kernels'. */
gcc_checking_assert (!is_oacc_kernels (ctx));
- /* Likewise, on OpenACC 'kernels' decomposed parts. */
- gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
goto oacc_firstprivate;
}
@@ -13107,8 +13327,6 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
{
/* No 'private' clauses on OpenACC 'kernels'. */
gcc_checking_assert (!is_oacc_kernels (ctx));
- /* Likewise, on OpenACC 'kernels' decomposed parts. */
- gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
break;
}
@@ -14259,13 +14477,26 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
gcall *private_marker = make_oacc_private_marker (ctx);
- if (private_marker)
+ gcall *firstprivate_marker = NULL;
+ gcall *private_scalars_marker = NULL;
+
+ /* The markers for "private" and "firstprivate" scalars are only used
+ to help "Graphite" identify those variables for which it has to
+ adjust some dependences. */
+ if (is_oacc_kernels_decomposed_graphite_part (ctx))
+ {
+ firstprivate_marker = make_oacc_firstprivate_vars_marker (ctx);
+ private_scalars_marker = make_oacc_private_scalars_marker (ctx);
+ }
+
+ if (private_marker)
gimple_call_set_arg (private_marker, 2, level);
- lower_oacc_reductions (gimple_location (ctx->stmt), clauses, level,
- false, NULL, private_marker, NULL, &fork_seq,
- &join_seq, ctx);
- }
+ lower_oacc_reductions (gimple_location (ctx->stmt), clauses, level,
+ false, NULL, private_marker,
+ private_scalars_marker, firstprivate_marker,
+ NULL, &fork_seq, &join_seq, ctx);
+ }
gimple_seq_add_seq (&new_body, fork_seq);
gimple_seq_add_seq (&new_body, tgt_body);
@@ -176,8 +176,13 @@ adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
compiler logic to analyze this, so can't parallelize it here, so
we'd very likely be running into a performance problem if we
were to execute this unparallelized, thus forward the whole loop
- nest to 'parloops'. */
- *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ nest to Graphite/"parloops". */
+ if (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE)
+ *region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE;
+ else if (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS)
+ *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ else
+ gcc_unreachable ();
/* Terminate: final decision for this region. */
*handled_ops_p = true;
return integer_zero_node;
@@ -197,8 +202,13 @@ adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
the compiler logic to analyze this, so can't parallelize it here, so
we'd very likely be running into a performance problem if we were to
execute this unparallelized, thus forward the whole thing to
- 'parloops'. */
- *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ Graphite/"parloops". */
+ if (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE)
+ *region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE;
+ else if (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS)
+ *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ else
+ gcc_unreachable ();
/* Terminate: final decision for this region. */
*handled_ops_p = true;
return integer_zero_node;
@@ -309,7 +319,9 @@ make_region_seq (location_t loc, gimple_seq stmts,
/* Figure out the region code for this region. */
/* Optimistic default: assume "setup code", no looping; thus not
performance-critical. */
- int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE;
+ int region_code = param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE
+ ? GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE
+ : GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE;
adjust_region_code (stmts, ®ion_code);
if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
@@ -330,6 +342,13 @@ make_region_seq (location_t loc, gimple_seq stmts,
loops nested inside this sequentially executed statement. */
make_loops_gang_single (stmts);
}
+ else if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, loc_stmts_first,
+ "beginning %<Graphite%> part in OpenACC"
+ " %<kernels%> region\n");
+ }
else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
{
if (dump_enabled_p ())
@@ -437,21 +456,24 @@ adjust_nested_loop_clauses (gimple_stmt_iterator *gsi_p, bool *,
tree *outer_clause_ptr = NULL;
switch (OMP_CLAUSE_CODE (loop_clause))
{
- case OMP_CLAUSE_GANG:
- outer_clause_ptr = wi_info->loop_gang_clause_ptr;
- break;
- case OMP_CLAUSE_WORKER:
- outer_clause_ptr = wi_info->loop_worker_clause_ptr;
- break;
- case OMP_CLAUSE_VECTOR:
- outer_clause_ptr = wi_info->loop_vector_clause_ptr;
- break;
- case OMP_CLAUSE_SEQ:
- case OMP_CLAUSE_INDEPENDENT:
- case OMP_CLAUSE_AUTO:
- add_auto_clause = false;
- default:
- break;
+ case OMP_CLAUSE_GANG:
+ outer_clause_ptr = wi_info->loop_gang_clause_ptr;
+ add_auto_clause = false;
+ break;
+ case OMP_CLAUSE_WORKER:
+ outer_clause_ptr = wi_info->loop_worker_clause_ptr;
+ add_auto_clause = false;
+ break;
+ case OMP_CLAUSE_VECTOR:
+ outer_clause_ptr = wi_info->loop_vector_clause_ptr;
+ add_auto_clause = false;
+ break;
+ case OMP_CLAUSE_SEQ:
+ case OMP_CLAUSE_INDEPENDENT:
+ case OMP_CLAUSE_AUTO:
+ add_auto_clause = false;
+ default:
+ break;
}
if (outer_clause_ptr != NULL)
{
@@ -525,30 +547,34 @@ transform_kernels_loop_clauses (gimple *omp_for,
loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
{
bool found_num_clause = false;
- tree *clause_ptr, clause_to_check;
+ tree *clause_ptr;
+ tree clause_to_check = NULL_TREE;
switch (OMP_CLAUSE_CODE (loop_clause))
- {
- case OMP_CLAUSE_GANG:
- found_num_clause = true;
- clause_ptr = &loop_gang_clause;
- clause_to_check = num_gangs_clause;
- break;
- case OMP_CLAUSE_WORKER:
- found_num_clause = true;
- clause_ptr = &loop_worker_clause;
- clause_to_check = num_workers_clause;
- break;
- case OMP_CLAUSE_VECTOR:
- found_num_clause = true;
- clause_ptr = &loop_vector_clause;
- clause_to_check = vector_length_clause;
- break;
- case OMP_CLAUSE_INDEPENDENT:
- case OMP_CLAUSE_SEQ:
- case OMP_CLAUSE_AUTO:
- add_auto_clause = false;
- default:
- break;
+ {
+ case OMP_CLAUSE_GANG:
+ found_num_clause = true;
+ add_auto_clause = false;
+ clause_ptr = &loop_gang_clause;
+ clause_to_check = num_gangs_clause;
+ break;
+ case OMP_CLAUSE_WORKER:
+ found_num_clause = true;
+ add_auto_clause = false;
+ clause_ptr = &loop_worker_clause;
+ clause_to_check = num_workers_clause;
+ break;
+ case OMP_CLAUSE_VECTOR:
+ found_num_clause = true;
+ add_auto_clause = false;
+ clause_ptr = &loop_vector_clause;
+ clause_to_check = vector_length_clause;
+ break;
+ case OMP_CLAUSE_INDEPENDENT:
+ case OMP_CLAUSE_SEQ:
+ case OMP_CLAUSE_AUTO:
+ add_auto_clause = false;
+ default:
+ break;
}
if (found_num_clause && OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL)
{
@@ -646,10 +672,13 @@ make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
clauses = unshare_expr (clauses);
/* Figure out the region code for this region. */
- /* Optimistic default: assume that the loop nest is parallelizable
- (essentially, no GIMPLE_OMP_FOR with (explicit or implicit) 'auto' clause,
- and no un-annotated loops). */
- int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED;
+ /* For "parloops", use an optimistic default: assume that the loop nest is
+ parallelizable (essentially, no GIMPLE_OMP_FOR with (explicit or implicit)
+ 'auto' clause, and no un-annotated loops). */
+ int region_code = param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE
+ ? GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE
+ : GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED;
+
adjust_region_code (stmts, ®ion_code);
if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
@@ -661,6 +690,19 @@ make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
"parallelized loop nest"
" in OpenACC %<kernels%> region\n");
+ clauses = transform_kernels_loop_clauses (omp_for,
+ num_gangs_clause,
+ num_workers_clause,
+ vector_length_clause,
+ clauses);
+ }
+ else if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, omp_for,
+ "forwarded loop nest in OpenACC %<kernels%> region"
+ " to %<Graphite%> for analysis\n");
+
clauses = transform_kernels_loop_clauses (omp_for,
num_gangs_clause,
num_workers_clause,
@@ -1651,8 +1693,13 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
- return (flag_openacc
- && param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE);
+ if (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE
+ || param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS)
+ return flag_openacc;
+ else if (param_openacc_kernels == OPENACC_KERNELS_PARLOOPS)
+ return false;
+ else
+ gcc_unreachable ();
}
virtual unsigned int execute (function *)
{
@@ -853,6 +853,202 @@ oacc_xform_loop (gcall *call)
gsi_replace_with_seq (&gsi, seq, true);
}
+/* This is used for expanding the loop calls to "fake" values that mimic the
+ values used for host execution during scalar evolution analysis in
+ Graphite. The function has been derived from oacc_xform_loop which could not
+ be used because it rewrites the code directly.
+
+ TODO This function can either be simplified significantly (cf. the fixed
+ values for number_of_threads, thread_index, chunking, striding) or unified
+ with oacc_xform_loop. */
+
+tree
+oacc_extract_loop_call (gcall *call)
+{
+ gimple_stmt_iterator gsi = gsi_for_stmt (call);
+ enum ifn_goacc_loop_kind code
+ = (enum ifn_goacc_loop_kind)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+ tree dir = gimple_call_arg (call, 1);
+ tree range = gimple_call_arg (call, 2);
+ tree step = gimple_call_arg (call, 3);
+ tree chunk_size = NULL_TREE;
+ unsigned mask = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 5));
+ tree lhs = gimple_call_lhs (call);
+ tree type = NULL_TREE;
+ tree diff_type = TREE_TYPE (range);
+ tree r = NULL_TREE;
+ bool chunking = false, striding = true;
+ unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
+ /* unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
+ */
+
+ gcc_checking_assert (lhs);
+
+ type = TREE_TYPE (lhs);
+
+ tree number_of_threads = integer_one_node;
+ tree thread_index = integer_zero_node;
+
+ /* striding=true, chunking=true
+ -> invalid.
+ striding=true, chunking=false
+ -> chunks=1
+ striding=false,chunking=true
+ -> chunks=ceil (range/(chunksize*threads*step))
+ striding=false,chunking=false
+ -> chunk_size=ceil(range/(threads*step)),chunks=1 */
+
+ switch (code)
+ {
+ default:
+ gcc_unreachable ();
+
+ case IFN_GOACC_LOOP_CHUNKS:
+ if (!chunking)
+ r = build_int_cst (type, 1);
+ else
+ {
+ /* chunk_max
+ = (range - dir) / (chunks * step * num_threads) + dir */
+ tree per = number_of_threads;
+ per = fold_convert (type, per);
+ chunk_size = fold_convert (type, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, step);
+ r = fold_build2 (MINUS_EXPR, type, range, dir);
+ r = fold_build2 (PLUS_EXPR, type, r, per);
+ r = fold_build2 (TRUNC_DIV_EXPR, type, r, per);
+ }
+ break;
+
+ case IFN_GOACC_LOOP_STEP:
+ {
+ /* If striding, step by the entire compute volume, otherwise
+ step by the inner volume. */
+ /* unsigned volume = striding ? mask : inner_mask; */
+
+ r = number_of_threads;
+ r = fold_build2 (MULT_EXPR, type, fold_convert (type, r), step);
+ }
+ break;
+
+ case IFN_GOACC_LOOP_OFFSET:
+ /* Enable vectorization on non-SIMT targets. */
+ if (!targetm.simt.vf
+ && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+ /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
+ the loop. */
+ && (flag_tree_loop_vectorize
+ || !global_options_set.x_flag_tree_loop_vectorize))
+ {
+ basic_block bb = gsi_bb (gsi);
+ class loop *parent = bb->loop_father;
+ class loop *body = parent->inner;
+
+ parent->force_vectorize = true;
+ parent->safelen = INT_MAX;
+
+ /* "Chunking loops" may have inner loops. */
+ if (parent->inner)
+ {
+ body->force_vectorize = true;
+ body->safelen = INT_MAX;
+ }
+
+ cfun->has_force_vectorize_loops = true;
+ }
+ if (striding)
+ {
+ r = thread_index;
+ r = fold_convert (diff_type, r);
+ }
+ else
+ {
+ tree inner_size = number_of_threads;
+ tree outer_size = number_of_threads;
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+
+ chunk_size = fold_build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = fold_build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size
+ = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ tree span = fold_build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+ r = thread_index;
+ r = fold_convert (diff_type, r);
+ r = fold_build2 (MULT_EXPR, diff_type, r, span);
+
+ tree inner = thread_index;
+ inner = fold_convert (diff_type, inner);
+ r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
+
+ if (chunking)
+ {
+ tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+ tree per
+ = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
+ per = fold_build2 (MULT_EXPR, diff_type, per, chunk);
+
+ r = fold_build2 (PLUS_EXPR, diff_type, r, per);
+ }
+ }
+ r = fold_build2 (MULT_EXPR, diff_type, r, step);
+ if (type != diff_type)
+ r = fold_convert (type, r);
+ break;
+
+ case IFN_GOACC_LOOP_BOUND:
+ if (striding)
+ r = range;
+ else
+ {
+ tree inner_size = number_of_threads;
+ tree outer_size = number_of_threads;
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+
+ chunk_size = fold_build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = fold_build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size
+ = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ tree span = fold_build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+
+ r = fold_build2 (MULT_EXPR, diff_type, span, step);
+
+ tree offset = gimple_call_arg (call, 6);
+ r = fold_build2 (PLUS_EXPR, diff_type, r,
+ fold_convert (diff_type, offset));
+ r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type,
+ r, range);
+ }
+ if (diff_type != type)
+ r = fold_convert (type, r);
+ break;
+ }
+
+ return r;
+}
+
/* Transform a GOACC_TILE call. Determines the element loop span for
the specified loop of the nest. This is 1 if we're not tiling.
@@ -1050,7 +1246,8 @@ oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
#endif
if (check
&& warn_openacc_parallelism
- && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn)))
+ && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn))
+ && !lookup_attribute ("oacc parallel_kernels_graphite", DECL_ATTRIBUTES (fn)))
{
static char const *const axes[] =
/* Must be kept in sync with GOMP_DIM enumeration. */
@@ -1550,7 +1747,219 @@ oacc_loop_process (oacc_loop *loop)
oacc_loop_process (loop->sibling);
}
-/* Walk the OpenACC loop heirarchy checking and assigning the
+/* Return the outermost CFG loop that is enclosed between the head and
+ tail mark calls for LOOP, or NULL if there is no such CFG loop.
+
+ The outermost CFG loop is a loop that is used for "chunking" the
+ original loop from the user's code. The lower_omp_for function
+ in omp-low.c which creates the head and tail mark sequence and
+ the expand_oacc_for function in omp-expand.c are relevant for
+ understanding the structure that we expect to find here. But note
+ that the passes implemented in those files do not operate on CFG
+ loops and hence the correspondence to the CFG loop structure is
+ not directly visible there and has to be inferred. */
+
+static loop_p
+oacc_loop_get_cfg_loop (oacc_loop *loop)
+{
+ loop_p enclosed_cfg_loop = NULL;
+ for (unsigned dim = 0; dim < GOMP_DIM_MAX; ++dim)
+ {
+ gcall *tail_mark = loop->tails[dim];
+ gimple *head_mark = loop->heads[dim];
+ if (!tail_mark)
+ continue;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_printf (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_INTERNALS, "%G",
+ tail_mark);
+
+ loop_p mark_cfg_loop = tail_mark->bb->loop_father;
+ loop_p current_cfg_loop = mark_cfg_loop;
+
+ /* Ascend from TAIL_MARK until a different CFG loop is reached.
+
+ From the way that OpenACC loops are treated in omp-low.c, we
+ could expect the tail marker to be immediately preceded by a
+ loop exit. But loop optimizations (e.g. store-motion in
+ pass_lim) can change this. */
+ basic_block bb = tail_mark->bb;
+ bool empty_loop = false;
+ while (current_cfg_loop == mark_cfg_loop)
+ {
+ /* If the OpenACC loop becomes empty due to optimizations,
+ there is no CFG loop at all enclosed between head and
+ tail mark */
+ if (bb == head_mark->bb)
+ {
+ empty_loop = true;
+ break;
+ }
+
+ bb = get_immediate_dominator (CDI_DOMINATORS, bb);
+ current_cfg_loop = bb->loop_father;
+ }
+
+ if (empty_loop)
+ continue;
+
+ /* We expect to find the same CFG loop enclosed between all head
+ and tail mark pairs. Hence we actually need to look at only
+ the first available pair. But we consider all for
+ verification purposes. */
+ if (enclosed_cfg_loop)
+ {
+ gcc_assert (current_cfg_loop == enclosed_cfg_loop);
+ continue;
+ }
+
+ enclosed_cfg_loop = current_cfg_loop;
+
+ gcc_checking_assert (dominated_by_p (
+ CDI_DOMINATORS, enclosed_cfg_loop->header, head_mark->bb));
+ }
+
+ return enclosed_cfg_loop;
+}
+
+static const char*
+can_be_parallel_str (loop_p loop)
+{
+ if (!loop->can_be_parallel_valid_p)
+ return "not analyzed";
+
+ return loop->can_be_parallel ? "can be parallel" : "cannot be parallel";
+}
+
+/* Returns true if LOOP is known to be parallelizable and false
+ otherwise. The decision is based on the the dependence analysis
+ that must have been previously performed by Graphite on the CFG
+ loops contained in the OpenACC loop LOOP. The value of ANALYZED is
+ set to true if all relevant CFG loops have been analyzed. */
+
+static bool
+oacc_loop_can_be_parallel_p (oacc_loop *loop, bool& analyzed)
+{
+ /* Graphite will not run without enabled optimizations, so we cannot
+ expect to find any parallelizability information on the CFG loops. */
+ if (!optimize)
+ return false;
+
+ const dump_user_location_t loc
+ = dump_user_location_t::from_location_t (loop->loc);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_INTERNALS, loc,
+ "Inspecting CFG-loops for OpenACC loop.\n");
+
+ /* Search for the CFG loops that are enclosed between the head and
+ tail mark calls for LOOP. The two outer CFG loops are considered
+ to belong to the OpenACC loop and hence the CAN_BE_PARALLEL flags
+ on those loops will be used to determine the return value. */
+ bool can_be_parallel = false;
+ loop_p enclosed_cfg_loop = oacc_loop_get_cfg_loop (loop);
+
+ if (enclosed_cfg_loop
+ /* The inner loop may have been removed in degenerate cases, e.g.
+ if an infinite "for (; ;)" gets optimized in an OpenACC loop nest. */
+ && enclosed_cfg_loop->inner)
+ {
+ gcc_assert (enclosed_cfg_loop->inner != NULL);
+ gcc_assert (enclosed_cfg_loop->inner->next == NULL);
+
+ can_be_parallel = enclosed_cfg_loop->can_be_parallel
+ && enclosed_cfg_loop->inner->can_be_parallel;
+
+ analyzed = enclosed_cfg_loop->can_be_parallel_valid_p
+ && enclosed_cfg_loop->inner->can_be_parallel_valid_p;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ dump_printf (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_INTERNALS,
+ "\tOuter loop <%d> preceeding tail mark %s.\n"
+ "\tInner loop <%d> %s.\n",
+ enclosed_cfg_loop->num,
+ can_be_parallel_str (enclosed_cfg_loop),
+ enclosed_cfg_loop->inner->num,
+ can_be_parallel_str (enclosed_cfg_loop->inner));
+ }
+ }
+ else if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_INTERNALS, loc,
+ "Empty OpenACC loop.\n");
+
+ return can_be_parallel;
+}
+
+static bool
+oacc_parallel_kernels_graphite_fun_p ()
+{
+ return lookup_attribute ("oacc parallel_kernels_graphite",
+ DECL_ATTRIBUTES (cfun->decl));
+}
+
+static bool
+oacc_parallel_fun_p ()
+{
+ return lookup_attribute ("oacc parallel",
+ DECL_ATTRIBUTES (cfun->decl));
+}
+
+/* If LOOP is an "auto" loop for which dependence analysis has determined that
+ it can be parallelized, make it "independent" by adjusting its FLAGS field
+ and return true. Otherwise, return false. */
+
+static bool
+oacc_loop_transform_auto_into_independent (oacc_loop *loop)
+{
+ if (!optimize)
+ return false;
+
+ /* This function is only relevant on "kernels"
+ regions that have been explicitly designated
+ to be analyzed by Graphite and on "auto"
+ loops in "parallel" regions. */
+ if (!oacc_parallel_kernels_graphite_fun_p () &&
+ !oacc_parallel_fun_p ())
+ return false;
+
+ if (loop->routine)
+ return false;
+
+ if (!(loop->flags & OLF_AUTO))
+ return false;
+
+ bool analyzed = false;
+ bool can_be_parallel = oacc_loop_can_be_parallel_p (loop, analyzed);
+ dump_user_location_t loc = dump_user_location_t::from_location_t (loop->loc);
+
+ if (dump_enabled_p ())
+ {
+ if (!analyzed)
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
+ "'auto' loop has not been analyzed (cf. 'graphite' "
+ "dumps for more information).\n");
+ }
+ if (!can_be_parallel)
+ return false;
+
+ loop->flags |= OLF_INDEPENDENT;
+
+ /* We need to keep the OLF_AUTO flag for now.
+ oacc_loop_fixed_partitions and oacc_loop_auto_partitions
+ interpret "independent auto" as "this loop can be parallel,
+ please determine the dimensions" which seems to correspond to the
+ meaning of those clauses in an old OpenACC version. We rely on
+ this behaviour to assign the dimensions for this loop.
+
+ TODO Use a different flag to indicate that the dimensions must be assigned. */
+
+ // loop->flags &= ~OLF_AUTO;
+
+ return true;
+}
+
+/* Walk the OpenACC loop hierarchy checking and assigning the
programmer-specified partitionings. OUTER_MASK is the partitioning
this loop is contained within. Return mask of partitioning
encountered. If any auto loops are discovered, set GOMP_DIM_MAX
@@ -1606,6 +2015,9 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
loop->flags |= OLF_AUTO;
mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
}
+
+ if (oacc_loop_transform_auto_into_independent (loop))
+ mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
}
if (this_mask & outer_mask)
@@ -2077,81 +2489,88 @@ execute_oacc_loop_designation ()
flag_openacc_dims = (char *)&flag_openacc_dims;
}
- bool is_oacc_parallel
- = (lookup_attribute ("oacc parallel",
- DECL_ATTRIBUTES (current_function_decl)) != NULL);
bool is_oacc_kernels
= (lookup_attribute ("oacc kernels",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
+ bool is_oacc_parallel
+ = (lookup_attribute ("oacc parallel",
+ DECL_ATTRIBUTES (current_function_decl)) != NULL);
bool is_oacc_serial
= (lookup_attribute ("oacc serial",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
- bool is_oacc_parallel_kernels_parallelized
- = (lookup_attribute ("oacc parallel_kernels_parallelized",
- DECL_ATTRIBUTES (current_function_decl)) != NULL);
- bool is_oacc_parallel_kernels_gang_single
- = (lookup_attribute ("oacc parallel_kernels_gang_single",
- DECL_ATTRIBUTES (current_function_decl)) != NULL);
- int fn_level = oacc_fn_attrib_level (attr);
- bool is_oacc_routine = (fn_level >= 0);
- gcc_checking_assert (is_oacc_parallel
- + is_oacc_kernels
- + is_oacc_serial
- + is_oacc_parallel_kernels_parallelized
- + is_oacc_parallel_kernels_gang_single
- + is_oacc_routine
- == 1);
-
bool is_oacc_kernels_parallelized
= (lookup_attribute ("oacc kernels parallelized",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
if (is_oacc_kernels_parallelized)
gcc_checking_assert (is_oacc_kernels);
+ bool is_oacc_parallel_kernels_parallelized
+ = (lookup_attribute ("oacc parallel_kernels_parallelized",
+ DECL_ATTRIBUTES (current_function_decl))
+ != NULL);
+ if (is_oacc_parallel_kernels_parallelized)
+ gcc_checking_assert (!is_oacc_kernels);
+ bool is_oacc_parallel_kernels_gang_single
+ = (lookup_attribute ("oacc parallel_kernels_gang_single",
+ DECL_ATTRIBUTES (current_function_decl)) != NULL);
+ if (is_oacc_parallel_kernels_gang_single)
+ gcc_checking_assert (!is_oacc_kernels);
+ gcc_checking_assert (!(is_oacc_parallel_kernels_parallelized
+ && is_oacc_parallel_kernels_gang_single));
+ bool is_oacc_parallel_kernels_graphite
+ = (lookup_attribute ("oacc parallel_kernels_graphite",
+ DECL_ATTRIBUTES (current_function_decl)) != NULL);
+ if (is_oacc_parallel_kernels_graphite)
+ gcc_checking_assert (!is_oacc_kernels
+ && !is_oacc_parallel_kernels_gang_single);
+
+ /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
+ kernels, so remove the parallelism dimensions function attributes
+ potentially set earlier on. */
+ if (is_oacc_kernels && !is_oacc_kernels_parallelized)
+ {
+ gcc_checking_assert (!is_oacc_parallel_kernels_graphite);
+ oacc_set_fn_attrib (current_function_decl, NULL, NULL);
+ attr = oacc_get_fn_attrib (current_function_decl);
+ }
+
+ /* Discover, partition and process the loops. */
+ oacc_loop *loops = oacc_loop_discovery ();
+ int fn_level = oacc_fn_attrib_level (attr);
if (dump_file)
{
- if (is_oacc_parallel)
- fprintf (dump_file, "Function is OpenACC parallel offload\n");
+ if (fn_level >= 0)
+ fprintf (dump_file, "Function is OpenACC routine level %d\n",
+ fn_level);
else if (is_oacc_kernels)
fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
(is_oacc_kernels_parallelized
? "parallelized" : "unparallelized"));
- else if (is_oacc_serial)
- fprintf (dump_file, "Function is OpenACC serial offload\n");
else if (is_oacc_parallel_kernels_parallelized)
fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
"parallel_kernels_parallelized");
else if (is_oacc_parallel_kernels_gang_single)
fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
"parallel_kernels_gang_single");
- else if (is_oacc_routine)
- fprintf (dump_file, "Function is OpenACC routine level %d\n",
- fn_level);
+ else if (is_oacc_parallel_kernels_graphite)
+ fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
+ "parallel_kernels_graphite");
+ else if (is_oacc_serial)
+ fprintf (dump_file, "Function is OpenACC serial offload\n");
+ else if (is_oacc_parallel)
+ fprintf (dump_file, "Function is OpenACC parallel offload\n");
else
gcc_unreachable ();
}
- /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
- kernels, so remove the parallelism dimensions function attributes
- potentially set earlier on. */
- if (is_oacc_kernels && !is_oacc_kernels_parallelized)
- {
- oacc_set_fn_attrib (current_function_decl, NULL, NULL);
- attr = oacc_get_fn_attrib (current_function_decl);
- }
-
- /* Discover, partition and process the loops. */
- oacc_loop *loops = oacc_loop_discovery ();
- fn_level = oacc_fn_attrib_level (attr);
-
- unsigned outer_mask = 0;
- if (is_oacc_routine)
- outer_mask = GOMP_DIM_MASK (fn_level) - 1;
+ unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
unsigned used_mask = oacc_loop_partition (loops, outer_mask);
/* OpenACC kernels constructs are special: they currently don't use the
generic oacc_loop infrastructure and attribute/dimension processing. */
if (is_oacc_kernels && is_oacc_kernels_parallelized)
{
+ gcc_checking_assert (!is_oacc_parallel_kernels_graphite);
+
/* Parallelized OpenACC kernels constructs use gang parallelism. See
also tree-parloops.c:create_parallel_loop. */
used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
@@ -2410,6 +2829,11 @@ execute_oacc_device_lower ()
remove = true;
break;
+ case IFN_UNIQUE_OACC_PRIVATE_SCALAR:
+ case IFN_UNIQUE_OACC_FIRSTPRIVATE:
+ remove = true;
+ break;
+
case IFN_UNIQUE_OACC_PRIVATE:
{
HOST_WIDE_INT level
@@ -32,5 +32,7 @@ extern GTY(()) vec<tree, va_gc> *offload_vars;
extern int oacc_fn_attrib_level (tree attr);
extern void omp_finish_file (void);
extern void omp_discover_implicit_declare_target (void);
+extern tree oacc_extract_loop_call (gcall *call);
+
#endif /* GCC_OMP_DEVICE_H */
@@ -788,7 +788,7 @@ If -ftree-vectorize is used, the minimal loop bound of a loop to be considered f
-param=openacc-kernels=
Common Joined Enum(openacc_kernels) Var(param_openacc_kernels) Init(OPENACC_KERNELS_DECOMPOSE) Param
---param=openacc-kernels=[decompose|parloops] Specify mode of OpenACC 'kernels' constructs handling.
+--param=openacc-kernels=[decompose|decompose-parloops|parloops] Specify mode of OpenACC 'kernels' constructs handling.
Enum
Name(openacc_kernels) Type(enum openacc_kernels)
@@ -796,6 +796,9 @@ Name(openacc_kernels) Type(enum openacc_kernels)
EnumValue
Enum(openacc_kernels) String(decompose) Value(OPENACC_KERNELS_DECOMPOSE)
+EnumValue
+Enum(openacc_kernels) String(decompose-parloops) Value(OPENACC_KERNELS_DECOMPOSE_PARLOOPS)
+
EnumValue
Enum(openacc_kernels) String(parloops) Value(OPENACC_KERNELS_PARLOOPS)
@@ -448,8 +448,29 @@ scalar_evolution_in_region (const sese_l ®ion, loop_p loop, tree t)
if (!loop_in_sese_p (loop, region))
loop = NULL;
- return instantiate_scev (region.entry, loop,
- analyze_scalar_evolution (loop, t));
+ tree chrec = analyze_scalar_evolution (loop, t);
+
+ /* The IFN_GOACC_LOOP calls may evolve to an ssa name that is defined outside
+ of LOOP. To avoid failing the scev analysis, we need this special
+ handling. */
+ if (TREE_CODE (t) == SSA_NAME)
+ {
+ gimple *def_stmt = SSA_NAME_DEF_STMT (t);
+ basic_block def_bb = def_stmt->bb;
+ if (is_gimple_call (def_stmt)
+ && gimple_call_internal_p (def_stmt, IFN_GOACC_LOOP)
+ && TREE_CODE (chrec) == SSA_NAME && def_bb
+ && SSA_NAME_DEF_STMT (chrec)->bb)
+ {
+ loop_p outer_loop = SSA_NAME_DEF_STMT (chrec)->bb->loop_father;
+ loop_p inner_loop = def_bb->loop_father;
+
+ if (outer_loop != inner_loop)
+ return scalar_evolution_in_region (region, outer_loop, chrec);
+ }
+ }
+
+ return instantiate_scev (region.entry, loop, chrec);
}
/* Return true if BB is empty, contains only DEBUG_INSNs. */
@@ -280,6 +280,7 @@ typedef struct gimple_poly_bb
vec<data_reference_p> data_refs;
vec<scalar_use> read_scalar_refs;
vec<tree> write_scalar_refs;
+ vec<tree> kill_scalar_refs;
} *gimple_poly_bb_p;
#define GBB_BB(GBB) (GBB)->bb
deleted file mode 100644
@@ -1,45 +0,0 @@
-/* Check offloaded function's attributes and classification for unparallelized
- OpenACC 'kernels'. */
-
-/* { dg-additional-options "-O2" }
- { dg-additional-options "-fno-openacc-kernels-annotate-loops" }
- { dg-additional-options "-fopt-info-note-optimized-omp" }
- { dg-additional-options "-fdump-tree-ompexp" }
- { dg-additional-options "-fdump-tree-parloops1-all" }
- { dg-additional-options "-fdump-tree-oaccloops1" } */
-
-/* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
- aspects of that functionality. */
-
-#define N 1024
-
-extern unsigned int *__restrict a;
-extern unsigned int *__restrict b;
-extern unsigned int *__restrict c;
-
-extern unsigned int f (unsigned int);
-#pragma acc routine (f) seq
-
-void KERNELS ()
-{
-#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
- for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
- /* An "extern"al mapping of loop iterations/array indices makes the loop
- unparallelizable. */
- c[i] = a[f (i)] + b[f (i)];
-}
-
-/* Check the offloaded function's attributes.
- { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels, omp target entrypoint\\)\\)" 1 "ompexp" } } */
-
-/* Check that exactly one OpenACC kernels construct is analyzed, and that it
- can't be parallelized.
- { dg-final { scan-tree-dump-times "FAILED:" 1 "parloops1" } }
- { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } }
- { dg-final { scan-tree-dump-not "SUCCESS: may be parallelized" "parloops1" } } */
-
-/* Check the offloaded function's classification and compute dimensions (will
- always be 1 x 1 x 1 for non-offloading compilation).
- { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops1" } }
- { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } }
- { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops1" } } */
@@ -20,7 +20,7 @@ extern unsigned int *__restrict c;
void KERNELS ()
{
#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
- for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
c[i] = a[i] + b[i];
}
deleted file mode 100644
@@ -1,36 +0,0 @@
-/* { dg-additional-options "--param=openacc-kernels=parloops" } as this is
- specifically testing "parloops" handling. */
-/* { dg-additional-options "-O2" } */
-/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */
-/* { dg-additional-options "-fdump-tree-parloops1-all" } */
-/* { dg-additional-options "-fdump-tree-optimized" } */
-
-#include <stdlib.h>
-
-#define n 10000
-
-unsigned int a[n];
-
-void __attribute__((noinline,noclone))
-foo (void)
-{
- int i;
- unsigned int sum = 1;
-
-#pragma acc kernels copyin (a[0:n]) copy (sum)
- {
- for (i = 0; i < n; ++i)
- sum += a[i];
- }
-
- if (sum != 5001)
- abort ();
-}
-
-/* Check that only one loop is analyzed, and that it can be parallelized. */
-/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
-/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint, noclone, noinline\\)\\)" 1 "parloops1" } } */
-/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
-
-/* Check that the loop has been split off into a function. */
-/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
@@ -16,7 +16,7 @@ main ()
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
/* Strangely indented to keep this similar to other test cases. */
- if (c) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ if (c) /* { dg-message "optimized: beginning .Graphite. region in OpenACC .kernels. construct" } */
{
#pragma acc loop seq
/* { dg-message "missed: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
@@ -2,7 +2,7 @@
construct containing loops. */
/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */
-/* { dg-additional-options "-fopt-info-note-optimized-omp" } */
+/* { dg-additional-options "-fopt-info-optimized-omp-note" } */
//TODO update accordingly
/* See also "../../gfortran.dg/goacc/note-parallelism.f90". */
@@ -15,7 +15,7 @@ main ()
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
/* Strangely indented to keep this similar to other test cases. */
{
- for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
;
for (x = 0; x < 10; x++)
@@ -13,36 +13,36 @@ main ()
int x, y, z;
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
- for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
;
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
- for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
;
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
- for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
for (y = 0; y < 10; y++)
for (z = 0; z < 10; z++)
;
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
- for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
;
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
- for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
for (y = 0; y < 10; y++)
;
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
- for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
for (y = 0; y < 10; y++)
for (z = 0; z < 10; z++)
;
#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
- for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .Graphite. region in OpenACC .kernels. construct" } */
for (y = 0; y < 10; y++)
for (z = 0; z < 10; z++)
;
new file mode 100644
@@ -0,0 +1,47 @@
+! Verify that Graphite's analysis of the CFG loops gets correctly
+! transferred to the OpenACC loop structure for loop-nests of depth 1
+
+! { dg-additional-options "-fdump-tree-graphite-details -fdump-tree-oaccloops1-details -fopt-info-optimized -fopt-info-missed" }
+! { dg-additional-options "--param max-isl-operations=0" }
+! { dg-additional-options "-O2" }
+! { dg-prune-output ".*not inlinable.*" }
+
+module test_module
+
+ real, allocatable :: array1(:)
+ real, allocatable :: array2(:)
+
+ contains
+
+subroutine test_loop_nest_depth_1 ()
+ implicit none
+
+ integer :: i,n
+
+ if (size (array1) /= size (array2)) return
+ n = size(array1)
+
+ !$acc parallel loop auto copy(array1, array2) ! { dg-message "assigned OpenACC gang vector loop parallelism" }
+ ! { dg-message "loop has no data-dependences" "" {target *-*-*} .-1 }
+ ! { dg-message ".auto. loop can be parallel" "" {target *-*-*} .-2 }
+ do i=1, n
+ array2(i) = array1(i) ! { dg-message "loop has no data-dependences" }
+ end do
+
+
+ !$acc parallel loop auto copy(array1, array2) ! { dg-message "assigned OpenACC seq loop parallelism" }
+ ! { dg-message "loop has no data-dependences" "" {target *-*-*} .-1 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-2 }
+ do i=1, n-1
+ array1(i+1) = array1(i) + 10 ! { dg-message "loop has data-dependences" }
+ array2(i) = array1(i)
+ end do
+
+ return
+end subroutine test_loop_nest_depth_1
+
+
+
+end module test_module
+
+! { dg-final { scan-tree-dump-times "number of SCoPs: 1" 2 "graphite" } }
new file mode 100644
@@ -0,0 +1,103 @@
+! Verify that Graphite's analysis of the CFG loops gets correctly
+! transferred to the OpenACC loop structure for loop-nests of depth 2
+
+! { dg-additional-options "-fdump-tree-graphite-details -fdump-tree-oaccloops1-details" }
+! { dg-additional-options "-fopt-info-optimized -fopt-info-missed" }
+! { dg-additional-options "-O2" }
+! { dg-prune-output ".*not inlinable.*" }
+
+module test_module
+ implicit none
+
+ integer, parameter :: n = 100
+ integer, parameter :: m = 100
+
+contains
+
+ subroutine test_loop_nest_depth_2 (array)
+ integer :: i, j
+ real :: array (2, n, m)
+
+ ! Perfect loop-nest, inner and outer loop can be parallel
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang worker loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, m
+ array (1, i, j) = array(2, i, j) ! { dg-message "loop has no data-dependences" }
+ end do
+ end do
+ !$acc end parallel
+
+ ! Imperfect loop-nest, inner and outer loop can be parallel
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang worker loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ array (2, i, n) = array(1, i, n) ! { dg-message "loop has no data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, m
+ array (1, i, j) = array (2, i,j) ! { dg-message "loop has no data-dependences" }
+ end do
+ end do
+ !$acc end parallel
+
+ ! Imperfect loop-nest, inner loop can be parallel, outer loop cannot be parallel
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do i=1, n-1
+ array (1, i+1, 1) = array (2, i, 1) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, m
+ array (1, i, j) = array (2, i, j) ! { dg-message "loop has no data-dependences" }
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest, inner loop can be parallel, outer loop cannot be parallel
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ array (2, i, n) = array (1, i, n) ! { dg-message "loop has no data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do j=1, m-1
+ array (1, i, j+1) = array (1, i, j) ! { dg-message "loop has data-dependences" }
+ end do
+ end do
+ !$acc end parallel
+ return
+ end subroutine test_loop_nest_depth_2
+
+end module test_module
+
+
+! { dg-final { scan-tree-dump-times "number of SCoPs: 1" 4 "graphite" } } One function per kernel, all should be analyzed
+! { dg-final { scan-tree-dump-times "number of SCoPs: 0" 1 "graphite" } } Original function should not be analyzed
new file mode 100644
@@ -0,0 +1,323 @@
+! Verify that Graphite's analysis of the CFG loops gets correctly
+! transferred to the OpenACC loop structure for loop-nests of depth 3
+
+! { dg-additional-options "-fdump-tree-graphite-details -fdump-tree-oaccloops1-details" }
+! { dg-additional-options "-fopt-info-optimized -fopt-info-missed" }
+! { dg-additional-options "-O2" }
+! { dg-prune-output ".*not inlinable.*" }
+
+module test_module
+ implicit none
+
+ integer, parameter :: n = 100
+
+contains
+
+ subroutine test_loop_nest_depth_3 (array)
+ integer :: i, j, k
+ real :: array (2, n, n, n)
+
+ ! Perfect loop-nest. Can be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC worker loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do k=1, n
+ array (1, i, j, k) = array(2, i, j, k) ! { dg-message "loop has no data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+ ! Perfect loop-nest. Innermost loop cannot be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang worker loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do k=1, n-1
+ array (1, i, j, k+1) = array(1, i, j, k) ! { dg-message "loop has data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Perfect loop-nest. Cannot be parallel because it contains no
+ ! data-reference and is hence not analyzed by Graphite. This is
+ ! expected: empty loops should not be parallel either cf. e.g.
+ ! "../../gfortran.dg/goacc/note-parallelism.f90".
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-missed ".auto. loop has not been analyzed .cf. .graphite. dumps for more information.." "" {target *-*-*} .-2 }
+ do i=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-missed ".auto. loop has not been analyzed .cf. .graphite. dumps for more information.." "" {target *-*-*} .-2 }
+ do j=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-bogus "loop has no data-dependences" "OpenACC internal chunking CFG loop not analyzed" {target *-*-*} .-2 }
+ ! { dg-missed ".auto. loop has not been analyzed .cf. .graphite. dumps for more information.." "" {target *-*-*} .-3 }
+ do k=1, n
+ array (1, i, j, k) = array(1, i, j, k) ! { dg-bogus "loop has no data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest. All levels can be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ array (2, i, n, n) = array (1, i, n, n) ! { dg-message "loop has no data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC worker loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, n-1
+ array (2, i, j, n) = array (1, i, j, n) ! { dg-message "loop has no data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do k=1, n-1
+ array (2, i, j, k) = array(1, i, j, k) ! { dg-message "loop has no data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest. First level can be parallel, second level
+ ! can be parallel, third level cannot be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang worker loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ array (2, i, n, n) = array (1, i, n, n) ! { dg-message "loop has no data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, n-1
+ array (2, i, j, n) = array (1, i, j, n) ! { dg-message "loop has no data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do k=1, n-1
+ array (1, i, j, k+1) = array(1, i, j, k) ! { dg-message "loop has data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest. First level can be parallel, second level
+ ! cannot be parallel, third level can be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang worker loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ array (2, i, n, n) = array (1, i, n, n) ! { dg-message "loop has no data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do j=1, n-1
+ array (1, i, j+1, n) = array (1, i, j, n) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do k=1, n-1
+ array (2, i, j, k) = array(1, i, j, k) ! { dg-message "loop has no data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest. First level can be parallel, second and
+ ! third level cannot be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do i=1, n
+ array (2, i, n, n) = array (1, i, n, n) ! { dg-message "loop has no data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do j=1, n-1
+ array (1, i, j+1, n) = array (1, i, j, n) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do k=1, n-1
+ array (1, i, j, k+1) = array(1, i, j, k) ! { dg-message "loop has data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest. First level cannot be parallel, second and
+ ! third levels can be parallel
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do i=1, n - 1
+ array (1, i+1, 1, 1) = array (1, i, 1, 1) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang worker loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do k=1, n
+ array (1, i, j, k) = array(2, i, j, k) ! { dg-message "loop has no data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest. First level cannot be parallel, second
+ ! level can be parallel, third level cannot be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do i=1, n - 1
+ array (1, i+1, 1, 1) = array (1, i, 1, 1) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do j=1, n
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do k=1, n - 1
+ array (1, i, j, k+1) = array(1, i, j, k) ! { dg-message "loop has data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest. First level cannot be parallel, second
+ ! level cannot be parallel, third level can be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do i=1, n - 1
+ array (1, i+1, 1, 1) = array (1, i, 1, 1) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do j=1, n - 1
+ array (1, i, j+1, 1) = array (1, i, j, 1) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC gang vector loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop can be parallel" "" {target *-*-*} .-3 }
+ do k=1, n
+ array (1, i, j, k) = array(2, i, j, k) ! { dg-message "loop has no data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+
+ ! Imperfect loop-nest. All levels cannot be parallel.
+
+ !$acc parallel copy(array)
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do i=1, n-1
+ array (1, i+1, 1, 1) = array (1, i, 1, 1) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do j=1, n-1
+ array (1, i, j+1, 1) = array (1, i, j, 1) ! { dg-message "loop has data-dependences" }
+ !$acc loop auto
+ ! { dg-message "assigned OpenACC seq loop parallelism" "" {target *-*-*} .-1 }
+ ! { dg-message "loop has no data-dependences" "OpenACC internal chunking CFG loop can be parallel" {target *-*-*} .-2 }
+ ! { dg-message "'auto' loop cannot be parallel" "" {target *-*-*} .-3 }
+ do k=1, n-1
+ array (1, i, j, k+1) = array(1, i, j, k) ! { dg-message "loop has data-dependences" }
+ end do
+ end do
+ end do
+ !$acc end parallel
+
+ return
+ end subroutine test_loop_nest_depth_3
+
+end module test_module
+
+
+! Outlined functions for all kernels but the one without data-references should be analyzed.
+! { dg-final { scan-tree-dump-times "number of SCoPs: 1" 10 "graphite" } }
+! Original test functon and one outlined kernel function should not be analyzed
+! { dg-final { scan-tree-dump-times "number of SCoPs: 0" 2 "graphite" } }
@@ -249,6 +249,7 @@ chrec_fold_plus_1 (enum tree_code code, tree type,
return chrec_fold_plus_poly_poly (code, type, op0, op1);
CASE_CONVERT:
+ case VIEW_CONVERT_EXPR:
{
/* We can strip sign-conversions to signed by performing the
operation in unsigned. */
@@ -282,6 +283,7 @@ chrec_fold_plus_1 (enum tree_code code, tree type,
}
CASE_CONVERT:
+ case VIEW_CONVERT_EXPR:
{
/* We can strip sign-conversions to signed by performing the
operation in unsigned. */
@@ -323,6 +325,7 @@ chrec_fold_plus_1 (enum tree_code code, tree type,
: build_int_cst_type (type, -1)));
CASE_CONVERT:
+ case VIEW_CONVERT_EXPR:
if (tree_contains_chrecs (op1, NULL))
return chrec_dont_know;
/* FALLTHRU */
@@ -99,6 +99,8 @@ along with GCC; see the file COPYING3. If not see
#include "internal-fn.h"
#include "range-op.h"
#include "vr-values.h"
+#include "print-tree.h"
+#include "graphite-oacc.h"
static struct datadep_stats
{
@@ -227,7 +229,10 @@ dump_data_reference (FILE *outf,
print_generic_stmt (outf, DR_REF (dr));
fprintf (outf, "# base_object: ");
print_generic_stmt (outf, DR_BASE_OBJECT (dr));
-
+ fprintf (outf, "# base_address: ");
+ print_generic_stmt (outf, DR_BASE_ADDRESS (dr));
+ fprintf (outf, "# loop-invariant offset: ");
+ print_generic_stmt (outf, DR_OFFSET (dr));
for (i = 0; i < DR_NUM_DIMENSIONS (dr); i++)
{
fprintf (outf, "# Access function %d: ", i);
@@ -5833,9 +5838,13 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
if (gimple_call_internal_p (stmt))
switch (gimple_call_internal_fn (stmt))
{
- case IFN_GOMP_SIMD_LANE:
- {
- class loop *loop = gimple_bb (stmt)->loop_father;
+ case IFN_UNIQUE:
+ case IFN_GOACC_REDUCTION:
+ case IFN_GOACC_LOOP:
+ return false;
+ case IFN_GOMP_SIMD_LANE:
+ {
+ class loop *loop = gimple_bb (stmt)->loop_father;
tree uid = gimple_call_arg (stmt, 0);
gcc_assert (TREE_CODE (uid) == SSA_NAME);
if (loop == NULL
@@ -6014,7 +6023,6 @@ graphite_find_data_references_in_stmt (edge nest, loop_p loop, gimple *stmt,
unsigned i;
auto_vec<data_ref_loc, 2> references;
data_ref_loc *ref;
- bool ret = true;
data_reference_p dr;
if (get_references_in_stmt (stmt, &references))
@@ -6028,7 +6036,7 @@ graphite_find_data_references_in_stmt (edge nest, loop_p loop, gimple *stmt,
datarefs->safe_push (dr);
}
- return ret;
+ return true;
}
/* Search the data references in LOOP, and record the information into
@@ -4174,7 +4174,16 @@ public:
virtual bool gate (function *)
{
if (oacc_kernels_p)
- return flag_openacc;
+ {
+ if (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE)
+ return false;
+
+ gcc_checking_assert (
+ param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+ return flag_openacc;
+ }
else
return flag_tree_parallelize_loops > 1;
}
@@ -4193,6 +4202,13 @@ public:
unsigned
pass_parallelize_loops::execute (function *fun)
{
+ if (oacc_kernels_p)
+ {
+ gcc_checking_assert (
+ param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+ }
+
tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
if (nthreads == NULL_TREE)
return 0;
@@ -264,6 +264,8 @@ along with GCC; see the file COPYING3. If not see
#include "gimple.h"
#include "ssa.h"
#include "gimple-pretty-print.h"
+#include "tree-pretty-print.h"
+#include "print-tree.h"
#include "fold-const.h"
#include "gimplify.h"
#include "gimple-iterator.h"
@@ -276,6 +278,8 @@ along with GCC; see the file COPYING3. If not see
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-chrec.h"
+#include "internal-fn.h"
+#include "graphite-oacc.h"
#include "tree-affine.h"
#include "tree-scalar-evolution.h"
#include "dumpfile.h"
@@ -284,6 +288,8 @@ along with GCC; see the file COPYING3. If not see
#include "tree-into-ssa.h"
#include "builtins.h"
#include "case-cfn-macros.h"
+#include "omp-offload.h"
+#include "internal-fn.h"
static tree analyze_scalar_evolution_1 (class loop *, tree);
static tree analyze_scalar_evolution_for_address_of (class loop *loop,
@@ -311,7 +317,19 @@ struct scev_info_hasher : ggc_ptr_hash<scev_info_str>
static GTY (()) hash_table<scev_info_hasher> *scalar_evolution_info;
-
+/* This flag indicates that internal OpenACC calls should be analyzed.
+ The analysis is not valid in general. It is used to allow Graphite
+ to analyze the partially lowered OpenACC loops as if it was seeing
+ the unlowered loops. */
+
+static bool analyze_openacc_calls = false;
+
+void set_scev_analyze_openacc_calls (bool analyze)
+{
+ analyze_openacc_calls = analyze;
+}
+
+
/* Constructs a new SCEV_INFO_STR structure for VAR and INSTANTIATED_BELOW. */
static inline struct scev_info_str *
@@ -577,6 +595,53 @@ get_scalar_evolution (basic_block instantiated_below, tree scalar)
return res;
}
+bool
+oacc_call_analyzable_p (gimple *stmt)
+{
+ return analyze_openacc_calls
+ && gimple_call_internal_p (stmt, IFN_GOACC_LOOP);
+}
+
+bool
+oacc_call_analyzable_p (tree t)
+{
+ return TREE_CODE (t) == SSA_NAME
+ && oacc_call_analyzable_p (SSA_NAME_DEF_STMT (t));
+}
+
+/* Extract loop information from a OpenACC internal function call. */
+
+tree
+oacc_ifn_call_extract (gimple *stmt)
+{
+ gcall *call = as_a<gcall *> (stmt);
+
+ if (oacc_call_analyzable_p (stmt))
+ {
+ gcc_assert (gimple_call_internal_p (stmt, IFN_GOACC_LOOP));
+ return oacc_extract_loop_call (as_a<gcall *> (stmt));
+ }
+
+ return chrec_dont_know;
+}
+
+/* If EXPR is a analyzable internal OpenACC function call,
+ return the result of its analysis; otherwise return EXPR. */
+
+tree
+oacc_simplify (tree expr)
+{
+ if (expr == NULL || TREE_CODE (expr) != SSA_NAME)
+ return expr;
+
+ gimple *def = SSA_NAME_DEF_STMT (expr);
+
+ if (oacc_call_analyzable_p (def))
+ return oacc_ifn_call_extract (def);
+
+ return expr;
+}
+
/* Helper function for add_to_evolution. Returns the evolution
function for an assignment of the form "a = b + c", where "a" and
"b" are on the strongly connected component. CHREC_BEFORE is the
@@ -794,6 +859,8 @@ add_to_evolution (unsigned loop_nb, tree chrec_before, enum tree_code code,
if (to_add == NULL_TREE)
return chrec_before;
+ to_add = oacc_simplify (to_add);
+
/* TO_ADD is either a scalar, or a parameter. TO_ADD is not
instantiated at this point. */
if (TREE_CODE (to_add) == POLYNOMIAL_CHREC)
@@ -966,6 +1033,7 @@ follow_ssa_edge_binary (class loop *loop, gimple *at_stmt,
res = t_false;
}
+ *evolution_of_loop = oacc_simplify (*evolution_of_loop);
return res;
}
@@ -1116,6 +1184,8 @@ follow_ssa_edge_inner_loop_phi (class loop *outer_loop,
evolution_of_loop, limit);
}
+tree interpret_gimple_call (class loop *loop, gimple *call);
+
/* Follow the ssa edge into the expression EXPR.
Return true if the strongly connected component has been found. */
@@ -1124,8 +1194,11 @@ follow_ssa_edge_expr (class loop *loop, gimple *at_stmt, tree expr,
gphi *halting_phi, tree *evolution_of_loop,
int limit)
{
- enum tree_code code;
- tree type, rhs0, rhs1 = NULL_TREE;
+ enum tree_code code = LAST_AND_UNUSED_TREE_CODE;
+ tree type = NULL_TREE;
+ tree rhs0 = NULL_TREE;
+ tree rhs1 = NULL_TREE;
+
/* The EXPR is one of the following cases:
- an SSA_NAME,
@@ -1140,6 +1213,7 @@ follow_ssa_edge_expr (class loop *loop, gimple *at_stmt, tree expr,
PHI nodes and otherwise expand appropriately for the expression
handling below. */
tail_recurse:
+ expr = oacc_simplify (expr);
if (TREE_CODE (expr) == SSA_NAME)
{
gimple *def = SSA_NAME_DEF_STMT (expr);
@@ -1187,28 +1261,37 @@ tail_recurse:
return t_false;
}
- /* At this level of abstraction, the program is just a set
- of GIMPLE_ASSIGNs and PHI_NODEs. In principle there is no
- other def to be handled. */
- if (!is_gimple_assign (def))
- return t_false;
+ /* At this level of abstraction, the program is just a set of
+ GIMPLE_ASSIGNs and PHI_NODEs. In principle there is no other def to
+ be handled except for OpenACC internal function calls. */
+ if (is_gimple_assign (def))
+ {
+ code = gimple_assign_rhs_code (def);
+
+ switch (get_gimple_rhs_class (code))
+ {
+ case GIMPLE_BINARY_RHS:
+ rhs0 = gimple_assign_rhs1 (def);
+ rhs1 = gimple_assign_rhs2 (def);
+ break;
+ case GIMPLE_UNARY_RHS:
+ case GIMPLE_SINGLE_RHS:
+ rhs0 = gimple_assign_rhs1 (def);
+ break;
+ default:
+ return t_false;
+ }
+ type = TREE_TYPE (gimple_assign_lhs (def));
+ at_stmt = def;
+ }
+ else if (oacc_call_analyzable_p (expr)) {
+ // TODO-kernels Is this still needed here?
+ rhs0 = interpret_gimple_call (loop, def);
+ type = TREE_TYPE (gimple_call_lhs (def));
+ at_stmt = def;
+ }
+ else return t_false;
- code = gimple_assign_rhs_code (def);
- switch (get_gimple_rhs_class (code))
- {
- case GIMPLE_BINARY_RHS:
- rhs0 = gimple_assign_rhs1 (def);
- rhs1 = gimple_assign_rhs2 (def);
- break;
- case GIMPLE_UNARY_RHS:
- case GIMPLE_SINGLE_RHS:
- rhs0 = gimple_assign_rhs1 (def);
- break;
- default:
- return t_false;
- }
- type = TREE_TYPE (gimple_assign_lhs (def));
- at_stmt = def;
}
else
{
@@ -1473,6 +1556,7 @@ follow_copies_to_constant (tree var)
else
break;
}
+ res = oacc_simplify (res);
if (CONSTANT_CLASS_P (res))
return res;
return var;
@@ -1506,6 +1590,7 @@ analyze_initial_condition (gphi *loop_phi_node)
tree branch = PHI_ARG_DEF (loop_phi_node, i);
basic_block bb = gimple_phi_arg_edge (loop_phi_node, i)->src;
+ branch = oacc_simplify (branch);
/* When the branch is oriented to the loop's body, it does
not contribute to the initial condition. */
if (flow_bb_inside_loop_p (loop, bb))
@@ -1533,6 +1618,7 @@ analyze_initial_condition (gphi *loop_phi_node)
/* We may not have fully constant propagated IL. Handle degenerate PHIs here
to not miss important early loop unrollings. */
init_cond = follow_copies_to_constant (init_cond);
+ init_cond = oacc_simplify (init_cond);
if (dump_file && (dump_flags & TDF_SCEV))
{
@@ -1558,6 +1644,7 @@ interpret_loop_phi (class loop *loop, gphi *loop_phi_node)
/* Otherwise really interpret the loop phi. */
init_cond = analyze_initial_condition (loop_phi_node);
res = analyze_evolution_in_loop (loop_phi_node, init_cond);
+ init_cond = analyze_initial_condition (loop_phi_node);
/* Verify we maintained the correct initial condition throughout
possible conversions in the SSA chain. */
@@ -1630,8 +1717,11 @@ interpret_rhs_expr (class loop *loop, gimple *at_stmt,
return chrec_convert (type, rhs1, at_stmt);
if (code == SSA_NAME)
- return chrec_convert (type, analyze_scalar_evolution (loop, rhs1),
- at_stmt);
+ {
+ rhs1 = oacc_simplify (rhs1);
+ return chrec_convert (type, analyze_scalar_evolution (loop, rhs1),
+ at_stmt);
+ }
if (code == ASSERT_EXPR)
{
@@ -1920,7 +2010,25 @@ interpret_gimple_assign (class loop *loop, gimple *stmt)
gimple_assign_rhs2 (stmt));
}
-
+/* Interpret a gimple call statement. */
+
+tree
+interpret_gimple_call (class loop *loop __attribute__ ((__unused__)), gimple *call)
+{
+
+ /* Information about OpenACC loops is encoded in internal function calls.
+ Extract loop information from those calls. Ignore other calls for now. */
+ if (!oacc_call_analyzable_p (call))
+ return chrec_dont_know;
+
+ tree expr = oacc_ifn_call_extract (call);
+ tree analyzed = expr;
+
+ tree lhs = gimple_call_lhs (call);
+ gcc_assert (lhs);
+
+ return chrec_convert (TREE_TYPE (lhs), analyzed, call);
+}
/* This section contains all the entry points:
- number_of_iterations_in_loop,
@@ -1943,6 +2051,8 @@ analyze_scalar_evolution_1 (class loop *loop, tree var)
def = SSA_NAME_DEF_STMT (var);
bb = gimple_bb (def);
+ if (!bb)
+ return chrec_dont_know;
def_loop = bb->loop_father;
if (!flow_bb_inside_loop_p (loop, bb))
@@ -1969,6 +2079,10 @@ analyze_scalar_evolution_1 (class loop *loop, tree var)
res = interpret_gimple_assign (loop, def);
break;
+ case GIMPLE_CALL:
+ res = interpret_gimple_call (loop, def);
+ break;
+
case GIMPLE_PHI:
if (loop_phi_node_p (def))
res = interpret_loop_phi (loop, as_a <gphi *> (def));
@@ -2261,6 +2375,14 @@ instantiate_scev_name (edge instantiate_below,
class loop *def_loop;
basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (chrec));
+ if (oacc_call_analyzable_p (chrec))
+ {
+ tree res
+ = interpret_gimple_call (evolution_loop, SSA_NAME_DEF_STMT (chrec));
+
+ return res;
+ }
+
/* A parameter, nothing to do. */
if (!def_bb
|| !dominated_by_p (CDI_DOMINATORS, def_bb, instantiate_below->dest))
@@ -3375,6 +3497,9 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
return true;
}
+ if (oacc_call_analyzable_p (expr))
+ return false;
+
bool visited_p;
uint64_t &local_cost = cache.get_or_insert (expr, &visited_p);
if (visited_p)
@@ -42,6 +42,9 @@ extern bool simple_iv (class loop *, class loop *, tree, struct affine_iv *,
bool);
extern bool iv_can_overflow_p (class loop *, tree, tree, tree);
extern tree compute_overall_effect_of_inner_loop (class loop *, tree);
+extern void set_scev_analyze_openacc_calls (bool);
+extern bool oacc_call_analyzable_p (gimple);
+extern bool oacc_call_analyzable_p (tree);
/* Returns the basic block preceding LOOP, or the CFG entry block when
the loop is function's body. */
@@ -256,6 +256,17 @@ mark_stmt_if_obviously_necessary (gimple *stmt, bool aggressive)
if (gimple_has_side_effects (stmt))
{
mark_stmt_necessary (stmt, true);
+
+ /* The lhs of the OpenACC loop and reduction calls necessary,
+ cf. the lowering in omp-offload.c. */
+ if (gimple_call_internal_p (stmt, IFN_UNIQUE)
+ || gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
+ {
+ tree lhs = gimple_call_lhs (stmt);
+ if (lhs)
+ mark_operand_necessary (lhs);
+ }
+
return;
}
/* IFN_GOACC_LOOP calls are necessary in that they are used to
@@ -267,6 +278,9 @@ mark_stmt_if_obviously_necessary (gimple *stmt, bool aggressive)
if (gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
{
mark_stmt_necessary (stmt, true);
+ tree lhs = gimple_call_lhs (stmt);
+ gcc_assert (lhs);
+ mark_operand_necessary (lhs);
return;
}
if (!gimple_call_lhs (stmt))
@@ -1980,6 +1980,9 @@ simplify_replace_tree (tree expr, tree old, tree new_tree,
return (ret ? (do_fold ? fold (ret) : ret) : expr);
}
+bool oacc_call_analyzable_p (gimple* stmt);
+tree interpret_gimple_call (class loop *loop, gimple *call);
+
/* Expand definitions of ssa names in EXPR as long as they are simple
enough, and return the new expression. If STOP is specified, stop
expanding if EXPR equals to it. */
@@ -1995,6 +1998,9 @@ expand_simple_operations (tree expr, tree stop, hash_map<tree, tree> &cache)
if (expr == NULL_TREE)
return expr;
+ if (oacc_call_analyzable_p (expr))
+ expr = interpret_gimple_call (NULL, SSA_NAME_DEF_STMT (expr));
+
if (is_gimple_min_invariant (expr))
return expr;
@@ -155,6 +155,13 @@ make_pass_tree_loop (gcc::context *ctxt)
static bool
gate_oacc_kernels (function *fn)
{
+ if (param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE)
+ return false;
+
+ gcc_checking_assert (param_openacc_kernels
+ == OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ || param_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
if (!flag_openacc)
return false;
@@ -324,6 +331,10 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
+ if (param_openacc_kernels != OPENACC_KERNELS_DECOMPOSE_PARLOOPS
+ && param_openacc_kernels != OPENACC_KERNELS_PARLOOPS)
+ return false;
+
return (optimize
&& flag_openacc
/* Don't bother doing anything if the program has errors. */
@@ -3,6 +3,8 @@
/* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
aspects of that functionality. */
+/* { dg-additional-options "-O2" } for Graphite/"kernels". */
+
/* See also '../libgomp.oacc-fortran/parallel-dims.f90'. */
@@ -7,7 +7,7 @@
program main
integer :: w, arr(0:31)
- !$acc parallel num_gangs(32) num_workers(32) copyout(arr) ! { dg-warning "region is worker partitioned" }
+ !$acc parallel num_gangs(32) num_workers(32) copyout(arr) ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" }
!$acc loop gang private(w)
do j = 0, 31
w = 0
@@ -1,5 +1,6 @@
! { dg-do run }
! { dg-additional-options "-cpp" }
+! { dg-additional-options "-O2" } for Graphite
#define N (1024 * 512)
@@ -1,6 +1,7 @@
! Exercise the auto, independent, seq and tile loop clauses inside
! kernels regions.
+! { dg-additional-options "-O2" } for Graphite
! { dg-do run }
program loops
@@ -1,6 +1,7 @@
! { dg-do run }
! { dg-additional-options "-fopt-info-omp-all" }
! { dg-additional-options "--param=openacc-kernels=decompose" }
+! { dg-additional-options "-O2" } for Graphite
! It's only with Tcl 8.5 (released in 2007) that "the variable 'varName'
! passed to 'incr' may be unset, and in that case, it will be set to [...]",