@@ -1,5 +1,22 @@
2015-04-21 Tom de Vries <tom@codesourcery.com>
+ * passes.def: Add pass_parallelize_loops_oacc_kernels in pass group
+ pass_oacc_kernels.
+ * tree-parloops.c (create_parallel_loop, gen_parallel_loop): Add
+ function parameters region_entry and bool oacc_kernels_p. Handle
+ oacc_kernels_p.
+ Call create_parallel_loop with additional args.
+ (parallelize_loops): Add function parameter oacc_kernels_p. Calculate
+ dominance info. Skip loops that are not in a kernels region. Call
+ gen_parallel_loop with additional args.
+ (pass_parallelize_loops::execute): Call parallelize_loops with false
+ argument.
+ (pass_data_parallelize_loops_oacc_kernels): New pass_data.
+ (class pass_parallelize_loops_oacc_kernels): New pass.
+ (pass_parallelize_loops_oacc_kernels::execute)
+ (make_pass_parallelize_loops_oacc_kernels): New function.
+ * tree-pass.h (make_pass_parallelize_loops_oacc_kernels): Declare.
+
* passes.def: Add pass_copy_prop to pass group pass_oacc_kernels.
* tree-ssa-copy.c (stmt_may_generate_copy): Handle .omp_data_i init
conservatively.
@@ -94,6 +94,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_tree_loop_init);
NEXT_PASS (pass_lim);
NEXT_PASS (pass_copy_prop);
+ NEXT_PASS (pass_parallelize_loops_oacc_kernels);
NEXT_PASS (pass_expand_omp_ssa);
NEXT_PASS (pass_tree_loop_done);
POP_INSERT_PASSES ()
@@ -1,6 +1,11 @@
2015-04-21 Tom de Vries <tom@codesourcery.com>
Thomas Schwinge <thomas@codesourcery.com>
+ * c-c++-common/goacc/kernels-loop-2.c: New test.
+ * c-c++-common/goacc/kernels-loop.c: New test.
+ * c-c++-common/goacc/kernels-loop-n.c: New test.
+ * c-c++-common/goacc/kernels-loop-mod-not-zero.c: New test.
+
* c-c++-common/restrict-2.c: Update for new pass_lim.
* c-c++-common/restrict-4.c: Same.
* g++.dg/tree-ssa/pr33615.C: Same.
new file mode 100644
@@ -0,0 +1,62 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc kernels copyout (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+#pragma acc kernels copyout (b[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
new file mode 100644
@@ -0,0 +1,53 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N ((1024 * 512) + 1)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
new file mode 100644
@@ -0,0 +1,48 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* TODO: parallelize this example. */
+
+#include <stdlib.h>
+
+#define N ((1024 * 512) + 1)
+#define COUNTERTYPE unsigned int
+
+static int __attribute__((noinline,noclone))
+foo (COUNTERTYPE n)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:n], b[0:n]) copyout (c[0:n])
+ {
+ for (COUNTERTYPE ii = 0; ii < n; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+int
+main (void)
+{
+ return foo (N);
+}
new file mode 100644
@@ -0,0 +1,53 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
@@ -1612,9 +1612,10 @@ transform_to_exit_first_loop (struct loop *loop,
of LOOP_FN. N_THREADS is the requested number of threads. Returns the
basic block containing GIMPLE_OMP_PARALLEL tree. */
-static basic_block
+static void
create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
- tree new_data, unsigned n_threads, location_t loc)
+ tree new_data, unsigned n_threads, location_t loc,
+ basic_block region_entry, bool oacc_kernels_p)
{
gimple_stmt_iterator gsi;
basic_block bb, paral_bb, for_bb, ex_bb;
@@ -1631,15 +1632,69 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
/* Prepare the GIMPLE_OMP_PARALLEL statement. */
bb = loop_preheader_edge (loop)->src;
paral_bb = single_pred (bb);
- gsi = gsi_last_bb (paral_bb);
+ if (!oacc_kernels_p)
+ gsi = gsi_last_bb (paral_bb);
+ else
+ /* Make sure the oacc parallel is inserted on top of the oacc kernels
+ region. */
+ gsi = gsi_last_bb (region_entry);
- t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
- OMP_CLAUSE_NUM_THREADS_EXPR (t)
- = build_int_cst (integer_type_node, n_threads);
- omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
- gimple_set_location (omp_par_stmt, loc);
+ if (!oacc_kernels_p)
+ {
+ t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+ OMP_CLAUSE_NUM_THREADS_EXPR (t)
+ = build_int_cst (integer_type_node, n_threads);
+ omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+ gimple_set_location (omp_par_stmt, loc);
- gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
+ gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
+ }
+ else
+ {
+ /* Create oacc parallel pragma based on oacc kernels pragma. */
+ gomp_target *kernels = as_a <gomp_target *> (gsi_stmt (gsi));
+
+ tree clauses = gimple_omp_target_clauses (kernels);
+ /* FIXME: We need a more intelligent mapping onto vector, gangs,
+ workers. */
+ if (1)
+ {
+ tree clause = build_omp_clause (gimple_location (kernels),
+ OMP_CLAUSE_VECTOR_LENGTH);
+ OMP_CLAUSE_VECTOR_LENGTH_EXPR (clause)
+ = build_int_cst (integer_type_node, n_threads);
+ OMP_CLAUSE_CHAIN (clause) = clauses;
+ clauses = clause;
+ }
+ gomp_target *stmt
+ = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_PARALLEL,
+ clauses);
+ tree child_fn = gimple_omp_target_child_fn (kernels);
+ gimple_omp_target_set_child_fn (stmt, child_fn);
+ tree data_arg = gimple_omp_target_data_arg (kernels);
+ gimple_omp_target_set_data_arg (stmt, data_arg);
+
+ gimple_set_location (stmt, loc);
+
+ /* Insert oacc parallel pragma after the oacc kernels pragma. */
+ {
+ gimple_stmt_iterator gsi2;
+ gsi = gsi_last_bb (region_entry);
+ gsi2 = gsi;
+ gsi_prev (&gsi2);
+
+ /* Insert pragma acc parallel. */
+ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+
+ /* Remove GOACC_kernels. */
+ replace_uses_by (gimple_vdef (gsi_stmt (gsi2)),
+ gimple_vuse (gsi_stmt (gsi2)));
+ gsi_remove (&gsi2, true);
+
+ /* Remove pragma acc kernels. */
+ gsi_remove (&gsi2, true);
+ }
+ }
/* Initialize NEW_DATA. */
if (data)
@@ -1657,12 +1712,18 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
}
- /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
- bb = split_loop_exit_edge (single_dom_exit (loop));
- gsi = gsi_last_bb (bb);
- omp_return_stmt1 = gimple_build_omp_return (false);
- gimple_set_location (omp_return_stmt1, loc);
- gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
+ /* Skip insertion of OMP_RETURN for oacc_kernels_p. We've already generated
+ one when lowering the oacc kernels directive in
+ pass_lower_omp/lower_omp (). */
+ if (!oacc_kernels_p)
+ {
+ /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
+ bb = split_loop_exit_edge (single_dom_exit (loop));
+ gsi = gsi_last_bb (bb);
+ omp_return_stmt1 = gimple_build_omp_return (false);
+ gimple_set_location (omp_return_stmt1, loc);
+ gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
+ }
/* Extract data for GIMPLE_OMP_FOR. */
gcc_assert (loop->header == single_dom_exit (loop)->src);
@@ -1719,7 +1780,11 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
- for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
+ for_stmt = gimple_build_omp_for (NULL,
+ (oacc_kernels_p
+ ? GF_OMP_FOR_KIND_OACC_LOOP
+ : GF_OMP_FOR_KIND_FOR),
+ NULL_TREE, 1, NULL);
gimple_set_location (for_stmt, loc);
gimple_omp_for_set_index (for_stmt, 0, initvar);
gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
@@ -1749,8 +1814,6 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
/* After the above dom info is hosed. Re-compute it. */
free_dominance_info (CDI_DOMINATORS);
calculate_dominance_info (CDI_DOMINATORS);
-
- return paral_bb;
}
/* Generates code to execute the iterations of LOOP in N_THREADS
@@ -1762,7 +1825,8 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
static void
gen_parallel_loop (struct loop *loop,
reduction_info_table_type *reduction_list,
- unsigned n_threads, struct tree_niter_desc *niter)
+ unsigned n_threads, struct tree_niter_desc *niter,
+ basic_block region_entry, bool oacc_kernels_p)
{
tree many_iterations_cond, type, nit;
tree arg_struct, new_arg_struct;
@@ -1843,41 +1907,44 @@ gen_parallel_loop (struct loop *loop,
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- if (loop->inner)
- m_p_thread=2;
- else
- m_p_thread=MIN_PER_THREAD;
-
- many_iterations_cond =
- fold_build2 (GE_EXPR, boolean_type_node,
- nit, build_int_cst (type, m_p_thread * n_threads));
-
- many_iterations_cond
- = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
- invert_truthvalue (unshare_expr (niter->may_be_zero)),
- many_iterations_cond);
- many_iterations_cond
- = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
- if (stmts)
- gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- if (!is_gimple_condexpr (many_iterations_cond))
+ if (!oacc_kernels_p)
{
+ if (loop->inner)
+ m_p_thread=2;
+ else
+ m_p_thread=MIN_PER_THREAD;
+
+ many_iterations_cond =
+ fold_build2 (GE_EXPR, boolean_type_node,
+ nit, build_int_cst (type, m_p_thread * n_threads));
+
many_iterations_cond
- = force_gimple_operand (many_iterations_cond, &stmts,
- true, NULL_TREE);
+ = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+ invert_truthvalue (unshare_expr (niter->may_be_zero)),
+ many_iterations_cond);
+ many_iterations_cond
+ = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+ if (!is_gimple_condexpr (many_iterations_cond))
+ {
+ many_iterations_cond
+ = force_gimple_operand (many_iterations_cond, &stmts,
+ true, NULL_TREE);
+ if (stmts)
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+ }
+
+ initialize_original_copy_tables ();
+
+ /* We assume that the loop usually iterates a lot. */
+ prob = 4 * REG_BR_PROB_BASE / 5;
+ loop_version (loop, many_iterations_cond, NULL,
+ prob, prob, REG_BR_PROB_BASE - prob, true);
+ update_ssa (TODO_update_ssa);
+ free_original_copy_tables ();
}
- initialize_original_copy_tables ();
-
- /* We assume that the loop usually iterates a lot. */
- prob = 4 * REG_BR_PROB_BASE / 5;
- loop_version (loop, many_iterations_cond, NULL,
- prob, prob, REG_BR_PROB_BASE - prob, true);
- update_ssa (TODO_update_ssa);
- free_original_copy_tables ();
-
/* Base all the induction variables in LOOP on a single control one. */
canonicalize_loop_ivs (loop, &nit, true);
@@ -1893,19 +1960,30 @@ gen_parallel_loop (struct loop *loop,
entry = loop_preheader_edge (loop);
exit = single_dom_exit (loop);
- eliminate_local_variables (entry, exit);
- /* In the old loop, move all variables non-local to the loop to a structure
- and back, and create separate decls for the variables used in loop. */
- separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
- &new_arg_struct, &clsn_data);
+ /* This rewrites the body in terms of new variables. This has already
+ been done for oacc_kernels_p in pass_lower_omp/lower_omp (). */
+ if (!oacc_kernels_p)
+ {
+ eliminate_local_variables (entry, exit);
+ /* In the old loop, move all variables non-local to the loop to a
+ structure and back, and create separate decls for the variables used in
+ loop. */
+ separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
+ &new_arg_struct, &clsn_data);
+ }
+ else
+ {
+ arg_struct = NULL_TREE;
+ new_arg_struct = NULL_TREE;
+ }
/* Create the parallel constructs. */
loc = UNKNOWN_LOCATION;
cond_stmt = last_stmt (loop->header);
if (cond_stmt)
loc = gimple_location (cond_stmt);
- create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
- new_arg_struct, n_threads, loc);
+ create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct,
+ n_threads, loc, region_entry, oacc_kernels_p);
if (reduction_list->elements () > 0)
create_call_for_reduction (loop, reduction_list, &clsn_data);
@@ -2145,7 +2223,7 @@ try_create_reduction_list (loop_p loop,
otherwise. */
static bool
-parallelize_loops (void)
+parallelize_loops (bool oacc_kernels_p)
{
unsigned n_threads = flag_tree_parallelize_loops;
bool changed = false;
@@ -2154,6 +2232,7 @@ parallelize_loops (void)
struct obstack parloop_obstack;
HOST_WIDE_INT estimated;
source_location loop_loc;
+ basic_block region_entry, region_exit;
/* Do not parallelize loops in the functions created by parallelization. */
if (parallelized_function_p (cfun->decl))
@@ -2165,9 +2244,46 @@ parallelize_loops (void)
reduction_info_table_type reduction_list (10);
init_stmt_vec_info_vec ();
+ calculate_dominance_info (CDI_DOMINATORS);
+
FOR_EACH_LOOP (loop, 0)
{
reduction_list.empty ();
+
+ if (oacc_kernels_p)
+ {
+ if (!loop_in_oacc_kernels_region_p (loop, ®ion_entry,
+ ®ion_exit))
+ continue;
+
+ /* TODO: Allow nested loops. */
+ if (loop->inner)
+ continue;
+
+ gcc_assert (single_succ_p (region_entry));
+ basic_block first = single_succ (region_entry);
+
+ /* TODO: Allow conditional loop entry. This test triggers when the
+ loop bound is not known at compile time. */
+ if (!single_succ_p (first))
+ continue;
+
+ /* TODO: allow more complex loops. */
+ if (single_exit (loop) == NULL)
+ continue;
+
+ /* TODO: Allow other code than a single loop inside a kernels
+ region. */
+ if (loop->header != single_succ (first)
+ || single_exit (loop)->dest != region_exit)
+ continue;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ "Trying loop %d with header bb %d in oacc kernels region\n",
+ loop->num, loop->header->index);
+ }
+
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
@@ -2209,6 +2325,7 @@ parallelize_loops (void)
/* FIXME: Bypass this check as graphite doesn't update the
count and frequency correctly now. */
if (!flag_loop_parallelize_all
+ && !oacc_kernels_p
&& ((estimated != -1
&& estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
/* Do not bother with loops in cold areas. */
@@ -2237,8 +2354,9 @@ parallelize_loops (void)
fprintf (dump_file, "\nloop at %s:%d: ",
LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
}
+
gen_parallel_loop (loop, &reduction_list,
- n_threads, &niter_desc);
+ n_threads, &niter_desc, region_entry, oacc_kernels_p);
}
free_stmt_vec_info_vec ();
@@ -2289,7 +2407,7 @@ pass_parallelize_loops::execute (function *fun)
if (number_of_loops (fun) <= 1)
return 0;
- if (parallelize_loops ())
+ if (parallelize_loops (false))
{
fun->curr_properties &= ~(PROP_gimple_eomp);
return TODO_update_ssa;
@@ -2305,3 +2423,51 @@ make_pass_parallelize_loops (gcc::context *ctxt)
{
return new pass_parallelize_loops (ctxt);
}
+
+namespace {
+
+const pass_data pass_data_parallelize_loops_oacc_kernels =
+{
+ GIMPLE_PASS, /* type */
+ "parloops_oacc_kernels", /* name */
+ OPTGROUP_LOOP, /* optinfo_flags */
+ TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
+ ( PROP_cfg | PROP_ssa ), /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass
+{
+public:
+ pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *) { return flag_tree_parallelize_loops > 1; }
+ virtual unsigned int execute (function *);
+
+}; // class pass_parallelize_loops_oacc_kernels
+
+unsigned
+pass_parallelize_loops_oacc_kernels::execute (function *fun)
+{
+ if (number_of_loops (fun) <= 1)
+ return 0;
+
+ if (parallelize_loops (true))
+ return TODO_update_ssa;
+
+ return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+{
+ return new pass_parallelize_loops_oacc_kernels (ctxt);
+}
@@ -375,6 +375,8 @@ extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
+extern gimple_opt_pass *
+ make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
@@ -1,3 +1,12 @@
+2015-04-21 Tom de Vries <tom@codesourcery.com>
+ Thomas Schwinge <thomas@codesourcery.com>
+
+ * testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c: New test.
+ * testsuite/libgomp.oacc-c-c++-common/kernels-loop.c: New test.
+ * testsuite/libgomp.oacc-c-c++-common/kernels-loop-n.c: New test.
+ * testsuite/libgomp.oacc-c-c++-common/kernels-loop-mod-not-zero.c:
+ New test.
+
2015-03-13 Thomas Schwinge <thomas@codesourcery.com>
* testsuite/libgomp.fortran/fortran.exp (DG_TORTURE_OPTIONS): Add
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc kernels copyout (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+#pragma acc kernels copyout (b[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N ((1024 * 512) + 1)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N ((1024 * 512) + 1)
+#define COUNTERTYPE unsigned int
+
+static int __attribute__((noinline,noclone))
+foo (COUNTERTYPE n)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:n], b[0:n]) copyout (c[0:n])
+ {
+ for (COUNTERTYPE ii = 0; ii < n; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+int
+main (void)
+{
+ return foo (N);
+}
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}