2014-11-14 Tom de Vries <tom@codesourcery.com>
* passes.def: Add pass_parallelize_loops_oacc_kernels in pass group
pass_oacc_kernels. Move pass_expand_omp_ssa into pass group
pass_oacc_kernels.
* tree-parloops.c (create_parallel_loop): Add function parameters
region_entry and bool oacc_kernels_p. Handle oacc_kernels_p.
(gen_parallel_loop): Same. Use omp_expand_local if oacc_kernels_p.
Call create_parallel_loop with additional args.
(parallelize_loops): Add function parameter oacc_kernels_p. Calculate
dominance info. Skip loops that are not in a kernels region. Call
gen_parallel_loop with additional args.
(pass_parallelize_loops::execute): Call parallelize_loops with false
argument.
(pass_data_parallelize_loops_oacc_kernels): New pass_data.
(class pass_parallelize_loops_oacc_kernels): New pass.
(pass_parallelize_loops_oacc_kernels::execute)
(make_pass_parallelize_loops_oacc_kernels): New function.
* tree-pass.h (make_pass_parallelize_loops_oacc_kernels): Declare.
* testsuite/libgomp.oacc-c/oacc-kernels-2-run.c: New test.
* testsuite/libgomp.oacc-c/oacc-kernels-run.c: New test.
* gcc.dg/oacc-kernels-2.c: New test.
* gcc.dg/oacc-kernels.c: New test.
---
gcc/passes.def | 3 +-
gcc/testsuite/gcc.dg/oacc-kernels-2.c | 79 +++++++
gcc/testsuite/gcc.dg/oacc-kernels.c | 71 ++++++
gcc/tree-parloops.c | 242 ++++++++++++++++-----
gcc/tree-pass.h | 2 +
.../testsuite/libgomp.oacc-c/oacc-kernels-2-run.c | 65 ++++++
.../testsuite/libgomp.oacc-c/oacc-kernels-run.c | 59 +++++
7 files changed, 465 insertions(+), 56 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/oacc-kernels-2.c
create mode 100644 gcc/testsuite/gcc.dg/oacc-kernels.c
create mode 100644 libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c
create mode 100644 libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c
@@ -80,9 +80,10 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_tree_loop_init);
NEXT_PASS (pass_lim);
NEXT_PASS (pass_ccp);
+ NEXT_PASS (pass_parallelize_loops_oacc_kernels);
+ NEXT_PASS (pass_expand_omp_ssa);
NEXT_PASS (pass_tree_loop_done);
POP_INSERT_PASSES ()
- NEXT_PASS (pass_expand_omp_ssa);
NEXT_PASS (pass_fre);
NEXT_PASS (pass_merge_phi);
NEXT_PASS (pass_cd_dce);
new file mode 100644
@@ -0,0 +1,79 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fopenacc } */
+/* { dg-options "-fopenacc -ftree-parallelize-loops=32 -O2 -std=c99 -fdump-tree-parloops_oacc_kernels-all -fdump-tree-copyrename" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+ unsigned int i;
+
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = malloc (N * sizeof (unsigned int));
+ b = malloc (N * sizeof (unsigned int));
+ c = malloc (N * sizeof (unsigned int));
+
+
+#pragma acc kernels copyout (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+#pragma acc kernels copyout (b[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ {
+ unsigned int sum = 0;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ sum += c[i];
+
+ printf ("sum: %u\n", sum);
+
+ if (sum != N_REF)
+ abort ();
+ }
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. It pops up first in
+ all_passes/pass_all_optimizations/pass_rename_ssa_copies. */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.0 " 1 "copyrename2" } } */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.1 " 1 "copyrename2" } } */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.2 " 1 "copyrename2" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "copyrename*" } } */
new file mode 100644
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fopenacc } */
+/* { dg-options "-fopenacc -ftree-parallelize-loops=32 -O2 -std=c99 -fdump-tree-parloops_oacc_kernels-all -fdump-tree-copyrename" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+ unsigned int i;
+
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = malloc (N * sizeof (unsigned int));
+ b = malloc (N * sizeof (unsigned int));
+ c = malloc (N * sizeof (unsigned int));
+
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ {
+ unsigned int sum = 0;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ sum += c[i];
+
+ printf ("sum: %u\n", sum);
+
+ if (sum != N_REF)
+ abort ();
+ }
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. It pops up first in
+ all_passes/pass_all_optimizations/pass_rename_ssa_copies. */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.0 " 1 "copyrename2" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "copyrename*" } } */
@@ -1611,7 +1611,8 @@ transform_to_exit_first_loop (struct loop *loop,
static basic_block
create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
- tree new_data, unsigned n_threads, location_t loc)
+ tree new_data, unsigned n_threads, location_t loc,
+ basic_block region_entry, bool oacc_kernels_p)
{
gimple_stmt_iterator gsi;
basic_block bb, paral_bb, for_bb, ex_bb;
@@ -1623,15 +1624,44 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
/* Prepare the GIMPLE_OMP_PARALLEL statement. */
bb = loop_preheader_edge (loop)->src;
paral_bb = single_pred (bb);
- gsi = gsi_last_bb (paral_bb);
+ if (!oacc_kernels_p)
+ gsi = gsi_last_bb (paral_bb);
+ else
+ /* Make sure the oacc parallel is inserted on top of the oacc kernels
+ region. */
+ gsi = gsi_last_bb (region_entry);
- t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
- OMP_CLAUSE_NUM_THREADS_EXPR (t)
- = build_int_cst (integer_type_node, n_threads);
- stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
- gimple_set_location (stmt, loc);
+ if (!oacc_kernels_p)
+ {
+ t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+ OMP_CLAUSE_NUM_THREADS_EXPR (t)
+ = build_int_cst (integer_type_node, n_threads);
+ stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+ gimple_set_location (stmt, loc);
- gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+ }
+ else
+ {
+ /* Create oacc parallel pragma based on oacc kernels pragma. */
+ gimple kernels = last_stmt (region_entry);
+ stmt = gimple_build_oacc_parallel (NULL,
+ gimple_oacc_kernels_clauses (kernels));
+ tree child_fn = gimple_oacc_kernels_child_fn (kernels);
+ gimple_oacc_parallel_set_child_fn (stmt, child_fn);
+ tree data_arg = gimple_oacc_kernels_data_arg (kernels);
+ gimple_oacc_parallel_set_data_arg (stmt, data_arg);
+
+ gimple_set_location (stmt, loc);
+
+ /* Insert oacc parallel pragma after the oacc kernels pragma. */
+ {
+ gimple_stmt_iterator gsi2;
+ gsi2 = gsi;
+ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+ gsi_remove (&gsi2, true);
+ }
+ }
/* Initialize NEW_DATA. */
if (data)
@@ -1647,12 +1677,18 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
}
- /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
- bb = split_loop_exit_edge (single_dom_exit (loop));
- gsi = gsi_last_bb (bb);
- stmt = gimple_build_omp_return (false);
- gimple_set_location (stmt, loc);
- gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+ /* Skip insertion of OMP_RETURN for oacc_kernels_p. We've already generated
+ one when lowering the oacc kernels directive in
+ pass_lower_omp/lower_omp (). */
+ if (!oacc_kernels_p)
+ {
+ /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
+ bb = split_loop_exit_edge (single_dom_exit (loop));
+ gsi = gsi_last_bb (bb);
+ stmt = gimple_build_omp_return (false);
+ gimple_set_location (stmt, loc);
+ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+ }
/* Extract data for GIMPLE_OMP_FOR. */
gcc_assert (loop->header == single_dom_exit (loop)->src);
@@ -1705,7 +1741,11 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
- for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
+ for_stmt = gimple_build_omp_for (NULL,
+ (oacc_kernels_p
+ ? GF_OMP_FOR_KIND_OACC_LOOP
+ : GF_OMP_FOR_KIND_FOR),
+ NULL_TREE, 1, NULL);
gimple_set_location (for_stmt, loc);
gimple_omp_for_set_index (for_stmt, 0, initvar);
gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
@@ -1736,7 +1776,7 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
free_dominance_info (CDI_DOMINATORS);
calculate_dominance_info (CDI_DOMINATORS);
- return paral_bb;
+ return oacc_kernels_p ? region_entry : paral_bb;
}
/* Generates code to execute the iterations of LOOP in N_THREADS
@@ -1748,11 +1788,13 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
static void
gen_parallel_loop (struct loop *loop,
reduction_info_table_type *reduction_list,
- unsigned n_threads, struct tree_niter_desc *niter)
+ unsigned n_threads, struct tree_niter_desc *niter,
+ basic_block region_entry, bool oacc_kernels_p)
{
tree many_iterations_cond, type, nit;
tree arg_struct, new_arg_struct;
gimple_seq stmts;
+ basic_block parallel_head;
edge entry, exit;
struct clsn_data clsn_data;
unsigned prob;
@@ -1829,40 +1871,43 @@ gen_parallel_loop (struct loop *loop,
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- if (loop->inner)
- m_p_thread=2;
- else
- m_p_thread=MIN_PER_THREAD;
-
- many_iterations_cond =
- fold_build2 (GE_EXPR, boolean_type_node,
- nit, build_int_cst (type, m_p_thread * n_threads));
-
- many_iterations_cond
- = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
- invert_truthvalue (unshare_expr (niter->may_be_zero)),
- many_iterations_cond);
- many_iterations_cond
- = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
- if (stmts)
- gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- if (!is_gimple_condexpr (many_iterations_cond))
+ if (!oacc_kernels_p)
{
+ if (loop->inner)
+ m_p_thread=2;
+ else
+ m_p_thread=MIN_PER_THREAD;
+
+ many_iterations_cond =
+ fold_build2 (GE_EXPR, boolean_type_node,
+ nit, build_int_cst (type, m_p_thread * n_threads));
+
+ many_iterations_cond
+ = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+ invert_truthvalue (unshare_expr (niter->may_be_zero)),
+ many_iterations_cond);
many_iterations_cond
- = force_gimple_operand (many_iterations_cond, &stmts,
- true, NULL_TREE);
+ = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- }
+ if (!is_gimple_condexpr (many_iterations_cond))
+ {
+ many_iterations_cond
+ = force_gimple_operand (many_iterations_cond, &stmts,
+ true, NULL_TREE);
+ if (stmts)
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+ }
- initialize_original_copy_tables ();
+ initialize_original_copy_tables ();
- /* We assume that the loop usually iterates a lot. */
- prob = 4 * REG_BR_PROB_BASE / 5;
- loop_version (loop, many_iterations_cond, NULL,
- prob, prob, REG_BR_PROB_BASE - prob, true);
- update_ssa (TODO_update_ssa);
- free_original_copy_tables ();
+ /* We assume that the loop usually iterates a lot. */
+ prob = 4 * REG_BR_PROB_BASE / 5;
+ loop_version (loop, many_iterations_cond, NULL,
+ prob, prob, REG_BR_PROB_BASE - prob, true);
+ update_ssa (TODO_update_ssa);
+ free_original_copy_tables ();
+ }
/* Base all the induction variables in LOOP on a single control one. */
canonicalize_loop_ivs (loop, &nit, true);
@@ -1879,19 +1924,31 @@ gen_parallel_loop (struct loop *loop,
entry = loop_preheader_edge (loop);
exit = single_dom_exit (loop);
- eliminate_local_variables (entry, exit);
- /* In the old loop, move all variables non-local to the loop to a structure
- and back, and create separate decls for the variables used in loop. */
- separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
- &new_arg_struct, &clsn_data);
+ /* This rewrites the body in terms of new variables. This has already
+ been done for oacc_kernels_p in pass_lower_omp/lower_omp (). */
+ if (!oacc_kernels_p)
+ {
+ eliminate_local_variables (entry, exit);
+ /* In the old loop, move all variables non-local to the loop to a
+ structure and back, and create separate decls for the variables used in
+ loop. */
+ separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
+ &new_arg_struct, &clsn_data);
+ }
+ else
+ {
+ arg_struct = NULL_TREE;
+ new_arg_struct = NULL_TREE;
+ }
/* Create the parallel constructs. */
loc = UNKNOWN_LOCATION;
cond_stmt = last_stmt (loop->header);
if (cond_stmt)
loc = gimple_location (cond_stmt);
- create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
- new_arg_struct, n_threads, loc);
+ parallel_head = create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
+ new_arg_struct, n_threads, loc,
+ region_entry, oacc_kernels_p);
if (reduction_list->elements () > 0)
create_call_for_reduction (loop, reduction_list, &clsn_data);
@@ -1905,6 +1962,16 @@ gen_parallel_loop (struct loop *loop,
removed statements. */
FOR_EACH_LOOP (loop, 0)
free_numbers_of_iterations_estimates_loop (loop);
+
+ if (oacc_kernels_p)
+ {
+ /* Expand the parallel constructs. We do it directly here instead of
+ running a separate expand_omp pass, since it is more efficient, and
+ less likely to cause troubles with further analyses not being able to
+ deal with the OMP trees. */
+
+ omp_expand_local (parallel_head);
+ }
}
/* Returns true when LOOP contains vector phi nodes. */
@@ -2131,7 +2198,7 @@ try_create_reduction_list (loop_p loop,
otherwise. */
bool
-parallelize_loops (void)
+parallelize_loops (bool oacc_kernels_p)
{
unsigned n_threads = flag_tree_parallelize_loops;
bool changed = false;
@@ -2140,6 +2207,7 @@ parallelize_loops (void)
struct obstack parloop_obstack;
HOST_WIDE_INT estimated;
source_location loop_loc;
+ basic_block region_entry, region_exit;
/* Do not parallelize loops in the functions created by parallelization. */
if (parallelized_function_p (cfun->decl))
@@ -2151,9 +2219,25 @@ parallelize_loops (void)
reduction_info_table_type reduction_list (10);
init_stmt_vec_info_vec ();
+ calculate_dominance_info (CDI_DOMINATORS);
+
FOR_EACH_LOOP (loop, 0)
{
reduction_list.empty ();
+
+ if (oacc_kernels_p)
+ {
+ if (!loop_in_oacc_kernels_region_p (loop, ®ion_entry, ®ion_exit))
+ continue;
+ else
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ "Trying loop %d with header bb %d in oacc kernels region\n",
+ loop->num, loop->header->index);
+ }
+ }
+
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
@@ -2223,8 +2307,9 @@ parallelize_loops (void)
fprintf (dump_file, "\nloop at %s:%d: ",
LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
}
+
gen_parallel_loop (loop, &reduction_list,
- n_threads, &niter_desc);
+ n_threads, &niter_desc, region_entry, oacc_kernels_p);
}
free_stmt_vec_info_vec ();
@@ -2275,7 +2360,7 @@ pass_parallelize_loops::execute (function *fun)
if (number_of_loops (fun) <= 1)
return 0;
- if (parallelize_loops ())
+ if (parallelize_loops (false))
{
fun->curr_properties &= ~(PROP_gimple_eomp);
return TODO_update_ssa;
@@ -2293,4 +2378,51 @@ make_pass_parallelize_loops (gcc::context *ctxt)
}
+namespace {
+
+const pass_data pass_data_parallelize_loops_oacc_kernels =
+{
+ GIMPLE_PASS, /* type */
+ "parloops_oacc_kernels", /* name */
+ OPTGROUP_LOOP, /* optinfo_flags */
+ TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
+ ( PROP_cfg | PROP_ssa ), /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass
+{
+public:
+ pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *) { return flag_tree_parallelize_loops > 1; }
+ virtual unsigned int execute (function *);
+
+}; // class pass_parallelize_loops_oacc_kernels
+
+unsigned
+pass_parallelize_loops_oacc_kernels::execute (function *fun)
+{
+ if (number_of_loops (fun) <= 1)
+ return 0;
+
+ if (parallelize_loops (true))
+ return TODO_cleanup_cfg | TODO_rebuild_alias;
+ return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+{
+ return new pass_parallelize_loops_oacc_kernels (ctxt);
+}
+
#include "gt-tree-parloops.h"
@@ -370,6 +370,8 @@ extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
+extern gimple_opt_pass *
+ make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
new file mode 100644
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2 -std=c99" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+ unsigned int i;
+
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = malloc (N * sizeof (unsigned int));
+ b = malloc (N * sizeof (unsigned int));
+ c = malloc (N * sizeof (unsigned int));
+
+
+#pragma acc kernels copyout (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+#pragma acc kernels copyout (b[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ {
+ unsigned int sum = 0;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ sum += c[i];
+
+ printf ("sum: %u\n", sum);
+
+ if (sum != N_REF)
+ abort ();
+ }
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2 -std=c99" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+ unsigned int i;
+
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = malloc (N * sizeof (unsigned int));
+ b = malloc (N * sizeof (unsigned int));
+ c = malloc (N * sizeof (unsigned int));
+
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ {
+ unsigned int sum = 0;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ sum += c[i];
+
+ printf ("sum: %u\n", sum);
+
+ if (sum != N_REF)
+ abort ();
+ }
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
--
1.9.1