@@ -229,6 +229,11 @@ struct gimplify_omp_ctx
int defaultmap[4];
};
+struct privatize_reduction
+{
+ tree ref_var, local_var;
+};
+
static struct gimplify_ctx *gimplify_ctxp;
static struct gimplify_omp_ctx *gimplify_omp_ctxp;
static bool in_omp_construct;
@@ -10811,6 +10816,95 @@ find_combined_omp_for (tree *tp, int *walk_subtrees, void *data)
return NULL_TREE;
}
+/* Helper function for localize_reductions. Replace all uses of REF_VAR with
+ LOCAL_VAR. */
+
+static tree
+localize_reductions_r (tree *tp, int *walk_subtrees, void *data)
+{
+ enum tree_code tc = TREE_CODE (*tp);
+ struct privatize_reduction *pr = (struct privatize_reduction *) data;
+
+ if (TYPE_P (*tp))
+ *walk_subtrees = 0;
+
+ switch (tc)
+ {
+ case INDIRECT_REF:
+ case MEM_REF:
+ if (TREE_OPERAND (*tp, 0) == pr->ref_var)
+ *tp = pr->local_var;
+
+ *walk_subtrees = 0;
+ break;
+
+ case VAR_DECL:
+ case PARM_DECL:
+ case RESULT_DECL:
+ if (*tp == pr->ref_var)
+ *tp = pr->local_var;
+
+ *walk_subtrees = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ return NULL_TREE;
+}
+
+/* OpenACC worker and vector loop state propagation requires reductions
+ to be inside local variables. This function replaces all reference-type
+ reductions variables associated with the loop with a local copy. It is
+ also used to create private copies of reduction variables for those
+ which are not associated with acc loops. */
+
+static void
+localize_reductions (tree clauses, tree body)
+{
+ tree c, var, type, new_var;
+ struct privatize_reduction pr;
+
+ for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
+ {
+ var = OMP_CLAUSE_DECL (c);
+
+ if (!lang_hooks.decls.omp_privatize_by_reference (var))
+ {
+ OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c) = NULL;
+ continue;
+ }
+
+ type = TREE_TYPE (TREE_TYPE (var));
+ new_var = create_tmp_var (type, IDENTIFIER_POINTER (DECL_NAME (var)));
+
+ pr.ref_var = var;
+ pr.local_var = new_var;
+
+ walk_tree (&body, localize_reductions_r, &pr, NULL);
+
+ OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c) = new_var;
+ }
+ else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_PRIVATE)
+ {
+ var = OMP_CLAUSE_DECL (c);
+
+ if (!lang_hooks.decls.omp_privatize_by_reference (var))
+ continue;
+
+ type = TREE_TYPE (TREE_TYPE (var));
+ new_var = create_tmp_var (type, IDENTIFIER_POINTER (DECL_NAME (var)));
+
+ pr.ref_var = var;
+ pr.local_var = new_var;
+
+ walk_tree (&body, localize_reductions_r, &pr, NULL);
+ }
+}
+
+
/* Gimplify the gross structure of an OMP_FOR statement. */
static enum gimplify_status
@@ -11017,6 +11111,23 @@ gimplify_omp_for (tree *expr_p, gimple_seq *pre_p)
gcc_unreachable ();
}
+ if (ort == ORT_ACC)
+ {
+ gimplify_omp_ctx *outer = gimplify_omp_ctxp;
+
+ while (outer
+ && outer->region_type != ORT_ACC_PARALLEL
+ && outer->region_type != ORT_ACC_KERNELS)
+ outer = outer->outer_context;
+
+ /* FIXME: Reductions only work in parallel regions at present. We avoid
+ doing the reduction localization transformation in kernels regions
+ here, because the code to remove reductions in kernels regions cannot
+ handle that. */
+ if (outer && outer->region_type == ORT_ACC_PARALLEL)
+ localize_reductions (OMP_FOR_CLAUSES (*expr_p), OMP_FOR_BODY (*expr_p));
+ }
+
/* Set OMP_CLAUSE_LINEAR_NO_COPYIN flag on explicit linear
clause for the IV. */
if (ort == ORT_SIMD && TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)) == 1)
@@ -12567,6 +12678,11 @@ gimplify_omp_workshare (tree *expr_p, gimple_seq *pre_p)
|| (ort & ORT_HOST_TEAMS) == ORT_HOST_TEAMS)
{
push_gimplify_context ();
+
+ /* FIXME: Reductions are not supported in kernels regions yet. */
+ if (/*ort == ORT_ACC_KERNELS ||*/ ort == ORT_ACC_PARALLEL)
+ localize_reductions (OMP_CLAUSES (expr), OMP_BODY (expr));
+
gimple *g = gimplify_and_return_first (OMP_BODY (expr), &body);
if (gimple_code (g) == GIMPLE_BIND)
pop_gimplify_context (g);
@@ -6689,9 +6689,9 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
{
tree orig = OMP_CLAUSE_DECL (c);
- tree var = maybe_lookup_decl (orig, ctx);
+ tree var;
tree ref_to_res = NULL_TREE;
- tree incoming, outgoing, v1, v2, v3;
+ tree incoming, outgoing;
bool is_private = false;
enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c);
@@ -6703,6 +6703,9 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
rcode = BIT_IOR_EXPR;
tree op = build_int_cst (unsigned_type_node, rcode);
+ var = OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c);
+ if (!var)
+ var = maybe_lookup_decl (orig, ctx);
if (!var)
var = orig;
@@ -6792,36 +6795,13 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
if (!ref_to_res)
ref_to_res = integer_zero_node;
- if (omp_is_reference (orig))
+ if (omp_is_reference (outgoing))
{
- tree type = TREE_TYPE (var);
- const char *id = IDENTIFIER_POINTER (DECL_NAME (var));
-
- if (!inner)
- {
- tree x = create_tmp_var (TREE_TYPE (type), id);
- gimplify_assign (var, build_fold_addr_expr (x), fork_seq);
- }
-
- v1 = create_tmp_var (type, id);
- v2 = create_tmp_var (type, id);
- v3 = create_tmp_var (type, id);
-
- gimplify_assign (v1, var, fork_seq);
- gimplify_assign (v2, var, fork_seq);
- gimplify_assign (v3, var, fork_seq);
-
- var = build_simple_mem_ref (var);
- v1 = build_simple_mem_ref (v1);
- v2 = build_simple_mem_ref (v2);
- v3 = build_simple_mem_ref (v3);
outgoing = build_simple_mem_ref (outgoing);
if (!TREE_CONSTANT (incoming))
incoming = build_simple_mem_ref (incoming);
}
- else
- v1 = v2 = v3 = var;
/* Determine position in reduction buffer, which may be used
by target. The parser has ensured that this is not a
@@ -6854,20 +6834,21 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
= build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
TREE_TYPE (var), 6, init_code,
unshare_expr (ref_to_res),
- v1, level, op, off);
+ var, level, op, off);
tree fini_call
= build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
TREE_TYPE (var), 6, fini_code,
unshare_expr (ref_to_res),
- v2, level, op, off);
+ var, level, op, off);
tree teardown_call
= build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
- TREE_TYPE (var), 6, teardown_code,
- ref_to_res, v3, level, op, off);
+ TREE_TYPE (var), 6,
+ teardown_code, ref_to_res, var,
+ level, op, off);
- gimplify_assign (v1, setup_call, &before_fork);
- gimplify_assign (v2, init_call, &after_fork);
- gimplify_assign (v3, fini_call, &before_join);
+ gimplify_assign (var, setup_call, &before_fork);
+ gimplify_assign (var, init_call, &after_fork);
+ gimplify_assign (var, fini_call, &before_join);
gimplify_assign (outgoing, teardown_call, &after_join);
}
@@ -255,7 +255,9 @@ enum omp_clause_code {
placeholder used in OMP_CLAUSE_REDUCTION_{INIT,MERGE}.
Operand 4: OMP_CLAUSE_REDUCTION_DECL_PLACEHOLDER: Another dummy
VAR_DECL placeholder, used like the above for C/C++ array
- reductions. */
+ reductions.
+ Operand 5: OMP_CLAUSE_REDUCTION_PRIVATE_DECL: A private VAR_DECL of
+ the original DECL associated with the reduction clause. */
OMP_CLAUSE_REDUCTION,
/* OpenMP clause: task_reduction (operator:variable_list). */
@@ -284,7 +284,7 @@ unsigned const char omp_clause_num_ops[] =
1, /* OMP_CLAUSE_SHARED */
1, /* OMP_CLAUSE_FIRSTPRIVATE */
2, /* OMP_CLAUSE_LASTPRIVATE */
- 5, /* OMP_CLAUSE_REDUCTION */
+ 6, /* OMP_CLAUSE_REDUCTION */
5, /* OMP_CLAUSE_TASK_REDUCTION */
5, /* OMP_CLAUSE_IN_REDUCTION */
1, /* OMP_CLAUSE_COPYIN */
@@ -12170,11 +12170,16 @@ walk_tree_1 (tree *tp, walk_tree_fn func, void *data,
WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
case OMP_CLAUSE_REDUCTION:
+ {
+ for (int i = 0; i < 6; i++)
+ WALK_SUBTREE (OMP_CLAUSE_OPERAND (*tp, i));
+ WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
+ }
+
case OMP_CLAUSE_TASK_REDUCTION:
case OMP_CLAUSE_IN_REDUCTION:
{
- int i;
- for (i = 0; i < 5; i++)
+ for (int i = 0; i < 5; i++)
WALK_SUBTREE (OMP_CLAUSE_OPERAND (*tp, i));
WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
}
@@ -1662,6 +1662,8 @@ class auto_suppress_location_wrappers
#define OMP_CLAUSE_REDUCTION_DECL_PLACEHOLDER(NODE) \
OMP_CLAUSE_OPERAND (OMP_CLAUSE_RANGE_CHECK (NODE, OMP_CLAUSE_REDUCTION, \
OMP_CLAUSE_IN_REDUCTION), 4)
+#define OMP_CLAUSE_REDUCTION_PRIVATE_DECL(NODE) \
+ OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_REDUCTION), 5)
/* True if a REDUCTION clause may reference the original list item (omp_orig)
in its OMP_CLAUSE_REDUCTION_{,GIMPLE_}INIT. */
new file mode 100644
@@ -0,0 +1,64 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+void workers (void)
+{
+ double res[65536];
+ int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(64)
+ {
+ int i, j;
+#pragma acc loop gang
+ for (i = 0; i < 256; i++)
+ {
+#pragma acc loop worker
+ for (j = 0; j < 256; j++)
+ {
+ int tmpvar;
+ int &tmpref = tmpvar;
+ tmpref = (i * 256 + j) * 99;
+ res[i * 256 + j] = tmpref;
+ }
+ }
+ }
+
+ for (i = 0; i < 65536; i++)
+ if (res[i] != i * 99)
+ abort ();
+}
+
+void vectors (void)
+{
+ double res[65536];
+ int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(64)
+ {
+ int i, j;
+#pragma acc loop gang worker
+ for (i = 0; i < 256; i++)
+ {
+#pragma acc loop vector
+ for (j = 0; j < 256; j++)
+ {
+ int tmpvar;
+ int &tmpref = tmpvar;
+ tmpref = (i * 256 + j) * 101;
+ res[i * 256 + j] = tmpref;
+ }
+ }
+ }
+
+ for (i = 0; i < 65536; i++)
+ if (res[i] != i * 101)
+ abort ();
+}
+
+int main (int argc, char *argv[])
+{
+ workers ();
+ vectors ();
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,64 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+void workers (void)
+{
+ double res[65536];
+ int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(64)
+ {
+ int i, j;
+ int tmpvar;
+ int &tmpref = tmpvar;
+#pragma acc loop gang
+ for (i = 0; i < 256; i++)
+ {
+#pragma acc loop worker private(tmpref)
+ for (j = 0; j < 256; j++)
+ {
+ tmpref = (i * 256 + j) * 99;
+ res[i * 256 + j] = tmpref;
+ }
+ }
+ }
+
+ for (i = 0; i < 65536; i++)
+ if (res[i] != i * 99)
+ abort ();
+}
+
+void vectors (void)
+{
+ double res[65536];
+ int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(64)
+ {
+ int i, j;
+ int tmpvar;
+ int &tmpref = tmpvar;
+#pragma acc loop gang worker
+ for (i = 0; i < 256; i++)
+ {
+#pragma acc loop vector private(tmpref)
+ for (j = 0; j < 256; j++)
+ {
+ tmpref = (i * 256 + j) * 101;
+ res[i * 256 + j] = tmpref;
+ }
+ }
+ }
+
+ for (i = 0; i < 65536; i++)
+ if (res[i] != i * 101)
+ abort ();
+}
+
+int main (int argc, char *argv[])
+{
+ workers ();
+ vectors ();
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,71 @@
+! { dg-do run }
+
+program main
+ implicit none
+ integer :: myint
+ integer :: i
+ real :: res(65536), tmp
+
+ res(:) = 0.0
+
+ myint = 5
+ call workers(myint, res)
+
+ do i=1,65536
+ tmp = i * 99
+ if (res(i) .ne. tmp) stop 1
+ end do
+
+ res(:) = 0.0
+
+ myint = 7
+ call vectors(myint, res)
+
+ do i=1,65536
+ tmp = i * 101
+ if (res(i) .ne. tmp) stop 2
+ end do
+
+contains
+
+ subroutine workers(t1, res)
+ implicit none
+ integer :: t1
+ integer :: i, j
+ real, intent(out) :: res(:)
+
+ !$acc parallel copyout(res) num_gangs(64) num_workers(64)
+
+ !$acc loop gang
+ do i=0,255
+ !$acc loop worker private(t1)
+ do j=1,256
+ t1 = (i * 256 + j) * 99
+ res(i * 256 + j) = t1
+ end do
+ end do
+
+ !$acc end parallel
+ end subroutine workers
+
+ subroutine vectors(t1, res)
+ implicit none
+ integer :: t1
+ integer :: i, j
+ real, intent(out) :: res(:)
+
+ !$acc parallel copyout(res) num_gangs(64) num_workers(64)
+
+ !$acc loop gang worker
+ do i=0,255
+ !$acc loop vector private(t1)
+ do j=1,256
+ t1 = (i * 256 + j) * 101
+ res(i * 256 + j) = t1
+ end do
+ end do
+
+ !$acc end parallel
+ end subroutine vectors
+
+end program main