@@ -38,7 +38,7 @@ extern rtx gcn_full_exec ();
extern rtx gcn_full_exec_reg ();
extern rtx gcn_gen_undef (machine_mode);
extern bool gcn_global_address_p (rtx);
-extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
+extern tree gcn_goacc_create_propagation_record (tree record_type, bool sender,
const char *name);
extern tree gcn_goacc_adjust_private_decl (tree var, int level);
extern void gcn_goacc_reduction (gcall *call);
@@ -548,12 +548,12 @@ gcn_goacc_reduction (gcall *call)
}
}
-/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD.
+/* Implement TARGET_GOACC_CREATE_PROPAGATION_RECORD.
- Tweak (worker) propagation record, e.g. to put it in shared memory. */
+ Create (worker) propagation record in shared memory. */
tree
-gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
+gcn_goacc_create_propagation_record (tree record_type, bool sender,
const char *name)
{
tree type = record_type;
@@ -3588,8 +3588,6 @@ gcn_init_builtins (void)
TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
}
-/* FIXME: remove the ifdef once OpenACC support is merged upstream. */
-#ifdef BUILT_IN_GOACC_SINGLE_START
/* These builtins need to take/return an LDS pointer: override the generic
versions here. */
@@ -3606,7 +3604,6 @@ gcn_init_builtins (void)
set_builtin_decl (BUILT_IN_GOACC_BARRIER,
gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
-#endif
}
/* Expand the CMP_SWAP GCN builtins. We have our own versions that do
@@ -4865,11 +4862,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
unsigned /*used*/)
{
bool changed = false;
-
- /* FIXME: remove -facc-experimental-workers when they're ready. */
- int max_workers = flag_worker_partitioning ? 16 : 1;
-
- gcc_assert (!flag_worker_partitioning);
+ const int max_workers = 16;
/* The vector size must appear to be 64, to the user, unless this is a
SEQ routine. The real, internal value is always 1, which means use
@@ -4906,8 +4899,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
{
dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
if (dims[GOMP_DIM_WORKER] < 0)
- dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
- ? GCN_DEFAULT_WORKERS : 1);
+ dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
if (dims[GOMP_DIM_GANG] < 0)
dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
changed = true;
@@ -4972,8 +4964,7 @@ static bool
gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
bool ARG_UNUSED (is_fork))
{
- /* GCN does not use the fork/join concept invented for NVPTX.
- Instead we use standard autovectorization. */
+ /* GCN does not need to expand fork/join markers at the RTL level. */
return false;
}
@@ -6314,9 +6305,9 @@ gcn_dwarf_register_span (rtx rtl)
#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
-#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD
-#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
- gcn_goacc_adjust_propagation_record
+#undef TARGET_GOACC_CREATE_PROPAGATION_RECORD
+#define TARGET_GOACC_CREATE_PROPAGATION_RECORD \
+ gcn_goacc_create_propagation_record
#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
#undef TARGET_GOACC_FORK_JOIN
@@ -6325,6 +6316,8 @@ gcn_dwarf_register_span (rtx rtl)
#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
+#undef TARGET_GOACC_WORKER_PARTITIONING
+#define TARGET_GOACC_WORKER_PARTITIONING true
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
#undef TARGET_HARD_REGNO_NREGS
@@ -62,11 +62,6 @@ bool flag_bypass_init_error = false
mbypass-init-error
Target RejectNegative Var(flag_bypass_init_error)
-bool flag_worker_partitioning = false
-
-macc-experimental-workers
-Target Var(flag_worker_partitioning) Init(0)
-
int stack_size_opt = -1
mstack-size=
@@ -3041,10 +3041,8 @@ gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
problem size, so let's do a reasonable number of single-worker gangs.
64 gangs matches a typical Fiji device. */
- /* NOTE: Until support for middle-end worker partitioning is merged, use 1
- for the default number of workers. */
if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
- if (dims[1] == 0) dims[1] = 1; /* Workers. */
+ if (dims[1] == 0) dims[1] = 16; /* Workers. */
/* The incoming dimensions are expressed in terms of gangs, workers, and
vectors. The HSA dimensions are expressed in terms of "work-items",
@@ -79,13 +79,18 @@ int check (const int *ary, int size, int gp, int wp, int vp)
exit = 1;
}
+#ifndef ACC_DEVICE_TYPE_radeon
+ /* AMD GCN uses the autovectorizer for the vector dimension: the use
+ of a function call in vector-partitioned code in this test is not
+ currently supported. */
for (ix = 0; ix < vp; ix++)
if (vectors[ix] != vectors[0])
{
printf ("vector %d not used %d times\n", ix, vectors[0]);
exit = 1;
}
-
+#endif
+
return exit;
}
@@ -132,9 +137,7 @@ int main ()
/* AMD GCN uses the autovectorizer for the vector dimension: the use
of a function call in vector-partitioned code in this test is not
currently supported. */
- /* AMD GCN does not currently support multiple workers. This should be
- set to 16 when that changes. */
- return test_1 (16, 1, 1);
+ return test_1 (16, 16, 64);
#else
return test_1 (16, 16, 32);
#endif
@@ -288,9 +288,8 @@ int main ()
}
else if (acc_on_device (acc_device_radeon))
{
- /* The GCC GCN back end is limited to num_workers (16).
- Temporarily set this to 1 until multiple workers are permitted. */
- workers_actual = 1; // 16;
+ /* The GCC GCN back end is limited to num_workers (16). */
+ workers_actual = 16;
}
else
__builtin_abort ();
@@ -491,8 +490,6 @@ int main ()
}
else if (acc_on_device (acc_device_radeon))
{
- /* Temporary setting, until multiple workers are permitted. */
- workers_actual = 1;
/* See above comments about GCN vectors_actual. */
vectors_actual = 1;
}
@@ -618,9 +615,9 @@ int main ()
gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc serial copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
copy (gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max)
-/* { dg-warning "not gang partitioned" "" { target *-*-* } 619 } */
-/* { dg-warning "not worker partitioned" "" { target *-*-* } 619 } */
-/* { dg-warning "not vector partitioned" "" { target *-*-* } 619 } */
+/* { dg-warning "not gang partitioned" "" { target *-*-* } 616 } */
+/* { dg-warning "not worker partitioned" "" { target *-*-* } 616 } */
+/* { dg-warning "not vector partitioned" "" { target *-*-* } 616 } */
{
if (acc_on_device (acc_device_nvidia))
{
@@ -16,7 +16,8 @@
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_radeon))
return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
else
__builtin_abort ();
@@ -27,7 +28,8 @@
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_radeon))
return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
else
__builtin_abort ();
@@ -38,7 +40,8 @@
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_radeon))
return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
else
__builtin_abort ();