@@ -13,11 +13,12 @@ search_path = $(addprefix $(top_srcdir)/config/, $(config_path)) $(top_srcdir) \
fincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)$(MULTISUBDIR)/finclude
libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
+libgomp_la_LIBADD =
+
LIBFFI = @LIBFFI@
LIBFFIINCS = @LIBFFIINCS@
if USE_LIBFFI
-libgomp_la_LIBADD = $(LIBFFI)
+libgomp_la_LIBADD += $(LIBFFI)
endif
vpath % $(strip $(search_path))
@@ -26,6 +27,13 @@ AM_CPPFLAGS = $(addprefix -I, $(search_path)) $(LIBFFIINCS)
AM_CFLAGS = $(XCFLAGS)
AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
+#TODO Probably have to specify dependency in the top-level build system, too.
+libgomp_la_LIBADD += ../libbacktrace/libbacktrace.la
+AM_CPPFLAGS += \
+ -I$(srcdir)/$(MULTISRCTOP)../libbacktrace \
+ -I$(MULTIBUILDTOP)../libbacktrace \
+ -I../libbacktrace
+
toolexeclib_LTLIBRARIES = libgomp.la
nodist_toolexeclib_HEADERS = libgomp.spec
@@ -70,7 +78,9 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \
proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
- oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c
+ oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
+ oacc-profiling.c oacc-profiling-acc_register_library.c \
+ oacc-profiling-locinfo.c
include $(top_srcdir)/plugin/Makefrag.am
@@ -79,7 +89,7 @@ libgomp_la_SOURCES += openacc.f90
endif
nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h openacc.h
+nodist_libsubinclude_HEADERS = acc_prof.h omp.h openacc.h
if USE_FORTRAN
nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
new file mode 100644
@@ -0,0 +1,235 @@
+/* OpenACC Runtime Library: Profiling Interface
+
+ Copyright (C) 2017 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _ACC_PROF_H
+#define _ACC_PROF_H 1
+
+/* The OpenACC standard doesn't say so explicitly, but as its Profiling
+ Interface makes use of, for example, <openacc.h>'s acc_device_t, we
+ supposedly are to #include that file here. */
+#include <openacc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.1. Events. */
+
+typedef enum acc_event_t
+{
+ acc_ev_none = 0,
+ acc_ev_device_init_start,
+ acc_ev_device_init_end,
+ acc_ev_device_shutdown_start,
+ acc_ev_device_shutdown_end,
+ acc_ev_runtime_shutdown,
+ acc_ev_create,
+ acc_ev_delete,
+ acc_ev_alloc,
+ acc_ev_free,
+ acc_ev_enter_data_start,
+ acc_ev_enter_data_end,
+ acc_ev_exit_data_start,
+ acc_ev_exit_data_end,
+ acc_ev_update_start,
+ acc_ev_update_end,
+ acc_ev_compute_construct_start,
+ acc_ev_compute_construct_end,
+ acc_ev_enqueue_launch_start,
+ acc_ev_enqueue_launch_end,
+ acc_ev_enqueue_upload_start,
+ acc_ev_enqueue_upload_end,
+ acc_ev_enqueue_download_start,
+ acc_ev_enqueue_download_end,
+ acc_ev_wait_start,
+ acc_ev_wait_end,
+ acc_ev_last
+} acc_event_t;
+
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.2. Callbacks Signature. */
+
+/* 'In all cases, a datatype of "size_t" means a 32-bit integer for a 32-bit
+ binary and a 64-bit integer for a 64-bit binary, and a datatype "int" means
+ a 32-bit integer for both 32-bit and 64-bit binaries'. */
+typedef long int _acc_prof_size_t;
+typedef int _acc_prof_int_t;
+
+/* Internal helpers: a struct's "valid_bytes" may be less than its "sizeof". */
+#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \
+ offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
+#if 0 /* Untested. */
+#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
+ ((_n - 1) * sizeof (_type) + (_valid_bytes_type))
+#endif
+#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
+ (sizeof (_basictype))
+
+typedef struct acc_prof_info
+{
+ acc_event_t event_type;
+ _acc_prof_int_t valid_bytes;
+ _acc_prof_int_t version;
+ acc_device_t device_type;
+ _acc_prof_int_t device_number;
+ _acc_prof_int_t thread_id;
+ _acc_prof_size_t async;
+ _acc_prof_size_t async_queue;
+ const char *src_file;
+ const char *func_name;
+ _acc_prof_int_t line_no, end_line_no;
+ _acc_prof_int_t func_line_no, func_end_line_no;
+#define _ACC_PROF_INFO_VALID_BYTES \
+ _ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
+} acc_prof_info;
+
+/* We implement the OpenACC 2.5 Profiling Interface. */
+#define _ACC_PROF_INFO_VERSION 201510
+
+typedef enum acc_construct_t
+{
+ acc_construct_parallel = 0,
+ acc_construct_kernels,
+ acc_construct_loop,
+ acc_construct_data,
+ acc_construct_enter_data,
+ acc_construct_exit_data,
+ acc_construct_host_data,
+ acc_construct_atomic,
+ acc_construct_declare,
+ acc_construct_init,
+ acc_construct_shutdown,
+ acc_construct_set,
+ acc_construct_update,
+ acc_construct_routine,
+ acc_construct_wait,
+ acc_construct_runtime_api
+} acc_construct_t;
+
+typedef struct acc_data_event_info
+{
+ acc_event_t event_type;
+ _acc_prof_int_t valid_bytes;
+ acc_construct_t parent_construct;
+ _acc_prof_int_t implicit;
+ void *tool_info;
+ const char *var_name;
+ _acc_prof_size_t bytes;
+ void *host_ptr;
+ void *device_ptr;
+#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
+ _ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_data_event_info;
+
+typedef struct acc_launch_event_info
+{
+ acc_event_t event_type;
+ _acc_prof_int_t valid_bytes;
+ acc_construct_t parent_construct;
+ _acc_prof_int_t implicit;
+ void *tool_info;
+ const char *kernel_name;
+ _acc_prof_size_t num_gangs, num_workers, vector_length;
+#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
+ _ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
+} acc_launch_event_info;
+
+typedef struct acc_other_event_info
+{
+ acc_event_t event_type;
+ _acc_prof_int_t valid_bytes;
+ acc_construct_t parent_construct;
+ _acc_prof_int_t implicit;
+ void *tool_info;
+#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
+ _ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_other_event_info;
+
+typedef union acc_event_info
+{
+ acc_event_t event_type;
+ acc_data_event_info data_event;
+ acc_launch_event_info launch_event;
+ acc_other_event_info other_event;
+} acc_event_info;
+
+typedef enum acc_device_api
+{
+ acc_device_api_none = 0,
+ acc_device_api_cuda,
+ acc_device_api_opencl,
+ acc_device_api_coi,
+ acc_device_api_other
+} acc_device_api;
+
+typedef struct acc_api_info
+{
+ acc_device_api device_api;
+ _acc_prof_int_t valid_bytes;
+ acc_device_t device_type;
+ _acc_prof_int_t vendor;
+ void *device_handle;
+ void *context_handle;
+ void *async_handle;
+#define _ACC_API_INFO_VALID_BYTES \
+ _ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_api_info;
+
+typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
+ acc_api_info *);
+
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.3. Loading the Library. */
+
+typedef enum acc_register_t
+{
+ acc_reg = 0,
+ acc_toggle = 1,
+ acc_toggle_per_thread = 2
+} acc_register_t;
+
+typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
+extern void acc_prof_register (acc_event_t, acc_prof_callback, acc_register_t) __GOACC_NOTHROW;
+extern void acc_prof_unregister (acc_event_t, acc_prof_callback, acc_register_t) __GOACC_NOTHROW;
+typedef void (*acc_query_fn) ();
+typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
+extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
+/* Don't tag "acc_register_library" as "__GOACC_NOTHROW": this function can be
+ overridden by the application, and must be expected to do "everything". */
+extern void acc_register_library (acc_prof_reg, acc_prof_reg, acc_prof_lookup_func);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ACC_PROF_H */
@@ -1338,5 +1338,7 @@ initialize_env (void)
parse_acc_device_type ();
goacc_runtime_initialize ();
+
+ goacc_profiling_initialize ();
}
#endif /* LIBGOMP_OFFLOADED_ONLY */
@@ -29,6 +29,7 @@
#include <stdlib.h>
#include "libgomp.h"
+#include "oacc-int.h"
#include "libgomp-plugin.h"
void *
@@ -78,3 +79,11 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
gomp_vfatal (msg, ap);
va_end (ap);
}
+
+void
+GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
+ acc_event_info *event_info,
+ acc_api_info *api_info)
+{
+ goacc_profiling_dispatch (prof_info, event_info, api_info);
+}
@@ -33,6 +33,8 @@
#include <stddef.h>
#include <stdint.h>
+#include "acc_prof.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -88,6 +90,10 @@ extern void GOMP_PLUGIN_error (const char *, ...)
extern void GOMP_PLUGIN_fatal (const char *, ...)
__attribute__ ((noreturn, format (printf, 1, 2)));
+extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
+ acc_event_info *,
+ acc_api_info *);
+
/* Prototypes for functions implemented by libgomp plugins. */
extern const char *GOMP_OFFLOAD_get_name (void);
extern unsigned int GOMP_OFFLOAD_get_caps (void);
@@ -424,6 +424,10 @@ OACC_2.5 {
acc_get_default_async_h_;
acc_memcpy_from_device_async;
acc_memcpy_to_device_async;
+ acc_prof_lookup;
+ acc_prof_register;
+ acc_prof_unregister;
+ acc_register_library;
acc_set_default_async;
acc_set_default_async_h_;
acc_update_device_async;
@@ -482,3 +486,9 @@ GOMP_PLUGIN_1.2 {
global:
GOMP_PLUGIN_acc_thread_default_async;
} GOMP_PLUGIN_1.1;
+
+GOMP_PLUGIN_1.3 {
+ global:
+ GOMP_PLUGIN_goacc_profiling_dispatch;
+ GOMP_PLUGIN_goacc_thread;
+} GOMP_PLUGIN_1.2;
@@ -111,6 +111,7 @@ changed to GNU Offloading and Multi Processing Runtime Library.
asynchronous operations.
* OpenACC Library Interoperability:: OpenACC library interoperability with the
NVIDIA CUBLAS library.
+* OpenACC Profiling Interface::
* The libgomp ABI:: Notes on the external ABI presented by libgomp.
* Reporting Bugs:: How to report bugs in the GNU Offloading and
Multi Processing Runtime Library.
@@ -2843,13 +2844,15 @@ A.2.1.4.
@node OpenACC Environment Variables
@chapter OpenACC Environment Variables
-The variables @env{ACC_DEVICE_TYPE} and @env{ACC_DEVICE_NUM}
+The variables @env{ACC_DEVICE_TYPE}, @env{ACC_DEVICE_NUM},
+and @code{ACC_PROFLIB}
are defined by section 4 of the OpenACC specification in version 2.5.
The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
@menu
* ACC_DEVICE_TYPE::
* ACC_DEVICE_NUM::
+* ACC_PROFLIB::
* GCC_ACC_NOTIFY::
@end menu
@@ -2875,6 +2878,19 @@ The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
+@node ACC_PROFLIB
+@section @code{ACC_PROFLIB}
+@table @asis
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.5}, section
+4.3.
+@end table
+
+
+
@node GCC_ACC_NOTIFY
@section @code{GCC_ACC_NOTIFY}
@table @asis
@@ -3090,6 +3106,295 @@ Application Programming Interface}, version 2.5.}
+@c ---------------------------------------------------------------------
+@c OpenACC Profiling Interface
+@c ---------------------------------------------------------------------
+
+@node OpenACC Profiling Interface
+@chapter OpenACC Profiling Interface
+
+@section Implementation Status and Implementation-Defined Behavior
+
+We're implementing most of the Profiling Interface as defined by
+the OpenACC 2.5 specification. The specification doesn't
+clearly define some aspects of its Profiling Interface, so we're
+clarifying these as @emph{implementation-defined behavior} here. We
+already have reported to the OpenACC Technical Committee some issues,
+and will report more, later on.
+
+This implementation of the OpenACC Profiling Interface is tuned to
+keep the performance impact as low as possible when it's not in use.
+This is relevant, as the Profiling Interface affects all the
+@emph{hot} code paths (in the target code, not in the offloaded code).
+Users of the OpenACC Profiling Interface can be expected to understand
+that performance will always be impacted to some degree: for example,
+because of the @emph{runtime} (libgomp) calling into a third-party
+@emph{library} for every event that has been registered.
+
+This implementation of the OpenACC Profiling Interface has not yet
+been validated for use in multi-threaded code. This is a more general
+issue; see CSTS-110 @cite{Make sure all OpenACC entry points in
+libgomp are thread-safe}.
+
+The @code{acc_prof_lookup} interface is not implemented, and
+@code{acc_register_library} will receive @code{NULL} for its
+@code{lookup} parameter.
+
+Remarks about data provided to callbacks:
+
+@table @asis
+
+@item @code{acc_prof_info.event_type}
+It is not clear if for @emph{nested} event callbacks (for example,
+@code{acc_ev_enqueue_launch_start} as part of a parent compute
+construct), this should be set for the nested event
+(@code{acc_ev_enqueue_launch_start}), or if the value of the parent
+construct should remain (@code{acc_ev_compute_construct_start}). In
+this implementation, the value will generally correspond to the
+innermost nested event type.
+
+@item @code{acc_prof_info.device_type}
+@itemize
+
+@item
+For @code{acc_ev_compute_construct_start}, and in presence of an
+@code{if} clause with @emph{false} argument, this will still refer to
+the offloading device type; unsure whether that's the expected
+behavior.
+
+@item
+Complementary to the item before, for
+@code{acc_ev_compute_construct_end}, this is set to
+@code{acc_device_host} in presence of an @code{if} clause with
+@emph{false} argument, unsure whether that's the expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.thread_id}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.async}
+@itemize
+
+@item
+Not yet implemented correctly for
+@code{acc_ev_compute_construct_start}.
+
+@item
+In a compute construct, for host-fallback
+execution/@code{acc_device_host} it will always be
+@code{acc_async_sync}; unsure if that is the expected behavior.
+
+@item
+For @code{acc_ev_device_init_start} and @code{acc_ev_device_init_end},
+it will always be @code{acc_async_sync}; unsure if that is the
+expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.async_queue}
+There is no @cite{limited number of asynchronous queues} in libgomp.
+We define this to always have the same value as
+@code{acc_prof_info.async}.
+
+@item @code{acc_prof_info.src_file}, @code{acc_prof_info.func_name}, @code{acc_prof_info.line_no}
+If libbacktrace is available and functional (that is, @code{-g} debug
+information is available), these will be set accordingly for a lot of
+event types. Otherwise, these will be set to @code{NULL}
+(@code{acc_prof_info.src_file}, @code{acc_prof_info.func_name}), or
+@code{-1} (@code{acc_prof_info.line_no}), respectively.
+
+@item @code{acc_prof_info.end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_event_info.event_type}, @code{acc_event_info.*.event_type}
+Relating to @code{acc_prof_info.event_type} discussed above, in this
+implementation, this will always be the same value as
+@code{acc_prof_info.event_type}.
+
+@item @code{acc_event_info.*.parent_construct}
+@itemize
+
+@item
+Will be @code{acc_construct_parallel} for OpenACC kernels constructs;
+should be @code{acc_construct_kernels}.
+
+@item
+Will be @code{acc_construct_enter_data} or
+@code{acc_construct_exit_data} when processing variable mappings
+specified in OpenACC declare directives; should be
+@code{acc_construct_declare}.
+
+@item
+For implicit @code{acc_ev_device_init_start},
+@code{acc_ev_device_init_end}, and explicit as well as implicit
+@code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start},
+@code{acc_ev_enqueue_download_end}, @code{acc_ev_wait_start}, and
+@code{acc_ev_wait_end}, will be
+@code{acc_construct_parallel}; should reflect the real parent
+construct.
+
+@end itemize
+
+@item @code{acc_event_info.*.implicit}
+For @code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start},
+@code{acc_ev_enqueue_download_end}, @code{acc_ev_wait_start}, and
+@code{acc_ev_wait_end}, this currently will be @code{1}
+also for explicit usage.
+
+@item @code{acc_event_info.data_event.var_name}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_event_info.data_event.host_ptr}
+For @code{acc_ev_alloc}, and @code{acc_ev_free}, this is always
+@code{NULL}.
+
+@item @code{typedef union acc_api_info}
+@dots{} as printed in @cite{5.2.3. Third Argument: API-Specific
+Information}, should obviously be @code{typedef @emph{struct}
+acc_api_info}.
+
+@item @code{acc_api_info.device_api}
+Possibly not yet implemented correctly for
+@code{acc_ev_compute_construct_start},
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}:
+will always be @code{acc_device_api_none} for these event types.
+For @code{acc_ev_enter_data_start}, it will be
+@code{acc_device_api_none} in some cases.
+
+@item @code{acc_api_info.device_type}
+Always the same as @code{acc_prof_info.device_type}.
+
+@item @code{acc_api_info.vendor}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_api_info.device_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.context_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.async_handle}
+Always @code{NULL}; not yet implemented.
+
+@end table
+
+Remarks about certain event types:
+
+@table @asis
+
+@item @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+@itemize
+
+@item
+@c See DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT in
+@c libgomp.oacc-c-c++-common/acc_prof-parallel-1.c.
+Whan a compute construct triggers implicit
+@code{acc_ev_device_init_start} and @code{acc_ev_device_init_end}
+events, they currently aren't @emph{nested within} the corresponding
+@code{acc_ev_compute_construct_start} and
+@code{acc_ev_compute_construct_end}, but they're currently observed
+@emph{before} @code{acc_ev_compute_construct_start}. It is not clear
+what to do: the standard asks us provide a lot of details to the
+@code{acc_ev_compute_construct_start} callback, without (implicitly)
+initializing a device before?
+
+@item
+Callbacks for these event types will not be invoked for calls to the
+@code{acc_set_device_type} and @code{acc_set_device_num} functions;
+it's not clear if they should be.
+
+@end itemize
+
+@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}, @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
+@itemize
+
+@item
+Callbacks for these event types will also be invoked for OpenACC
+host_data constructs; it's not clear if they should be.
+
+@item
+Callbacks for these event types will also be invoked when processing
+variable mappings specified in OpenACC declare directives; it's not
+clear if they should be.
+
+@end itemize
+
+@end table
+
+Callbacks for the following event types will be invoked, but dispatch
+and information provided therein has not yet been thoroughly reviewed:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
+@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}
+@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end}
+@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
+@end itemize
+
+During device initialization, and finalization, respectively,
+callbacks for the following event types will not yet be invoked:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@end itemize
+
+Callbacks for the following event types have not yet been implemented,
+so currently won't be invoked:
+
+@itemize
+@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end}
+@item @code{acc_ev_runtime_shutdown}
+@item @code{acc_ev_create}, @code{acc_ev_delete}
+@end itemize
+
+For the following runtime library functions, not all expected
+callbacks will be invoked (mostly concerning implicit device
+initialization):
+
+@itemize
+@item @code{acc_get_num_devices}
+@item @code{acc_set_device_type}
+@item @code{acc_get_device_type}
+@item @code{acc_set_device_num}
+@item @code{acc_get_device_num}
+@item @code{acc_init}
+@item @code{acc_shutdown}
+@end itemize
+
+Aside from implicit device initialization, for the following runtime
+library functions, no callbacks will be invoked for shared-memory
+offloading devices (it's not clear if they should be):
+
+@itemize
+@item @code{acc_malloc}
+@item @code{acc_free}
+@item @code{acc_copyin}, @code{acc_present_or_copyin}, @code{acc_copyin_async}
+@item @code{acc_create}, @code{acc_present_or_create}, @code{acc_create_async}
+@item @code{acc_copyout}, @code{acc_copyout_async}, @code{acc_copyout_finalize}, @code{acc_copyout_finalize_async}
+@item @code{acc_delete}, @code{acc_delete_async}, @code{acc_delete_finalize}, @code{acc_delete_finalize_async}
+@item @code{acc_update_device}, @code{acc_update_device_async}
+@item @code{acc_update_self}, @code{acc_update_self_async}
+@item @code{acc_map_data}, @code{acc_unmap_data}
+@item @code{acc_memcpy_to_device}, @code{acc_memcpy_to_device_async}
+@item @code{acc_memcpy_from_device}, @code{acc_memcpy_from_device_async}
+@end itemize
+
+
+
@c ---------------------------------------------------------------------
@c The libgomp ABI
@c ---------------------------------------------------------------------
@@ -117,9 +117,26 @@ acc_async_test (int async)
if (!thr || !thr->dev)
gomp_fatal ("no device active");
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
int res = thr->dev->openacc.async.test_func (aq);
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
return res;
}
@@ -130,6 +147,12 @@ acc_async_test_all (void)
if (!thr || !thr->dev)
gomp_fatal ("no device active");
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
int ret = 1;
gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
@@ -140,6 +163,11 @@ acc_async_test_all (void)
}
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
return ret;
}
@@ -151,11 +179,28 @@ acc_wait (int async)
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
thr->dev->openacc.async.synchronize_func (aq);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
/* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */
@@ -174,6 +219,17 @@ acc_wait_async (int async1, int async2)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async2;
+ prof_info.async_queue = prof_info.async;
+ }
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
@@ -186,6 +242,12 @@ acc_wait_async (int async1, int async2)
thr->dev->openacc.async.synchronize_func (aq1);
thr->dev->openacc.async.serialize_func (aq1, aq2);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -193,6 +255,12 @@ acc_wait_all (void)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
@@ -202,6 +270,12 @@ acc_wait_all (void)
for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
dev->openacc.async.synchronize_func (l->aq);
gomp_mutex_unlock (&dev->openacc.async.lock);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
/* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. */
@@ -223,6 +297,17 @@ acc_wait_all_async (int async)
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
@@ -236,11 +321,20 @@ acc_wait_all_async (int async)
thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
}
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
int
acc_get_default_async (void)
{
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
@@ -252,6 +346,9 @@ acc_get_default_async (void)
void
acc_set_default_async (int async)
{
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
@@ -36,10 +36,23 @@ acc_get_current_cuda_device (void)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
+ void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
- return thr->dev->openacc.cuda.get_current_device_func ();
+ ret = thr->dev->openacc.cuda.get_current_device_func ();
- return NULL;
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
+ return ret;
}
void *
@@ -47,10 +60,23 @@ acc_get_current_cuda_context (void)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
+ void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
- return thr->dev->openacc.cuda.get_current_context_func ();
-
- return NULL;
+ ret = thr->dev->openacc.cuda.get_current_context_func ();
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
+ return ret;
}
void *
@@ -61,6 +87,17 @@ acc_get_cuda_stream (int async)
if (async < 0)
return NULL;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
{
@@ -69,6 +106,12 @@ acc_get_cuda_stream (int async)
ret = thr->dev->openacc.cuda.get_stream_func (aq);
}
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
return ret;
}
@@ -80,10 +123,21 @@ acc_set_cuda_stream (int async, void *stream)
if (async < 0 || stream == NULL)
return 0;
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
int ret = -1;
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
{
@@ -93,5 +147,11 @@ acc_set_cuda_stream (int async, void *stream)
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
}
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
return ret;
}
@@ -230,13 +230,68 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
held before calling this function. */
static struct gomp_device_descr *
-acc_init_1 (acc_device_t d)
+acc_init_1 (struct goacc_thread *thr, acc_device_t d,
+ acc_construct_t parent_construct, int implicit,
+ int acc_prof_locinfo_skip)
{
gomp_mutex_lock (&acc_init_state_lock);
acc_init_state = initializing;
acc_init_thread = pthread_self ();
gomp_mutex_unlock (&acc_init_state_lock);
+ bool check_not_nested_p;
+ if (implicit)
+ {
+ /* In the implicit case, there should (must?) already be something
+ have been set up for an outer construct. */
+ check_not_nested_p = false;
+ }
+ else
+ {
+ check_not_nested_p = true;
+ }
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (check_not_nested_p),
+ false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_device_init_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = d;
+ prof_info.device_number = goacc_device_num;
+ prof_info.thread_id = -1;
+ prof_info.async = acc_async_sync;
+ prof_info.async_queue = prof_info.async;
+ goacc_profiling_locinfo_fill (thr, &prof_info, acc_prof_locinfo_skip + 1);
+ }
+ acc_event_info device_init_event_info;
+ if (profiling_dispatch_p)
+ {
+ device_init_event_info.other_event.event_type = prof_info.event_type;
+ device_init_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ device_init_event_info.other_event.parent_construct = parent_construct;
+ device_init_event_info.other_event.implicit = implicit;
+ device_init_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1;
+ api_info.device_handle = NULL;
+ api_info.context_handle = NULL;
+ api_info.async_handle = NULL;
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
+
struct gomp_device_descr *base_dev, *acc_dev;
int ndevs;
@@ -259,6 +314,14 @@ acc_init_1 (acc_device_t d)
gomp_init_device (acc_dev);
gomp_mutex_unlock (&acc_dev->lock);
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_device_init_end;
+ device_init_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &device_init_event_info,
+ &api_info);
+ }
+
gomp_mutex_lock (&acc_init_state_lock);
acc_init_state = initialized;
gomp_mutex_unlock (&acc_init_state_lock);
@@ -454,7 +517,12 @@ goacc_attach_host_thread_to_device (int ord)
thr->dev = acc_dev = &base_dev[ord];
thr->saved_bound_dev = NULL;
thr->mapped_data = NULL;
-
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ /* Initially, all callbacks for all events are enabled. */
+ thr->prof_callbacks_enabled = true;
+ thr->backtrace_state = NULL;
+
thr->target_tls
= acc_dev->openacc.create_thread_data_func (ord);
@@ -470,9 +538,7 @@ acc_init (acc_device_t d)
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
-
- cached_base_dev = acc_init_1 (d);
-
+ cached_base_dev = acc_init_1 (NULL, d, acc_construct_runtime_api, 0, 1);
gomp_mutex_unlock (&acc_device_lock);
goacc_attach_host_thread_to_device (-1);
@@ -531,6 +597,14 @@ acc_set_device_type (acc_device_t d)
struct gomp_device_descr *base_dev, *acc_dev;
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ prof_info.device_type = d;
+
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
@@ -555,6 +629,12 @@ acc_set_device_type (acc_device_t d)
}
goacc_attach_host_thread_to_device (-1);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
ialias (acc_set_device_type)
@@ -590,12 +670,24 @@ acc_get_device_type (void)
;
else
{
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
dev = resolve_device (acc_device_default, true);
gomp_mutex_unlock (&acc_device_lock);
res = acc_device_type (dev->type);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
assert (res != acc_device_default
@@ -612,6 +704,14 @@ acc_get_device_num (acc_device_t d)
const struct gomp_device_descr *dev;
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ prof_info.device_type = d;
+
if (d >= _ACC_device_hwm)
gomp_fatal ("unknown device type %u", (unsigned) d);
@@ -621,6 +721,12 @@ acc_get_device_num (acc_device_t d)
dev = resolve_device (d, true);
gomp_mutex_unlock (&acc_device_lock);
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
if (thr && thr->base_dev == dev && thr->dev)
return thr->dev->target_id;
@@ -735,15 +841,20 @@ goacc_restore_bind (void)
On exit "goacc_thread" will return a valid & populated thread block. */
attribute_hidden void
-goacc_lazy_initialize (void)
+goacc_lazy_initialize (int acc_prof_locinfo_skip)
{
struct goacc_thread *thr = goacc_thread ();
-
if (thr && thr->dev)
return;
+ gomp_init_targets_once ();
+
+ gomp_mutex_lock (&acc_device_lock);
if (!cached_base_dev)
- acc_init (acc_device_default);
- else
- goacc_attach_host_thread_to_device (-1);
+ cached_base_dev = acc_init_1 (thr, acc_device_default,
+ acc_construct_parallel, 1,
+ acc_prof_locinfo_skip + 1);
+ gomp_mutex_unlock (&acc_device_lock);
+
+ goacc_attach_host_thread_to_device (-1);
}
@@ -40,6 +40,8 @@
#include "openacc.h"
#include "config.h"
+#include "acc_prof.h"
+#include "backtrace.h"
#include <stddef.h>
#include <stdbool.h>
#include <stdarg.h>
@@ -68,6 +70,14 @@ struct goacc_thread
strictly push/pop semantics according to lexical scope. */
struct target_mem_desc *mapped_data;
+ /* Data of the OpenACC Profiling Interface. */
+ acc_prof_info *prof_info;
+ acc_api_info *api_info;
+ /* Per-thread toggle of OpenACC Profiling Interface callbacks. */
+ bool prof_callbacks_enabled;
+ /* Per-thread state of libbacktrace. */
+ struct backtrace_state *backtrace_state;
+
/* These structures form a list: this is the next thread in that list. */
struct goacc_thread *next;
@@ -99,7 +109,7 @@ void goacc_attach_host_thread_to_device (int);
void goacc_runtime_initialize (void);
void goacc_save_and_set_bind (acc_device_t);
void goacc_restore_bind (void);
-void goacc_lazy_initialize (void);
+void goacc_lazy_initialize (int);
void goacc_host_init (void);
void goacc_init_asyncqueues (struct gomp_device_descr *);
@@ -111,6 +121,16 @@ void goacc_async_free (struct gomp_device_descr *,
struct goacc_asyncqueue *get_goacc_asyncqueue (int);
struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool, int);
+void goacc_profiling_initialize (void);
+bool goacc_profiling_setup_p (struct goacc_thread *,
+ acc_prof_info *, acc_api_info *, int);
+bool goacc_profiling_dispatch_p (bool);
+void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
+ acc_api_info *);
+void goacc_profiling_locinfo_initialize ();
+void goacc_profiling_locinfo_fill (struct goacc_thread *,
+ acc_prof_info *, int);
+
#ifdef HAVE_ATTRIBUTE_VISIBILITY
# pragma GCC visibility pop
#endif
@@ -99,16 +99,31 @@ acc_malloc (size_t s)
if (!s)
return NULL;
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
assert (thr->dev);
+ void *ret;
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
- return malloc (s);
+ ret = malloc (s);
+ else
+ ret = thr->dev->alloc_func (thr->dev->target_id, s);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
- return thr->dev->alloc_func (thr->dev->target_id, s);
+ return ret;
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -124,12 +139,22 @@ acc_free (void *d)
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
assert (thr && thr->dev);
struct gomp_device_descr *acc_dev = thr->dev;
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
- return free (d);
+ {
+ free (d);
+
+ goto out;
+ }
gomp_mutex_lock (&acc_dev->lock);
@@ -151,16 +176,35 @@ acc_free (void *d)
if (!acc_dev->free_func (acc_dev->target_id, d))
gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
+
+ out:
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
static void
memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
- const char *libfnname)
+ const char *libfnname, int acc_prof_locinfo_skip)
{
/* No need to call lazy open here, as the device pointer must have
been obtained from a routine that did that. */
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info,
+ acc_prof_locinfo_skip + 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
assert (thr && thr->dev);
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
@@ -169,7 +213,8 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
memmove (h, d, s);
else
memmove (d, h, s);
- return;
+
+ goto out;
}
goacc_aq aq = get_goacc_asyncqueue (async);
@@ -177,30 +222,37 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
gomp_copy_dev2host (thr->dev, aq, h, d, s);
else
gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+
+ out:
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
acc_memcpy_to_device (void *d, void *h, size_t s)
{
- memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
+ memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__, 1);
}
void
acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
{
- memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
+ memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__, 1);
}
void
acc_memcpy_from_device (void *h, void *d, size_t s)
{
- memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
+ memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__, 1);
}
void
acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
{
- memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
+ memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__, 1);
}
/* Return the device pointer that corresponds to host data H. Or NULL
@@ -213,7 +265,7 @@ acc_deviceptr (void *h)
void *d;
void *offset;
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *dev = thr->dev;
@@ -221,6 +273,9 @@ acc_deviceptr (void *h)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h;
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
gomp_mutex_lock (&dev->lock);
n = lookup_host (dev, h, 1);
@@ -250,7 +305,7 @@ acc_hostptr (void *d)
void *h;
void *offset;
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
@@ -258,6 +313,9 @@ acc_hostptr (void *d)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return d;
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
gomp_mutex_lock (&acc_dev->lock);
n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
@@ -287,7 +345,7 @@ acc_is_present (void *h, size_t s)
if (!s || !h)
return 0;
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
@@ -295,6 +353,9 @@ acc_is_present (void *h, size_t s)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h != NULL;
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@@ -321,11 +382,17 @@ acc_map_data (void *h, void *d, size_t s)
size_t sizes = s;
unsigned short kinds = GOMP_MAP_ALLOC;
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
{
if (d != h)
@@ -366,6 +433,12 @@ acc_map_data (void *h, void *d, size_t s)
tgt->prev = acc_dev->openacc.data_environ;
acc_dev->openacc.data_environ = tgt;
gomp_mutex_unlock (&acc_dev->lock);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -380,6 +453,12 @@ acc_unmap_data (void *h)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+
size_t host_size;
gomp_mutex_lock (&acc_dev->lock);
@@ -433,6 +512,12 @@ acc_unmap_data (void *h)
gomp_mutex_unlock (&acc_dev->lock);
gomp_unmap_vars (t, true);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
#define FLAG_PRESENT (1 << 0)
@@ -440,7 +525,8 @@ acc_unmap_data (void *h)
#define FLAG_COPY (1 << 2)
static void *
-present_create_copy (unsigned f, void *h, size_t s, int async)
+present_create_copy (unsigned f, void *h, size_t s, int async,
+ int acc_prof_locinfo_skip)
{
void *d;
splay_tree_key n;
@@ -448,7 +534,7 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
if (!h || !s)
gomp_fatal ("[%p,+%d] is a bad range", (void *)h, (int)s);
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
@@ -456,6 +542,18 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info,
+ acc_prof_locinfo_skip + 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@@ -518,19 +616,26 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
gomp_mutex_unlock (&acc_dev->lock);
}
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
return d;
}
void *
acc_create (void *h, size_t s)
{
- return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync);
+ return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync,
+ 1);
}
void
acc_create_async (void *h, size_t s, int async)
{
- present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async);
+ present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async, 1);
}
/* acc_present_or_create used to be what acc_create is now. */
@@ -556,13 +661,13 @@ void *
acc_copyin (void *h, size_t s)
{
return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s,
- acc_async_sync);
+ acc_async_sync, 1);
}
void
acc_copyin_async (void *h, size_t s, int async)
{
- present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async);
+ present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async, 1);
}
/* acc_present_or_copyin used to be what acc_copyin is now. */
@@ -588,7 +693,8 @@ acc_pcopyin (void *h, size_t s)
#define FLAG_FINALIZE (1 << 1)
static void
-delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
+delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname,
+ int acc_prof_locinfo_skip)
{
size_t host_size;
splay_tree_key n;
@@ -599,6 +705,18 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info,
+ acc_prof_locinfo_skip + 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@@ -672,64 +790,71 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
}
gomp_mutex_unlock (&acc_dev->lock);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
acc_delete (void *h , size_t s)
{
- delete_copyout (0, h, s, acc_async_sync, __FUNCTION__);
+ delete_copyout (0, h, s, acc_async_sync, __FUNCTION__, 1);
}
void
acc_delete_async (void *h , size_t s, int async)
{
- delete_copyout (0, h, s, async, __FUNCTION__);
+ delete_copyout (0, h, s, async, __FUNCTION__, 1);
}
void
acc_delete_finalize (void *h , size_t s)
{
- delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__);
+ delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__, 1);
}
void
acc_delete_finalize_async (void *h , size_t s, int async)
{
- delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__);
+ delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__, 1);
}
void
acc_copyout (void *h, size_t s)
{
- delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__);
+ delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__, 1);
}
void
acc_copyout_async (void *h, size_t s, int async)
{
- delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__);
+ delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__, 1);
}
void
acc_copyout_finalize (void *h, size_t s)
{
delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync,
- __FUNCTION__);
+ __FUNCTION__, 1);
}
void
acc_copyout_finalize_async (void *h, size_t s, int async)
{
- delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__);
+ delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__, 1);
}
static void
-update_dev_host (int is_dev, void *h, size_t s, int async)
+update_dev_host (int is_dev, void *h, size_t s, int async,
+ int acc_prof_locinfo_skip)
{
splay_tree_key n;
void *d;
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (acc_prof_locinfo_skip + 1);
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
@@ -739,6 +864,18 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
gomp_mutex_lock (&acc_dev->lock);
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info,
+ acc_prof_locinfo_skip + 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
n = lookup_host (acc_dev, h, s);
if (!n)
@@ -758,30 +895,36 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
gomp_copy_dev2host (acc_dev, aq, h, d, s);
gomp_mutex_unlock (&acc_dev->lock);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
acc_update_device (void *h, size_t s)
{
- update_dev_host (1, h, s, acc_async_sync);
+ update_dev_host (1, h, s, acc_async_sync, 1);
}
void
acc_update_device_async (void *h, size_t s, int async)
{
- update_dev_host (1, h, s, async);
+ update_dev_host (1, h, s, async, 1);
}
void
acc_update_self (void *h, size_t s)
{
- update_dev_host (0, h, s, acc_async_sync);
+ update_dev_host (0, h, s, acc_async_sync, 1);
}
void
acc_update_self_async (void *h, size_t s, int async)
{
- update_dev_host (0, h, s, async);
+ update_dev_host (0, h, s, async, 1);
}
void
@@ -145,7 +145,8 @@ goacc_call_host_fn (void (*fn) (void *), size_t mapnum, void **hostaddrs,
static void
GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
size_t mapnum, void **hostaddrs, size_t *sizes,
- unsigned short *kinds, va_list *ap)
+ unsigned short *kinds, va_list *ap,
+ int acc_prof_locinfo_skip)
{
bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
struct goacc_thread *thr;
@@ -167,27 +168,79 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p\n",
__FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
#endif
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (acc_prof_locinfo_skip + 1);
thr = goacc_thread ();
acc_dev = thr->dev;
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info;
+
+ prof_info.event_type = acc_ev_compute_construct_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1;
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ goacc_profiling_locinfo_fill (thr, &prof_info, acc_prof_locinfo_skip + 1);
+ }
+ acc_event_info compute_construct_event_info;
+ if (profiling_dispatch_p)
+ {
+ compute_construct_event_info.other_event.event_type
+ = prof_info.event_type;
+ compute_construct_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ compute_construct_event_info.other_event.parent_construct
+ = acc_construct_parallel;
+ compute_construct_event_info.other_event.implicit = 0;
+ compute_construct_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info;
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1;
+ api_info.device_handle = NULL;
+ api_info.context_handle = NULL;
+ api_info.async_handle = NULL;
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+ &api_info);
+
handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
/* Host fallback if "if" clause is false or if the current device is set to
the host. */
if (host_fallback)
{
+ prof_info.device_type = acc_device_host;
+ api_info.device_type = prof_info.device_type;
goacc_save_and_set_bind (acc_device_host);
goacc_call_host_fn (fn, mapnum, hostaddrs, params);
goacc_restore_bind ();
- return;
+ goto out;
}
else if (acc_device_type (acc_dev->type) == acc_device_host)
{
goacc_call_host_fn (fn, mapnum, hostaddrs, params);
- return;
+ goto out;
}
+ else if (profiling_dispatch_p)
+ api_info.device_api = acc_device_api_cuda;
/* Default: let the runtime choose. */
for (i = 0; i != GOMP_DIM_MAX; i++)
@@ -219,6 +272,13 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
if (async == GOMP_LAUNCH_OP_MAX)
async = va_arg (*ap, unsigned);
+
+ if (profiling_dispatch_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
break;
}
@@ -257,10 +317,34 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
else
tgt_fn = (void (*)) fn;
+ acc_event_info enter_exit_data_event_info;
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_enter_data_start;
+ enter_exit_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ enter_exit_data_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ enter_exit_data_event_info.other_event.parent_construct
+ = compute_construct_event_info.other_event.parent_construct;
+ enter_exit_data_event_info.other_event.implicit = 1;
+ enter_exit_data_event_info.other_event.tool_info = NULL;
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+ }
+
goacc_aq aq = get_goacc_asyncqueue (async);
tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
true, GOMP_MAP_VARS_OPENACC);
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_enter_data_end;
+ enter_exit_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+ }
devaddrs = gomp_alloca (sizeof (void *) * mapnum);
for (i = 0; i < mapnum; i++)
@@ -281,8 +365,25 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
else
acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
dims, tgt);
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_exit_data_start;
+ enter_exit_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ enter_exit_data_event_info.other_event.tool_info = NULL;
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+ }
/* If running synchronously, unmap immediately. */
gomp_unmap_vars (tgt, true);
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_exit_data_end;
+ enter_exit_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+ }
}
else
{
@@ -294,6 +395,19 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
devaddrs, dims, tgt, aq);
goacc_async_copyout_unmap_vars (tgt, aq);
}
+
+ out:
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_compute_construct_end;
+ compute_construct_event_info.other_event.event_type
+ = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+ &api_info);
+
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -304,7 +418,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
va_list ap;
va_start (ap, kinds);
GOACC_parallel_keyed_internal (device, 0, fn, mapnum, hostaddrs, sizes,
- kinds, &ap);
+ kinds, &ap, 1);
va_end (ap);
}
@@ -316,7 +430,7 @@ GOACC_parallel_keyed_v2 (int device, int args, void (*fn) (void *),
va_list ap;
va_start (ap, kinds);
GOACC_parallel_keyed_internal (device, args, fn, mapnum, hostaddrs, sizes,
- kinds, &ap);
+ kinds, &ap, 1);
va_end (ap);
}
@@ -349,23 +463,84 @@ GOACC_data_start (int device, size_t mapnum,
__FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
#endif
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info;
+
+ prof_info.event_type = acc_ev_enter_data_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1;
+ prof_info.async = acc_async_sync; /* Always synchronous. */
+ prof_info.async_queue = prof_info.async;
+ goacc_profiling_locinfo_fill (thr, &prof_info, 1);
+ }
+ acc_event_info enter_data_event_info;
+ if (profiling_dispatch_p)
+ {
+ enter_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ enter_data_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ enter_data_event_info.other_event.parent_construct = acc_construct_data;
+ for (int i = 0; i < mapnum; ++i)
+ if (kinds[i] == GOMP_MAP_USE_DEVICE_PTR)
+ {
+ /* If there is one such data mapping kind, then this is actually an
+ OpenACC host_data construct. (GCC maps the OpenACC host_data
+ construct to the OpenACC data construct.) Apart from artificial
+ test cases (such as an OpenACC host_data construct's (implicit)
+ device initialization when there hasn't been any device data be
+ set up before...), there can't really any meaningful events be
+ generated from OpenACC host_data constructs, though. */
+ enter_data_event_info.other_event.parent_construct
+ = acc_construct_host_data;
+ break;
+ }
+ enter_data_event_info.other_event.implicit = 0;
+ enter_data_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info;
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1;
+ api_info.device_handle = NULL;
+ api_info.context_handle = NULL;
+ api_info.async_handle = NULL;
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
/* Host fallback or 'do nothing'. */
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| host_fallback)
{
+ prof_info.device_type = acc_device_host;
+ api_info.device_type = prof_info.device_type;
tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
GOMP_MAP_VARS_OPENACC);
tgt->prev = thr->mapped_data;
thr->mapped_data = tgt;
-
- return;
+ goto out;
}
gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
@@ -374,18 +549,86 @@ GOACC_data_start (int device, size_t mapnum,
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
tgt->prev = thr->mapped_data;
thr->mapped_data = tgt;
+
+ out:
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_enter_data_end;
+ enter_data_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
GOACC_data_end (void)
{
struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
struct target_mem_desc *tgt = thr->mapped_data;
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info;
+
+ prof_info.event_type = acc_ev_exit_data_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1;
+ prof_info.async = acc_async_sync; /* Always synchronous. */
+ prof_info.async_queue = prof_info.async;
+ goacc_profiling_locinfo_fill (thr, &prof_info, 1);
+ }
+ acc_event_info exit_data_event_info;
+ if (profiling_dispatch_p)
+ {
+ exit_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ exit_data_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ exit_data_event_info.other_event.parent_construct = acc_construct_data;
+ exit_data_event_info.other_event.implicit = 0;
+ exit_data_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info;
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1;
+ api_info.device_handle = NULL;
+ api_info.context_handle = NULL;
+ api_info.async_handle = NULL;
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
gomp_debug (0, " %s: restore mappings\n", __FUNCTION__);
thr->mapped_data = tgt->prev;
gomp_unmap_vars (tgt, true);
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
+
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_exit_data_end;
+ exit_data_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -393,6 +636,9 @@ GOACC_enter_exit_data (int device, size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned short *kinds,
int async, int num_waits, ...)
{
+ struct goacc_thread *thr;
+ struct gomp_device_descr *acc_dev;
+ bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
bool data_enter = false;
size_t i;
@@ -437,7 +683,67 @@ GOACC_enter_exit_data (int device, size_t mapnum,
kind);
}
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
+
+ thr = goacc_thread ();
+ acc_dev = thr->dev;
+
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info;
+
+ prof_info.event_type
+ = data_enter ? acc_ev_enter_data_start : acc_ev_exit_data_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1;
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ goacc_profiling_locinfo_fill (thr, &prof_info, 1);
+ }
+ acc_event_info enter_exit_data_event_info;
+ if (profiling_dispatch_p)
+ {
+ enter_exit_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ enter_exit_data_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ enter_exit_data_event_info.other_event.parent_construct
+ = data_enter ? acc_construct_enter_data : acc_construct_exit_data;
+ enter_exit_data_event_info.other_event.implicit = 0;
+ enter_exit_data_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info;
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1;
+ api_info.device_handle = NULL;
+ api_info.context_handle = NULL;
+ api_info.async_handle = NULL;
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+
+ if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ || host_fallback)
+ {
+ prof_info.device_type = acc_device_host;
+ api_info.device_type = prof_info.device_type;
+ goto out;
+ }
if (num_waits > 0)
{
@@ -558,6 +864,18 @@ GOACC_enter_exit_data (int device, size_t mapnum,
i += pointer - 1;
}
}
+
+ out:
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = data_enter ? acc_ev_enter_data_end: acc_ev_exit_data_end;
+ enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
static void
@@ -596,14 +914,64 @@ GOACC_update (int device, size_t mapnum,
bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
size_t i;
- goacc_lazy_initialize ();
+ goacc_lazy_initialize (1);
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info;
+
+ prof_info.event_type = acc_ev_update_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1;
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ goacc_profiling_locinfo_fill (thr, &prof_info, 1);
+ }
+ acc_event_info update_event_info;
+ if (profiling_dispatch_p)
+ {
+ update_event_info.other_event.event_type
+ = prof_info.event_type;
+ update_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ update_event_info.other_event.parent_construct = acc_construct_update;
+ update_event_info.other_event.implicit = 0;
+ update_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info;
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1;
+ api_info.device_handle = NULL;
+ api_info.context_handle = NULL;
+ api_info.async_handle = NULL;
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| host_fallback)
- return;
+ {
+ prof_info.device_type = acc_device_host;
+ api_info.device_type = prof_info.device_type;
+ goto out;
+ }
if (num_waits > 0)
{
@@ -675,11 +1043,40 @@ GOACC_update (int device, size_t mapnum,
break;
}
}
+
+ out:
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_update_end;
+ update_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
GOACC_wait (int async, int num_waits, ...)
{
+ goacc_lazy_initialize (1);
+
+ struct goacc_thread *thr = goacc_thread ();
+
+ /* No nesting. */
+ assert (thr->prof_info == NULL);
+ assert (thr->api_info == NULL);
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ prof_info.async_queue = prof_info.async;
+ }
+
if (num_waits)
{
va_list ap;
@@ -692,6 +1089,12 @@ GOACC_wait (int async, int num_waits, ...)
acc_wait_all ();
else if (async == acc_async_noval)
acc_wait_all_async (async);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
int
@@ -39,6 +39,19 @@ GOMP_PLUGIN_acc_thread (void)
return thr ? thr->target_tls : NULL;
}
+/* Return the TLS data for the current thread. */
+/* TODO. Should we be able to directly call (the static inline function)
+ goacc_thread from within plugin code? I didn't manage to get the
+ "goacc_tls_data" symbol configured correctly: "[...]/ld:
+ .libs/libgomp-plugin-nvptx.so.1.0.0: hidden symbol `goacc_tls_data' isn't
+ defined". */
+
+struct goacc_thread *
+GOMP_PLUGIN_goacc_thread (void)
+{
+ return goacc_thread ();
+}
+
/* Return the default async number from the TLS data for the current thread. */
int
@@ -27,8 +27,11 @@
#ifndef OACC_PLUGIN_H
#define OACC_PLUGIN_H 1
+#include "oacc-int.h"
+
extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
extern void *GOMP_PLUGIN_acc_thread (void);
+extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
extern int GOMP_PLUGIN_acc_thread_default_async (void);
#endif
new file mode 100644
@@ -0,0 +1,39 @@
+/* Copyright (C) 2017 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file provides an stub acc_register_library function. It's in a
+ separate file so that this function can easily be overridden when linking
+ statically. */
+
+#include "libgomp.h"
+#include "acc_prof.h"
+
+void
+acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
+ acc_prof_lookup_func lookup)
+{
+ gomp_debug (0, "dummy %s\n", __FUNCTION__);
+}
new file mode 100644
@@ -0,0 +1,138 @@
+/* Copyright (C) 2018 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* OpenACC Profiling Interface: location information. */
+
+#include "libgomp.h"
+#include "oacc-int.h"
+#include "backtrace-supported.h"
+#ifdef HAVE_STRING_H
+# include <string.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+
+
+/* Initialize. */
+void
+goacc_profiling_locinfo_initialize ()
+{
+ if (!BACKTRACE_SUPPORTED)
+ gomp_debug (0, "libbacktrace not supported:"
+ " can't provide location information in"
+ " the OpenACC Profiling Interface\n");
+}
+
+/* Initialize for "thr" if not yet done. */
+static void
+goacc_profiling_locinfo_initialize_thr (struct goacc_thread *thr)
+{
+ if (__builtin_expect (thr->backtrace_state == NULL, false))
+ {
+ /* Separate state per thread, to avoid locking. */
+ thr->backtrace_state = backtrace_create_state (NULL, 0, NULL, NULL);
+ if (thr->backtrace_state == NULL)
+ gomp_fatal ("failed to create state information for libbacktrace");
+ }
+}
+
+static void
+error_callback (void *data, const char *msg, int errnum)
+{
+ if (errnum < 0)
+ gomp_debug (0, "could not generate backtrace: %s\n", msg);
+ else if (errnum == 0)
+ gomp_debug (0, "could not generate backtrace: %s\n", msg);
+ else
+ gomp_debug (0, "could not generate backtrace: %s: %s\n", msg, strerror (errnum));
+}
+
+static int
+full_callback (void *data, uintptr_t pc, const char *filename, int lineno, const char *function)
+{
+ gomp_debug (0, " libbacktrace found for PC '0x"
+#ifdef HAVE_INTTYPES_H
+ "%" PRIxPTR
+#else
+ "%lu"
+#endif
+ "': filename '%s', lineno '%d', function '%s'\n",
+#ifndef HAVE_INTTYPES_H
+ (unsigned long)
+#endif
+ pc,
+ filename ?: "NULL",
+ lineno,
+ function ?: "NULL");
+
+ acc_prof_info *prof_info = (acc_prof_info *) data;
+
+ /* "backtrace.h" states that "the FILENAME and FUNCTION buffers may become
+ invalid after this function returns". But given that OpenACC 2.5 states
+ that "if the library wants to save [these], it should allocate memory and
+ copy the string[s]", it is fine to pass the pointers here. */
+ prof_info->src_file = filename;
+ prof_info->func_name = function;
+ if (lineno > 0)
+ prof_info->line_no = lineno;
+ else
+ prof_info->line_no = -1;
+
+ /* Stop here; we got what we need. */
+ return 1;
+}
+
+void goacc_profiling_locinfo_fill (struct goacc_thread *thr,
+ acc_prof_info *prof_info, int skip)
+{
+ /* Default: all unknown. */
+ prof_info->src_file = NULL;
+ prof_info->func_name = NULL;
+ prof_info->line_no = -1;
+ prof_info->end_line_no = -1;
+ prof_info->func_line_no = -1;
+ prof_info->func_end_line_no = -1;
+
+ if (!BACKTRACE_SUPPORTED)
+ {
+ /* We diagnosed this in goacc_profiling_locinfo_initialize. */
+ return;
+ }
+
+ /* Special case for oacc-init.c:acc_init_1. */
+ /* See also oacc-profiling.c:goacc_profiling_setup_p. */
+ if (__builtin_expect (thr == NULL, false))
+ {
+ gomp_debug (0, "Can't look up location information for"
+ " the current call, construct, or directive\n");
+ return;
+ }
+
+ goacc_profiling_locinfo_initialize_thr (thr);
+
+ backtrace_full (thr->backtrace_state, skip, full_callback, error_callback, prof_info);
+}
new file mode 100644
@@ -0,0 +1,650 @@
+/* Copyright (C) 2017 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* OpenACC Profiling Interface. */
+
+#define _GNU_SOURCE
+#include "libgomp.h"
+#include "oacc-int.h"
+#include "secure_getenv.h"
+#include "acc_prof.h"
+#include <assert.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#endif
+#ifdef PLUGIN_SUPPORT
+# include <dlfcn.h>
+#endif
+
+#define STATIC_ASSERT(expr) _Static_assert (expr, "!(" #expr ")")
+
+/* Statically assert that the layout of the common fields in the
+ "acc_event_info" variants matches. */
+/* event_type */
+STATIC_ASSERT (offsetof (acc_event_info, event_type)
+ == offsetof (acc_event_info, data_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+ == offsetof (acc_event_info, launch_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+ == offsetof (acc_event_info, other_event.event_type));
+/* valid_bytes */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+ == offsetof (acc_event_info, launch_event.valid_bytes));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+ == offsetof (acc_event_info, other_event.valid_bytes));
+/* parent_construct */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+ == offsetof (acc_event_info, launch_event.parent_construct));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+ == offsetof (acc_event_info, other_event.parent_construct));
+/* implicit */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+ == offsetof (acc_event_info, launch_event.implicit));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+ == offsetof (acc_event_info, other_event.implicit));
+/* tool_info */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+ == offsetof (acc_event_info, launch_event.tool_info));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+ == offsetof (acc_event_info, other_event.tool_info));
+
+struct goacc_prof_callback_entry
+{
+ acc_prof_callback cb;
+ int ref;
+ bool enabled;
+ struct goacc_prof_callback_entry *next;
+};
+
+/* Using a separate flag to minimize run-time performance impact in the (very
+ common) case that profiling is not enabled. */
+static bool goacc_prof_enabled;
+/* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global toggle. */
+static bool goacc_prof_callbacks_enabled[acc_ev_last];
+static struct goacc_prof_callback_entry *goacc_prof_callback_entries[acc_ev_last];
+
+/* This lock is used to protect access to goacc_prof_callbacks_enabled, and
+ goacc_prof_callback_entries. */
+static gomp_mutex_t goacc_prof_lock;
+
+void
+goacc_profiling_initialize (void)
+{
+ gomp_mutex_init (&goacc_prof_lock);
+
+ /* Initially, all callbacks for all events are enabled. */
+ for (int i = 0; i < acc_ev_last; ++i)
+ goacc_prof_callbacks_enabled[i] = true;
+ /* ..., but profiling is still disabled. */
+ __atomic_store_n (&goacc_prof_enabled, false, MEMMODEL_RELAXED);
+
+ /* We are to invoke an external acc_register_library routine, defaulting to
+ our stub oacc-profiling-acc_register_library.c:acc_register_library
+ implementation. */
+ gomp_debug (0, "%s: calling acc_register_library\n", __FUNCTION__);
+ acc_register_library (acc_prof_register, acc_prof_unregister, NULL);
+#ifdef PLUGIN_SUPPORT
+ char *acc_proflibs = secure_getenv ("ACC_PROFLIB");
+ while (acc_proflibs != NULL && acc_proflibs[0] != '\0')
+ {
+ char *acc_proflibs_sep = strchr (acc_proflibs, ';');
+ char *acc_proflib;
+ if (acc_proflibs_sep == acc_proflibs)
+ {
+ /* Stray ";" separator: make sure we don't dlopen the main
+ program. */
+ acc_proflib = NULL;
+ }
+ else
+ {
+ if (acc_proflibs_sep != NULL)
+ {
+ /* Single out the first library. */
+ acc_proflib = gomp_malloc (acc_proflibs_sep - acc_proflibs + 1);
+ memcpy (acc_proflib, acc_proflibs,
+ acc_proflibs_sep - acc_proflibs);
+ acc_proflib[acc_proflibs_sep - acc_proflibs] = '\0';
+ }
+ else
+ {
+ /* No ";" separator, so only one library. */
+ acc_proflib = acc_proflibs;
+ }
+
+ gomp_debug (0, "%s: dlopen(%s)\n", __FUNCTION__, acc_proflib);
+ void *dl_handle = dlopen (acc_proflib, RTLD_LAZY);
+ if (dl_handle != NULL)
+ {
+ typeof (&acc_register_library) a_r_l
+ = dlsym (dl_handle, "acc_register_library");
+ if (a_r_l == NULL)
+ goto dl_fail;
+ /* Avoid duplicate registration, for example if the same shared
+ library is specified in LD_PRELOAD and ACC_PROFLIB -- which
+ TAU 2.26 does when using "tau_exec -openacc". */
+ if (a_r_l == acc_register_library)
+ gomp_debug (0, " %s: skipping duplicate"
+ " %s:acc_register_library\n",
+ __FUNCTION__, acc_proflib);
+ else
+ {
+ gomp_debug (0, " %s: calling %s:acc_register_library\n",
+ __FUNCTION__, acc_proflib);
+ a_r_l (acc_prof_register, acc_prof_unregister, NULL);
+ }
+ }
+ else
+ {
+ dl_fail:
+ gomp_error ("while loading ACC_PROFLIB %s: %s",
+ acc_proflib, dlerror ());
+ if (dl_handle != NULL)
+ {
+ int err = dlclose (dl_handle);
+ dl_handle = NULL;
+ if (err != 0)
+ goto dl_fail;
+ }
+ }
+ }
+
+ if (acc_proflib != acc_proflibs)
+ {
+ free (acc_proflib);
+
+ acc_proflibs = acc_proflibs_sep + 1;
+ }
+ else
+ acc_proflibs = NULL;
+ }
+#endif /* PLUGIN_SUPPORT */
+
+ goacc_profiling_locinfo_initialize ();
+}
+
+void
+acc_prof_register (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+ __atomic_store_n (&goacc_prof_enabled, true, MEMMODEL_RELAXED);
+
+ gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+ __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+ enum
+ {
+ EVENT_KIND_BOGUS,
+ EVENT_KIND_NORMAL,
+ /* As end events invoke callbacks in the reverse order, we register these
+ in the reverse order here. */
+ EVENT_KIND_END,
+ } event_kind = EVENT_KIND_BOGUS;
+ switch (ev)
+ {
+ case acc_ev_none:
+ case acc_ev_device_init_start:
+ case acc_ev_device_shutdown_start:
+ case acc_ev_runtime_shutdown:
+ case acc_ev_create:
+ case acc_ev_delete:
+ case acc_ev_alloc:
+ case acc_ev_free:
+ case acc_ev_enter_data_start:
+ case acc_ev_exit_data_start:
+ case acc_ev_update_start:
+ case acc_ev_compute_construct_start:
+ case acc_ev_enqueue_launch_start:
+ case acc_ev_enqueue_upload_start:
+ case acc_ev_enqueue_download_start:
+ case acc_ev_wait_start:
+ event_kind = EVENT_KIND_NORMAL;
+ break;
+ case acc_ev_device_init_end:
+ case acc_ev_device_shutdown_end:
+ case acc_ev_enter_data_end:
+ case acc_ev_exit_data_end:
+ case acc_ev_update_end:
+ case acc_ev_compute_construct_end:
+ case acc_ev_enqueue_launch_end:
+ case acc_ev_enqueue_upload_end:
+ case acc_ev_enqueue_download_end:
+ case acc_ev_wait_end:
+ event_kind = EVENT_KIND_END;
+ break;
+ case acc_ev_last:
+ break;
+ }
+ if (event_kind == EVENT_KIND_BOGUS)
+ {
+ gomp_error ("ignoring %s request for invalid acc_event_t %d",
+ __FUNCTION__, (int) ev);
+ return;
+ }
+
+ bool bogus = true;
+ switch (reg)
+ {
+ case acc_reg:
+ case acc_toggle:
+ case acc_toggle_per_thread:
+ bogus = false;
+ break;
+ }
+ if (bogus)
+ {
+ gomp_error ("ignoring %s request with invalid acc_register_t %d",
+ __FUNCTION__, (int) reg);
+ return;
+ }
+
+ /* Special cases. */
+ if (reg == acc_toggle)
+ {
+ if (cb == NULL)
+ {
+ gomp_debug (0, " globally enabling callbacks\n");
+ gomp_mutex_lock (&goacc_prof_lock);
+ /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global
+ toggle. */
+ goacc_prof_callbacks_enabled[ev] = true;
+ gomp_mutex_unlock (&goacc_prof_lock);
+ return;
+ }
+ else if (ev == acc_ev_none && cb != NULL)
+ {
+ gomp_debug (0, " ignoring request\n");
+ /* Silently ignore request. */
+ return;
+ }
+ }
+ else if (reg == acc_toggle_per_thread)
+ {
+ if (ev == acc_ev_none && cb == NULL)
+ {
+ gomp_debug (0, " thread: enabling callbacks\n");
+ goacc_lazy_initialize (1);
+ struct goacc_thread *thr = goacc_thread ();
+ thr->prof_callbacks_enabled = true;
+ return;
+ }
+ gomp_error ("ignoring %s request for acc_toggle_per_thread",
+ __FUNCTION__);
+ return;
+ }
+
+ gomp_mutex_lock (&goacc_prof_lock);
+
+ struct goacc_prof_callback_entry *it, *it_p;
+ it = goacc_prof_callback_entries[ev];
+ it_p = NULL;
+ while (it)
+ {
+ if (it->cb == cb)
+ break;
+ it_p = it;
+ it = it->next;
+ }
+
+ switch (reg)
+ {
+ case acc_reg:
+ /* If we already have this callback registered, just increment its ref
+ count. */
+ if (it != NULL)
+ {
+ it->ref++;
+ gomp_debug (0, " already registered;"
+ " incrementing ref count to: %d\n", it->ref);
+ }
+ else
+ {
+ struct goacc_prof_callback_entry *e
+ = gomp_malloc (sizeof (struct goacc_prof_callback_entry));
+ e->cb = cb;
+ e->ref = 1;
+ e->enabled = true;
+ bool prepend = (event_kind == EVENT_KIND_END);
+ /* If we don't have any callback registered yet, also use the
+ "prepend" code path. */
+ if (it_p == NULL)
+ prepend = true;
+ if (prepend)
+ {
+ gomp_debug (0, " prepending\n");
+ e->next = goacc_prof_callback_entries[ev];
+ goacc_prof_callback_entries[ev] = e;
+ }
+ else
+ {
+ gomp_debug (0, " appending\n");
+ e->next = NULL;
+ it_p->next = e;
+ }
+ }
+ break;
+
+ case acc_toggle:
+ if (it == NULL)
+ {
+ /* Silently ignore acc_toggle request if not registered. */
+ gomp_debug (0, " not enabling; not registered\n");
+ }
+ else
+ {
+ gomp_debug (0, " enabling\n");
+ it->enabled = true;
+ }
+ break;
+
+ case acc_toggle_per_thread:
+ __builtin_unreachable ();
+ }
+
+ gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+void
+acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+ gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+ __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+ if (ev < acc_ev_none
+ || ev >= acc_ev_last)
+ {
+ gomp_error ("ignoring %s request for invalid acc_event_t %d",
+ __FUNCTION__, (int) ev);
+ return;
+ }
+
+ bool bogus = true;
+ switch (reg)
+ {
+ case acc_reg:
+ case acc_toggle:
+ case acc_toggle_per_thread:
+ bogus = false;
+ break;
+ }
+ if (bogus)
+ {
+ gomp_error ("ignoring %s request with invalid acc_register_t %d",
+ __FUNCTION__, (int) reg);
+ return;
+ }
+
+ /* Special cases. */
+ if (reg == acc_toggle)
+ {
+ if (cb == NULL)
+ {
+ gomp_debug (0, " globally disabling callbacks\n");
+ gomp_mutex_lock (&goacc_prof_lock);
+ /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global
+ toggle. */
+ goacc_prof_callbacks_enabled[ev] = false;
+ gomp_mutex_unlock (&goacc_prof_lock);
+ return;
+ }
+ else if (ev == acc_ev_none && cb != NULL)
+ {
+ gomp_debug (0, " ignoring request\n");
+ /* Silently ignore request. */
+ return;
+ }
+ }
+ else if (reg == acc_toggle_per_thread)
+ {
+ if (ev == acc_ev_none && cb == NULL)
+ {
+ gomp_debug (0, " thread: disabling callbacks\n");
+ goacc_lazy_initialize (1);
+ struct goacc_thread *thr = goacc_thread ();
+ thr->prof_callbacks_enabled = false;
+ return;
+ }
+ gomp_error ("ignoring %s request for acc_toggle_per_thread",
+ __FUNCTION__);
+ return;
+ }
+
+ gomp_mutex_lock (&goacc_prof_lock);
+
+ struct goacc_prof_callback_entry *it, *it_p;
+ it = goacc_prof_callback_entries[ev];
+ it_p = NULL;
+ while (it)
+ {
+ if (it->cb == cb)
+ break;
+ it_p = it;
+ it = it->next;
+ }
+
+ switch (reg)
+ {
+ case acc_reg:
+ if (it == NULL)
+ {
+ gomp_error ("ignoring %s request for acc_event_t %d: not registered",
+ __FUNCTION__, (int) ev);
+ gomp_mutex_unlock (&goacc_prof_lock);
+ return;
+ }
+ it->ref--;
+ gomp_debug (0, " decrementing ref count to: %d\n", it->ref);
+ if (it->ref == 0)
+ {
+ if (it_p == NULL)
+ goacc_prof_callback_entries[ev] = it->next;
+ else
+ it_p->next = it->next;
+ free (it);
+ }
+ break;
+
+ case acc_toggle:
+ if (it == NULL)
+ {
+ /* Silently ignore acc_toggle request if not registered. */
+ gomp_debug (0, " not disabling; not registered\n");
+ }
+ else
+ {
+ gomp_debug (0, " disabling\n");
+ it->enabled = false;
+ }
+ break;
+
+ case acc_toggle_per_thread:
+ __builtin_unreachable ();
+ }
+
+ gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+/* Set up to dispatch events? */
+
+bool
+goacc_profiling_setup_p (struct goacc_thread *thr,
+ acc_prof_info *prof_info, acc_api_info *api_info,
+ int acc_prof_locinfo_skip)
+{
+ gomp_debug (0, "%s (%p)\n", __FUNCTION__, thr);
+
+ /* If we don't have any per-thread state yet, we can't register prof_info and
+ api_info. */
+ /* See also oacc-profiling-locinfo.c:goacc_profiling_locinfo_fill. */
+ if (__builtin_expect (thr == NULL, false))
+ {
+ gomp_debug (0, "Can't generate OpenACC Profiling Interface events for"
+ " the current call, construct, or directive\n");
+ return false;
+ }
+
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (false), false);
+ if (thr->prof_info != NULL)
+ {
+ assert (profiling_dispatch_p);
+ /* Profiling has already been set up for an outer construct. In this
+ case, we continue to use the existing information, and thus return
+ "false" here.
+
+ This can happen, for example, for an enter data directive, which sets
+ up profiling, then calls into acc_copyin, which should not again set
+ up profiling, should not overwrite the existing information. */
+ return false;
+ }
+
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = prof_info;
+
+ prof_info->event_type = -1; /* Must be set later. */
+ prof_info->valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info->version = _ACC_PROF_INFO_VERSION;
+ if (thr->dev)
+ {
+ prof_info->device_type = acc_device_type (thr->dev->type);
+ prof_info->device_number = thr->dev->target_id;
+ }
+ else
+ {
+ prof_info->device_type = -1;
+ prof_info->device_number = -1;
+ }
+ prof_info->thread_id = -1;
+ prof_info->async = acc_async_sync;
+ prof_info->async_queue = prof_info->async;
+ goacc_profiling_locinfo_fill (thr, prof_info, acc_prof_locinfo_skip + 1);
+
+ thr->api_info = api_info;
+
+ api_info->device_api = acc_device_api_none;
+ api_info->valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info->device_type = prof_info->device_type;
+ api_info->vendor = -1;
+ api_info->device_handle = NULL;
+ api_info->context_handle = NULL;
+ api_info->async_handle = NULL;
+ }
+
+ return profiling_dispatch_p;
+}
+
+/* Prepare to dispatch events? */
+
+bool
+goacc_profiling_dispatch_p (bool check_not_nested_p)
+{
+ if (__builtin_expect (__atomic_load_n (&goacc_prof_enabled,
+ MEMMODEL_RELAXED) != true, true))
+ return false;
+
+ gomp_debug (0, "%s\n", __FUNCTION__);
+
+ struct goacc_thread *thr = goacc_thread ();
+ if (__builtin_expect (thr == NULL, false))
+ {
+ /* If we don't have any per-thread state yet, that means that per-thread
+ callback dispatch has not been explicitly disabled (which only a call
+ to acc_prof_unregister with acc_toggle_per_thread will do, and that
+ would have allocated per-thread state via goacc_lazy_initialize);
+ initially, all callbacks for all events are enabled. */
+ gomp_debug (0, " %s: don't have any per-thread state yet\n", __FUNCTION__);
+ }
+ else
+ {
+ if (check_not_nested_p)
+ {
+ /* No nesting. */
+ assert (thr->prof_info == NULL);
+ assert (thr->api_info == NULL);
+ }
+
+ if (__builtin_expect (!thr->prof_callbacks_enabled, true))
+ {
+ gomp_debug (0, " %s: disabled for this thread\n", __FUNCTION__);
+ return false;
+ }
+ }
+
+ gomp_mutex_lock (&goacc_prof_lock);
+
+ /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global toggle. */
+ if (__builtin_expect (!goacc_prof_callbacks_enabled[acc_ev_none], true))
+ {
+ gomp_debug (0, " %s: disabled globally\n", __FUNCTION__);
+ gomp_mutex_unlock (&goacc_prof_lock);
+ return false;
+ }
+
+ gomp_mutex_unlock (&goacc_prof_lock);
+
+ return true;
+}
+
+/* Dispatch events.
+
+ This must only be called if goacc_profiling_dispatch_p returned a true
+ result. */
+
+void
+goacc_profiling_dispatch (acc_prof_info *prof_info, acc_event_info *event_info,
+ acc_api_info *apt_info)
+{
+ acc_event_t event_type = event_info->event_type;
+ gomp_debug (0, "%s: event_type=%d\n", __FUNCTION__, (int) event_type);
+ assert (event_type > acc_ev_none
+ && event_type < acc_ev_last);
+
+ gomp_mutex_lock (&goacc_prof_lock);
+
+ if (!goacc_prof_callbacks_enabled[event_type])
+ {
+ gomp_debug (0, " %s: disabled for this event type\n", __FUNCTION__);
+ gomp_mutex_unlock (&goacc_prof_lock);
+ return;
+ }
+
+ for (struct goacc_prof_callback_entry *e
+ = goacc_prof_callback_entries[event_type];
+ e != NULL;
+ e = e->next)
+ {
+ if (!e->enabled)
+ {
+ gomp_debug (0, " %s: disabled for callback %p\n",
+ __FUNCTION__, e->cb);
+ continue;
+ }
+
+ gomp_debug (0, " %s: calling callback %p\n", __FUNCTION__, e->cb);
+ e->cb (prof_info, event_info, apt_info);
+ }
+
+ gomp_mutex_unlock (&goacc_prof_lock);
+}
@@ -36,6 +36,7 @@
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
+#include "oacc-int.h"
#include <pthread.h>
#include <cuda.h>
@@ -861,11 +862,52 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
// num_workers ntid.y
// vector length ntid.x
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_event_info enqueue_launch_event_info;
+ acc_api_info *api_info = thr->api_info;
+ bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_launch_start;
+
+ enqueue_launch_event_info.launch_event.event_type
+ = prof_info->event_type;
+ enqueue_launch_event_info.launch_event.valid_bytes
+ = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
+ enqueue_launch_event_info.launch_event.parent_construct
+ = acc_construct_parallel;
+ enqueue_launch_event_info.launch_event.implicit = 1;
+ enqueue_launch_event_info.launch_event.tool_info = NULL;
+ enqueue_launch_event_info.launch_event.kernel_name
+ = targ_fn->launch->fn;
+ enqueue_launch_event_info.launch_event.num_gangs
+ = dims[GOMP_DIM_GANG];
+ enqueue_launch_event_info.launch_event.num_workers
+ = dims[GOMP_DIM_WORKER];
+ enqueue_launch_event_info.launch_event.vector_length
+ = dims[GOMP_DIM_VECTOR];
+
+ api_info->device_api = acc_device_api_cuda;
+
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
+ api_info);
+ }
+
CUDA_CALL_ASSERT (cuLaunchKernel, function,
dims[GOMP_DIM_GANG], 1, 1,
dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
0, stream, kargs, 0);
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_launch_end;
+ enqueue_launch_event_info.launch_event.event_type
+ = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
+ api_info);
+ }
+
GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
targ_fn->launch->fn);
}
@@ -878,6 +920,36 @@ nvptx_alloc (size_t s)
CUdeviceptr d;
CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
+
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ bool profiling_dispatch_p
+ = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+ if (profiling_dispatch_p)
+ {
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_event_info data_event_info;
+ acc_api_info *api_info = thr->api_info;
+
+ prof_info->event_type = acc_ev_alloc;
+
+ data_event_info.data_event.event_type = prof_info->event_type;
+ data_event_info.data_event.valid_bytes
+ = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+ data_event_info.data_event.parent_construct
+ = acc_construct_parallel;
+ data_event_info.data_event.implicit = 1;
+ data_event_info.data_event.tool_info = NULL;
+ data_event_info.data_event.var_name = NULL;
+ data_event_info.data_event.bytes = s;
+ data_event_info.data_event.host_ptr = NULL;
+ data_event_info.data_event.device_ptr = (void *) d;
+
+ api_info->device_api = acc_device_api_cuda;
+
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
+
return (void *) d;
}
@@ -1253,11 +1325,47 @@ openacc_exec_internal (void (*fn) (void *), int params, size_t mapnum,
/* Copy the (device) pointers to arguments to the device (dp and hp might in
fact have the same value on a unified-memory system). */
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_event_info data_event_info;
+ acc_api_info *api_info = thr->api_info;
+ bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_upload_start;
+
+ data_event_info.data_event.event_type = prof_info->event_type;
+ data_event_info.data_event.valid_bytes
+ = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+ data_event_info.data_event.parent_construct
+ = acc_construct_parallel;
+ /* Always implicit for "data mapping arguments for cuLaunchKernel". */
+ data_event_info.data_event.implicit = 1;
+ data_event_info.data_event.tool_info = NULL;
+ data_event_info.data_event.var_name = NULL;
+ data_event_info.data_event.bytes = mapnum * sizeof (void *);
+ data_event_info.data_event.host_ptr = hp;
+ if (!params)
+ data_event_info.data_event.device_ptr = (void *) dp;
+
+ api_info->device_api = acc_device_api_cuda;
+
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
if (!params && mapnum > 0)
CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
mapnum * sizeof (void *));
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_upload_end;
+ data_event_info.data_event.event_type = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
+
if (params)
nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
hp, NULL);
@@ -1338,6 +1446,34 @@ openacc_async_exec_internal (void (*fn) (void *), int params, size_t mapnum,
/* Copy the (device) pointers to arguments to the device (dp and hp might in
fact have the same value on a unified-memory system). */
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_event_info data_event_info;
+ acc_api_info *api_info = thr->api_info;
+ bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_upload_start;
+
+ data_event_info.data_event.event_type = prof_info->event_type;
+ data_event_info.data_event.valid_bytes
+ = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+ data_event_info.data_event.parent_construct
+ = acc_construct_parallel;
+ /* Always implicit for "data mapping arguments for cuLaunchKernel". */
+ data_event_info.data_event.implicit = 1;
+ data_event_info.data_event.tool_info = NULL;
+ data_event_info.data_event.var_name = NULL;
+ data_event_info.data_event.bytes = mapnum * sizeof (void *);
+ data_event_info.data_event.host_ptr = hp;
+ if (!params)
+ data_event_info.data_event.device_ptr = (void *) dp;
+
+ api_info->device_api = acc_device_api_cuda;
+
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
if (!params && mapnum > 0)
{
@@ -1350,6 +1486,14 @@ openacc_async_exec_internal (void (*fn) (void *), int params, size_t mapnum,
block[1] = (void *) nvthd->ptx_dev;
}
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_upload_end;
+ data_event_info.data_event.event_type = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
+
if (params)
nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
hp, aq->cuda_stream);
new file mode 100644
@@ -0,0 +1,350 @@
+/* Test dispatch of events to callbacks. */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+ which will cause the value at the point of call to be used (*before* any
+ potential modifications done in callbacks), as opposed to its address being
+ taken, which then later gets dereferenced (*after* any modifications done in
+ callbacks). */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+#define STATE_OP(state, op)\
+ do \
+ { \
+ typeof (state) state_o = (state); \
+ (void) state_o; \
+ (state)op; \
+ DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+ } \
+ while (0)
+
+
+void cb_compute_construct_start_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 0
+ || state == 10
+ || state == 30
+ || state == 41
+ || state == 51
+ || state == 91
+ || state == 101
+ || state == 151);
+ STATE_OP (state, ++);
+}
+
+void cb_compute_construct_start_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 1
+ || state == 11
+ || state == 40
+ || state == 50
+ || state == 90
+ || state == 100
+ || state == 150);
+ STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 14
+ || state == 21
+ || state == 32
+ || state == 42
+ || state == 80
+ || state == 103
+ || state == 152);
+ STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 13
+ || state == 43
+ || state == 102
+ || state == 154);
+ STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_3 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 12
+ || state == 20
+ || state == 31
+ || state == 44
+ || state == 81
+ || state == 104
+ || state == 153);
+ STATE_OP (state, ++);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ reg = reg_;
+ unreg = unreg_;
+ lookup = lookup_;
+}
+
+
+int main()
+{
+ STATE_OP (state, = 0);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 2);
+ }
+ assert (state == 2);
+
+ STATE_OP (state, = 10);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 12);
+ }
+ assert (state == 15);
+
+ STATE_OP (state, = 20);
+ unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
+ unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
+ unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+ unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 20);
+ }
+ assert (state == 20);
+
+ STATE_OP (state, = 30);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 31);
+ }
+ assert (state == 33);
+
+ STATE_OP (state, = 40);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+ unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 42);
+ }
+ assert (state == 45);
+
+ STATE_OP (state, = 50);
+ unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 52);
+ }
+ assert (state == 52);
+
+ STATE_OP (state, = 60);
+ unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+ unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+ unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 60);
+ }
+ assert (state == 60);
+
+ STATE_OP (state, = 70);
+ unreg (acc_ev_compute_construct_start, NULL, acc_toggle);
+ reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 70);
+ }
+ assert (state == 70);
+
+ STATE_OP (state, = 80);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+ reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+ reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 80);
+ }
+ assert (state == 82);
+
+ STATE_OP (state, = 90);
+ reg (acc_ev_compute_construct_start, NULL, acc_toggle);
+ unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 92);
+ }
+ assert (state == 92);
+
+ STATE_OP (state, = 100);
+ reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 102);
+ }
+ assert (state == 105);
+
+ STATE_OP (state, = 110);
+ unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+ unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 110);
+ }
+ assert (state == 110);
+
+ STATE_OP (state, = 120);
+ unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 120);
+ }
+ assert (state == 120);
+
+ STATE_OP (state, = 130);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+ reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 130);
+ }
+ assert (state == 130);
+
+ STATE_OP (state, = 140);
+ unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+ unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 140);
+ }
+ assert (state == 140);
+
+ STATE_OP (state, = 150);
+ reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+ {
+ int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ assert (state_init == 152);
+ }
+ assert (state == 155);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,5 @@
+/* Test "acc_prof-init-1.c" with debug info available. */
+
+/* { dg-additional-options "-g -DDEBUG_INFO=1" } */
+
+#include "acc_prof-init-1.c"
new file mode 100644
@@ -0,0 +1,388 @@
+/* Test dispatch of events to callbacks. */
+
+/* If not included from "acc_prof-parallel-1-debug_info.c". */
+#ifndef DEBUG_INFO
+# define DEBUG_INFO 0
+#endif
+
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+ which will cause the value at the point of call to be used (*before* any
+ potential modifications done in callbacks), as opposed to its address being
+ taken, which then later gets dereferenced (*after* any modifications done in
+ callbacks). */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+#define STATE_OP(state, op)\
+ do \
+ { \
+ typeof (state) state_o = (state); \
+ (void) state_o; \
+ (state)op; \
+ DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+ } \
+ while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+
+struct tool_info
+{
+ acc_event_info event_info;
+ struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+
+static const char *acc_prof_src_file = NULL;
+static const char *acc_prof_func_name = NULL;
+static int acc_prof_line_no = -1;
+
+static void
+set_locinfo (const char *src_file, const char *func_name, int line_no)
+{
+ assert (acc_prof_src_file == NULL);
+ acc_prof_src_file = src_file;
+ assert (acc_prof_func_name == NULL);
+ acc_prof_func_name = func_name;
+ assert (acc_prof_line_no == -1);
+ acc_prof_line_no = line_no;
+}
+
+static void
+unset_locinfo ()
+{
+ assert (acc_prof_src_file != NULL);
+ acc_prof_src_file = NULL;
+ assert (acc_prof_func_name != NULL);
+ acc_prof_func_name = NULL;
+ assert (acc_prof_line_no != -1);
+ acc_prof_line_no = -1;
+}
+
+static void
+verify_locinfo (const acc_prof_info *prof_info)
+{
+ DEBUG_printf (" acc_prof_src_file: '%s'\n", acc_prof_src_file ?: "NULL");
+ DEBUG_printf (" prof_info->src_file: '%s'\n", prof_info->src_file ?: "NULL");
+ DEBUG_printf (" acc_prof_func_name: '%s'\n", acc_prof_func_name ?: "NULL");
+ DEBUG_printf (" prof_info->func_name: '%s'\n", prof_info->func_name ?: "NULL");
+ DEBUG_printf (" acc_prof_line_no: '%d'\n", acc_prof_line_no);
+ DEBUG_printf (" prof_info->line_no: '%d'\n", prof_info->line_no);
+
+ assert (acc_prof_src_file != NULL);
+ assert (acc_prof_func_name != NULL);
+ assert (acc_prof_line_no != -1);
+#if DEBUG_INFO
+ assert (prof_info->src_file != NULL);
+ assert (strcmp (prof_info->src_file, acc_prof_src_file) == 0);
+ assert (prof_info->func_name != NULL);
+ assert (strcmp (prof_info->func_name, acc_prof_func_name) == 0);
+ assert (prof_info->line_no == acc_prof_line_no);
+#else
+ assert (prof_info->src_file == NULL);
+ assert (prof_info->func_name == NULL);
+ assert (prof_info->line_no == -1);
+#endif
+ assert (prof_info->end_line_no == -1);
+ assert (prof_info->func_line_no == -1);
+ assert (prof_info->func_end_line_no == -1);
+}
+
+
+void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 0
+ || state == 100);
+ STATE_OP (state, ++);
+
+ assert (tool_info == NULL);
+ tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+ assert (tool_info != NULL);
+ tool_info->nested = NULL;
+
+ assert (prof_info->event_type == acc_ev_device_init_start);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ if (state == 1)
+ assert (prof_info->device_type == acc_device_host);
+ else
+ assert (prof_info->device_type == acc_device_default);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async_sync);
+ assert (prof_info->async_queue == prof_info->async);
+#if DEBUG_INFO
+ //TODO verify_locinfo (prof_info);
+#else
+ verify_locinfo (prof_info);
+#endif
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
+ assert (event_info->other_event.implicit == 0);
+ assert (event_info->other_event.tool_info == NULL);
+
+ assert (api_info->device_api == acc_device_api_none);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+ event_info->other_event.tool_info = tool_info;
+}
+
+void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 1
+ || state == 101);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
+
+ assert (prof_info->event_type == acc_ev_device_init_end);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ if (state == 2)
+ assert (prof_info->device_type == acc_device_host);
+ else
+ assert (prof_info->device_type == acc_device_default);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async_sync);
+ assert (prof_info->async_queue == prof_info->async);
+#if DEBUG_INFO
+ //TODO verify_locinfo (prof_info);
+#else
+ verify_locinfo (prof_info);
+#endif
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
+ assert (event_info->other_event.implicit == 0);
+ assert (event_info->other_event.tool_info == tool_info);
+
+ assert (api_info->device_api == acc_device_api_none);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ free (tool_info);
+ tool_info = NULL;
+}
+
+void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 10
+ || state == 110);
+ STATE_OP (state, ++);
+
+ assert (tool_info == NULL);
+ tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+ assert (tool_info != NULL);
+ tool_info->nested = NULL;
+
+ assert (prof_info->event_type == acc_ev_compute_construct_start);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 0);
+ assert (event_info->other_event.tool_info == NULL);
+
+ assert (api_info->device_api == acc_device_api_none);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+ event_info->other_event.tool_info = tool_info;
+}
+
+void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 11
+ || state == 111);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested == NULL);
+
+ assert (prof_info->event_type == acc_ev_compute_construct_end);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ if (acc_device_type == acc_device_host)
+ assert (prof_info->async == acc_async_sync);
+ else
+ assert (prof_info->async == acc_async);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 0);
+ assert (event_info->other_event.tool_info == tool_info);
+
+ if (acc_device_type == acc_device_host)
+ assert (api_info->device_api == acc_device_api_none);
+ else
+ assert (api_info->device_api == acc_device_api_cuda);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ free (tool_info);
+ tool_info = NULL;
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ reg = reg_;
+ unreg = unreg_;
+ lookup = lookup_;
+}
+
+
+int main()
+{
+ STATE_OP (state, = 0);
+ reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+ reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+ assert (state == 0);
+
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_init (acc_device_host);
+ unset_locinfo ();
+ assert (state == 2);
+
+ STATE_OP (state, = 10);
+
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_device_type = acc_get_device_type ();
+ unset_locinfo ();
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_device_num = acc_get_device_num (acc_device_type);
+ unset_locinfo ();
+ acc_async = 12;
+
+ {
+ int state_init;
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ unset_locinfo ();
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc wait
+ unset_locinfo ();
+ assert (state_init == 11);
+ }
+ assert (state == 12);
+
+ STATE_OP (state, = 90);
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_shutdown (acc_device_host);
+ unset_locinfo ();
+ assert (state == 90);
+
+
+ STATE_OP (state, = 100);
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_init (acc_device_default);
+ unset_locinfo ();
+ assert (state == 102);
+
+ STATE_OP (state, = 110);
+
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_device_type = acc_get_device_type ();
+ unset_locinfo ();
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_device_num = acc_get_device_num (acc_device_type);
+ unset_locinfo ();
+ acc_async = 12;
+
+ {
+ int state_init;
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ unset_locinfo ();
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc wait
+ unset_locinfo ();
+ assert (state_init == 111);
+ }
+ assert (state == 112);
+
+ STATE_OP (state, = 190);
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_shutdown (acc_device_default);
+ unset_locinfo ();
+ assert (state == 190);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,252 @@
+/* Test dispatch of events to callbacks. */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+ which will cause the value at the point of call to be used (*before* any
+ potential modifications done in callbacks), as opposed to its address being
+ taken, which then later gets dereferenced (*after* any modifications done in
+ callbacks). */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+/* See the "DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT" reference in
+ libgomp.texi. */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+#define STATE_OP(state, op)\
+ do \
+ { \
+ typeof (state) state_o = (state); \
+ (void) state_o; \
+ (state)op; \
+ DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+ } \
+ while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int num_gangs, num_workers, vector_length;
+
+
+void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (acc_device_type != acc_device_host);
+
+ assert (state == 0);
+ STATE_OP (state, = 1);
+
+ assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async_sync);
+ assert (prof_info->async_queue == prof_info->async);
+ assert (prof_info->src_file == NULL);
+ assert (prof_info->func_name == NULL);
+ assert (prof_info->line_no == -1);
+ assert (prof_info->end_line_no == -1);
+ assert (prof_info->func_line_no == -1);
+ assert (prof_info->func_end_line_no == -1);
+
+ assert (event_info->launch_event.event_type == prof_info->event_type);
+ assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+ assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+ assert (event_info->launch_event.implicit == 1);
+ assert (event_info->launch_event.tool_info == NULL);
+ assert (event_info->launch_event.kernel_name != NULL);
+ {
+ const char *s = strstr (event_info->launch_event.kernel_name, "main");
+ assert (s != NULL);
+ s = strstr (s, "omp_fn");
+ assert (s != NULL);
+ }
+ if (num_gangs < 1)
+ assert (event_info->launch_event.num_gangs >= 1);
+ else
+ {
+#ifdef __OPTIMIZE__
+ assert (event_info->launch_event.num_gangs == num_gangs);
+#else
+ /* No parallelized OpenACC kernels constructs, and unparallelized OpenACC
+ kernels constructs must get launched as 1 x 1 x 1 kernels. */
+ assert (event_info->launch_event.num_gangs == 1);
+#endif
+ }
+ if (num_workers < 1)
+ assert (event_info->launch_event.num_workers >= 1);
+ else
+ {
+#ifdef __OPTIMIZE__
+ assert (event_info->launch_event.num_workers == num_workers);
+#else
+ /* See num_gangs above. */
+ assert (event_info->launch_event.num_workers == 1);
+#endif
+ }
+ if (vector_length < 1)
+ assert (event_info->launch_event.vector_length >= 1);
+ else if (acc_device_type == acc_device_nvidia) /* ... is special. */
+ assert (event_info->launch_event.vector_length == 32);
+ else
+ {
+#ifdef __OPTIMIZE__
+ assert (event_info->launch_event.vector_length == vector_length);
+#else
+ /* See num_gangs above. */
+ assert (event_info->launch_event.vector_length == 1);
+#endif
+ }
+
+ if (acc_device_type == acc_device_host)
+ assert (api_info->device_api == acc_device_api_none);
+ else
+ assert (api_info->device_api == acc_device_api_cuda);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ reg = reg_;
+ unreg = unreg_;
+ lookup = lookup_;
+}
+
+
+int main()
+{
+ STATE_OP (state, = 0);
+ reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+ assert (state == 0);
+
+ acc_device_type = acc_get_device_type ();
+ acc_device_num = acc_get_device_num (acc_device_type);
+ assert (state == 0);
+
+ /* Parallelism dimensions: compiler/runtime decides. */
+ STATE_OP (state, = 0);
+ num_gangs = num_workers = vector_length = 0;
+ {
+#define N 100
+ int x[N];
+#pragma acc kernels
+ {
+ for (int i = 0; i < N; ++i)
+ x[i] = i * i;
+ }
+#ifdef __OPTIMIZE__
+ /* TODO. With -O2 optimizations enabled, the compiler believes that here
+ "state == 0" still holds. It's not yet clear what's going on.
+ Mis-optimization across the GOMP function call boundary? Per its
+ gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+ "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+ must expect calls back into this compilation unit? */
+ asm volatile ("" : : : "memory");
+#endif
+ if (acc_device_type == acc_device_host)
+ assert (state == 0); /* No acc_ev_enqueue_launch_start. */
+ else
+ assert (state == 1);
+ for (int i = 0; i < N; ++i)
+ if (x[i] != i * i)
+ __builtin_abort ();
+#undef N
+ }
+
+ /* Parallelism dimensions: literal. */
+ STATE_OP (state, = 0);
+ num_gangs = 30;
+ num_workers = 3;
+ vector_length = 5;
+ {
+#define N 100
+ int x[N];
+#pragma acc kernels \
+ num_gangs (30) num_workers (3) vector_length (5)
+ /* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
+ {
+ for (int i = 0; i < N; ++i)
+ x[i] = i * i;
+ }
+#ifdef __OPTIMIZE__
+ /* TODO. With -O2 optimizations enabled, the compiler believes that here
+ "state == 0" still holds. It's not yet clear what's going on.
+ Mis-optimization across the GOMP function call boundary? Per its
+ gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+ "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+ must expect calls back into this compilation unit? */
+ asm volatile ("" : : : "memory");
+#endif
+ if (acc_device_type == acc_device_host)
+ assert (state == 0); /* No acc_ev_enqueue_launch_start. */
+ else
+ assert (state == 1);
+ for (int i = 0; i < N; ++i)
+ if (x[i] != i * i)
+ __builtin_abort ();
+#undef N
+ }
+
+ /* Parallelism dimensions: variable. */
+ STATE_OP (state, = 0);
+ num_gangs = 22;
+ num_workers = 5;
+ vector_length = 7;
+ {
+#define N 100
+ int x[N];
+#pragma acc kernels \
+ num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
+ /* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
+ {
+ for (int i = 0; i < N; ++i)
+ x[i] = i * i;
+ }
+#ifdef __OPTIMIZE__
+ /* TODO. With -O2 optimizations enabled, the compiler believes that here
+ "state == 0" still holds. It's not yet clear what's going on.
+ Mis-optimization across the GOMP function call boundary? Per its
+ gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+ "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+ must expect calls back into this compilation unit? */
+ asm volatile ("" : : : "memory");
+#endif
+ if (acc_device_type == acc_device_host)
+ assert (state == 0); /* No acc_ev_enqueue_launch_start. */
+ else
+ assert (state == 1);
+ for (int i = 0; i < N; ++i)
+ if (x[i] != i * i)
+ __builtin_abort ();
+#undef N
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,5 @@
+/* Test "acc_prof-parallel-1.c" with debug info available. */
+
+/* { dg-additional-options "-g -DDEBUG_INFO=1" } */
+
+#include "acc_prof-parallel-1.c"
new file mode 100644
@@ -0,0 +1,737 @@
+/* Test dispatch of events to callbacks. */
+
+/* If not included from "acc_prof-parallel-1-debug_info.c". */
+#ifndef DEBUG_INFO
+# define DEBUG_INFO 0
+#endif
+
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+ which will cause the value at the point of call to be used (*before* any
+ potential modifications done in callbacks), as opposed to its address being
+ taken, which then later gets dereferenced (*after* any modifications done in
+ callbacks). */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+/* See the "DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT" reference in
+ libgomp.texi. */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+#define STATE_OP(state, op)\
+ do \
+ { \
+ typeof (state) state_o = (state); \
+ (void) state_o; \
+ (state)op; \
+ DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+ } \
+ while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+
+struct tool_info
+{
+ acc_event_info event_info;
+ struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+
+static const char *acc_prof_src_file = NULL;
+static const char *acc_prof_func_name = NULL;
+static int acc_prof_line_no = -1;
+
+static void
+set_locinfo (const char *src_file, const char *func_name, int line_no)
+{
+ assert (acc_prof_src_file == NULL);
+ acc_prof_src_file = src_file;
+ assert (acc_prof_func_name == NULL);
+ acc_prof_func_name = func_name;
+ assert (acc_prof_line_no == -1);
+ acc_prof_line_no = line_no;
+}
+
+static void
+unset_locinfo ()
+{
+ assert (acc_prof_src_file != NULL);
+ acc_prof_src_file = NULL;
+ assert (acc_prof_func_name != NULL);
+ acc_prof_func_name = NULL;
+ assert (acc_prof_line_no != -1);
+ acc_prof_line_no = -1;
+}
+
+static void
+verify_locinfo (const acc_prof_info *prof_info)
+{
+ DEBUG_printf (" acc_prof_src_file: '%s'\n", acc_prof_src_file ?: "NULL");
+ DEBUG_printf (" prof_info->src_file: '%s'\n", prof_info->src_file ?: "NULL");
+ DEBUG_printf (" acc_prof_func_name: '%s'\n", acc_prof_func_name ?: "NULL");
+ DEBUG_printf (" prof_info->func_name: '%s'\n", prof_info->func_name ?: "NULL");
+ DEBUG_printf (" acc_prof_line_no: '%d'\n", acc_prof_line_no);
+ DEBUG_printf (" prof_info->line_no: '%d'\n", prof_info->line_no);
+
+ assert (acc_prof_src_file != NULL);
+ assert (acc_prof_func_name != NULL);
+ assert (acc_prof_line_no != -1);
+#if DEBUG_INFO
+ assert (prof_info->src_file != NULL);
+ assert (strcmp (prof_info->src_file, acc_prof_src_file) == 0);
+ assert (prof_info->func_name != NULL);
+ assert (strcmp (prof_info->func_name, acc_prof_func_name) == 0);
+ assert (prof_info->line_no == acc_prof_line_no);
+#else
+ assert (prof_info->src_file == NULL);
+ assert (prof_info->func_name == NULL);
+ assert (prof_info->line_no == -1);
+#endif
+ assert (prof_info->end_line_no == -1);
+ assert (prof_info->func_line_no == -1);
+ assert (prof_info->func_end_line_no == -1);
+}
+
+
+void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+ assert (state == 1
+ || state == 101);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested == NULL);
+ tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+ assert (tool_info->nested != NULL);
+ tool_info->nested->nested = NULL;
+#else
+ assert (state == 0
+ || state == 100);
+ STATE_OP (state, ++);
+
+ assert (tool_info == NULL);
+ tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+ assert (tool_info != NULL);
+ tool_info->nested = NULL;
+#endif
+
+ assert (prof_info->event_type == acc_ev_device_init_start);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_default);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async_sync);
+ assert (prof_info->async_queue == prof_info->async);
+#if DEBUG_INFO
+ //TODO verify_locinfo (prof_info);
+#else
+ verify_locinfo (prof_info);
+#endif
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 1);
+ assert (event_info->other_event.tool_info == NULL);
+
+ assert (api_info->device_api == acc_device_api_none);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+ tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+ event_info->other_event.tool_info = tool_info->nested;
+#else
+ tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+ event_info->other_event.tool_info = tool_info;
+#endif
+}
+
+void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+ assert (state == 2
+ || state == 102);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested != NULL);
+ assert (tool_info->nested->event_info.other_event.event_type == acc_ev_device_init_start);
+#else
+ assert (state == 1
+ || state == 101);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
+#endif
+
+ assert (prof_info->event_type == acc_ev_device_init_end);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_default);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async_sync);
+ assert (prof_info->async_queue == prof_info->async);
+#if DEBUG_INFO
+ //TODO verify_locinfo (prof_info);
+#else
+ verify_locinfo (prof_info);
+#endif
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 1);
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+ assert (event_info->other_event.tool_info == tool_info->nested);
+#else
+ assert (event_info->other_event.tool_info == tool_info);
+#endif
+
+ assert (api_info->device_api == acc_device_api_none);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+ free (tool_info->nested);
+ tool_info->nested = NULL;
+#else
+ free (tool_info);
+ tool_info = NULL;
+#endif
+}
+
+void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 3
+ || state == 103);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested == NULL);
+ tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+ assert (tool_info->nested != NULL);
+ tool_info->nested->nested = NULL;
+
+ assert (prof_info->event_type == acc_ev_enter_data_start);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 1);
+ assert (event_info->other_event.tool_info == NULL);
+
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+ event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 4
+ || state == 104);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested != NULL);
+ assert (tool_info->nested->event_info.other_event.event_type == acc_ev_enter_data_start);
+
+ assert (prof_info->event_type == acc_ev_enter_data_end);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 1);
+ assert (event_info->other_event.tool_info == tool_info->nested);
+
+ if (acc_device_type == acc_device_host)
+ assert (api_info->device_api == acc_device_api_none);
+ else
+ assert (api_info->device_api == acc_device_api_cuda);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ free (tool_info->nested);
+ tool_info->nested = NULL;
+}
+
+void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 7);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested == NULL);
+ tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+ assert (tool_info->nested != NULL);
+ tool_info->nested->nested = NULL;
+
+ assert (prof_info->event_type == acc_ev_exit_data_start);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 1);
+ assert (event_info->other_event.tool_info == NULL);
+
+ if (acc_device_type == acc_device_host)
+ assert (api_info->device_api == acc_device_api_none);
+ else
+ assert (api_info->device_api == acc_device_api_cuda);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+ event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (state == 8);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested != NULL);
+ assert (tool_info->nested->event_info.other_event.event_type == acc_ev_exit_data_start);
+
+ assert (prof_info->event_type == acc_ev_exit_data_end);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 1);
+ assert (event_info->other_event.tool_info == tool_info->nested);
+
+ if (acc_device_type == acc_device_host)
+ assert (api_info->device_api == acc_device_api_none);
+ else
+ assert (api_info->device_api == acc_device_api_cuda);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ free (tool_info->nested);
+ tool_info->nested = NULL;
+}
+
+void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+ assert (state == 0
+ || state == 100);
+ if (state == 100)
+ {
+ /* Compensate for the missing acc_ev_device_init_start and
+ acc_ev_device_init_end. */
+ state += 2;
+ }
+#else
+ if (state == 100)
+ {
+ /* Compensate for the missing acc_ev_device_init_start and
+ acc_ev_device_init_end. */
+ state += 2;
+ }
+ assert (state == 2
+ || state == 102);
+#endif
+ STATE_OP (state, ++);
+
+ assert (tool_info == NULL);
+ tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+ assert (tool_info != NULL);
+ tool_info->nested = NULL;
+
+ assert (prof_info->event_type == acc_ev_compute_construct_start);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 0);
+ assert (event_info->other_event.tool_info == NULL);
+
+ assert (api_info->device_api == acc_device_api_none);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+ event_info->other_event.tool_info = tool_info;
+
+ if (acc_device_type == acc_device_host)
+ {
+ /* Compensate for the missing acc_ev_enter_data_start. */
+ state += 1;
+ }
+}
+
+void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ if (acc_device_type == acc_device_host)
+ {
+ /* Compensate for the missing acc_ev_enter_data_end. */
+ state += 1;
+ /* Compensate for the missing acc_ev_enqueue_launch_start and
+ acc_ev_enqueue_launch_end. */
+ state += 2;
+ /* Compensate for the missing acc_ev_exit_data_start and
+ acc_ev_exit_data_end. */
+ state += 2;
+ }
+ else if (acc_async != acc_async_sync)
+ {
+ /* Compensate for the missing acc_ev_exit_data_start and
+ acc_ev_exit_data_end. */
+ state += 2;
+ }
+ assert (state == 9
+ || state == 109);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested == NULL);
+
+ assert (prof_info->event_type == acc_ev_compute_construct_end);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ if (acc_device_type == acc_device_host)
+ assert (prof_info->async == acc_async_sync);
+ else
+ assert (prof_info->async == acc_async);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->other_event.event_type == prof_info->event_type);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (event_info->other_event.parent_construct == acc_construct_parallel);
+ assert (event_info->other_event.implicit == 0);
+ assert (event_info->other_event.tool_info == tool_info);
+
+ if (acc_device_type == acc_device_host)
+ assert (api_info->device_api == acc_device_api_none);
+ else
+ assert (api_info->device_api == acc_device_api_cuda);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ free (tool_info);
+ tool_info = NULL;
+}
+
+void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (acc_device_type != acc_device_host);
+
+ assert (state == 5
+ || state == 105);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested == NULL);
+ tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+ assert (tool_info->nested != NULL);
+ tool_info->nested->nested = NULL;
+
+ assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->launch_event.event_type == prof_info->event_type);
+ assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+ assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+ assert (event_info->launch_event.implicit == 1);
+ assert (event_info->launch_event.tool_info == NULL);
+ assert (event_info->launch_event.kernel_name != NULL);
+ {
+ const char *s = strstr (event_info->launch_event.kernel_name, "main");
+ assert (s != NULL);
+ s = strstr (s, "omp_fn");
+ assert (s != NULL);
+ }
+ assert (event_info->launch_event.num_gangs >= 1);
+ assert (event_info->launch_event.num_workers >= 1);
+ assert (event_info->launch_event.vector_length >= 1);
+
+ if (acc_device_type == acc_device_host)
+ assert (api_info->device_api == acc_device_api_none);
+ else
+ assert (api_info->device_api == acc_device_api_cuda);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type;
+ tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name);
+ tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs;
+ tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers;
+ tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length;
+ event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ assert (acc_device_type != acc_device_host);
+
+ assert (state == 6
+ || state == 106);
+ STATE_OP (state, ++);
+
+ assert (tool_info != NULL);
+ assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+ assert (tool_info->nested != NULL);
+ assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start);
+ assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
+ assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
+ assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
+ assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
+
+ assert (prof_info->event_type == acc_ev_enqueue_launch_end);
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+ assert (prof_info->device_type == acc_device_type);
+ assert (prof_info->device_number == acc_device_num);
+ assert (prof_info->thread_id == -1);
+ assert (prof_info->async == acc_async);
+ assert (prof_info->async_queue == prof_info->async);
+ verify_locinfo (prof_info);
+
+ assert (event_info->launch_event.event_type == prof_info->event_type);
+ assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+ assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+ assert (event_info->launch_event.implicit == 1);
+ assert (event_info->launch_event.tool_info == tool_info->nested);
+ assert (event_info->launch_event.kernel_name != NULL);
+ assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0);
+ assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs);
+ assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers);
+ assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length);
+
+ if (acc_device_type == acc_device_host)
+ assert (api_info->device_api == acc_device_api_none);
+ else
+ assert (api_info->device_api == acc_device_api_cuda);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+ assert (api_info->device_type == prof_info->device_type);
+ assert (api_info->vendor == -1);
+ assert (api_info->device_handle == NULL);
+ assert (api_info->context_handle == NULL);
+ assert (api_info->async_handle == NULL);
+
+ free ((void *) tool_info->nested->event_info.launch_event.kernel_name);
+ free (tool_info->nested);
+ tool_info->nested = NULL;
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ reg = reg_;
+ unreg = unreg_;
+ lookup = lookup_;
+}
+
+
+int main()
+{
+ STATE_OP (state, = 0);
+ reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+ reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+ reg (acc_ev_enter_data_start, cb_enter_data_start, acc_reg);
+ reg (acc_ev_enter_data_end, cb_enter_data_end, acc_reg);
+ reg (acc_ev_exit_data_start, cb_exit_data_start, acc_reg);
+ reg (acc_ev_exit_data_end, cb_exit_data_end, acc_reg);
+ reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+ reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+ reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+ reg (acc_ev_enqueue_launch_end, cb_enqueue_launch_end, acc_reg);
+ assert (state == 0);
+
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_device_type = acc_get_device_type ();
+ unset_locinfo ();
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+ acc_device_num = acc_get_device_num (acc_device_type);
+ unset_locinfo ();
+ acc_async = acc_async_sync;
+ assert (state == 0);
+
+ {
+ int state_init;
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc parallel COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ unset_locinfo ();
+ assert (state_init == 4);
+ }
+#ifdef __OPTIMIZE__
+ /* TODO. With -O2 optimizations enabled, the compiler believes that here
+ "state == 0" still holds. It's not yet clear what's going on.
+ Mis-optimization across the GOMP function call boundary? Per its
+ gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+ "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+ must expect calls back into this compilation unit? */
+ asm volatile ("" : : : "memory");
+#endif
+ assert (state == 10);
+
+ STATE_OP (state, = 100);
+
+ acc_async = 12;
+ {
+ int state_init;
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+ {
+ state_init = state;
+ }
+ unset_locinfo ();
+ set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc wait
+ unset_locinfo ();
+ assert (state_init == 104);
+ }
+ assert (state == 110);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,178 @@
+/* Test the "valid_bytes" magic. */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+ assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+ assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+ assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
+ reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
+ reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
+ reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
+ reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
+ reg_ (acc_ev_create, cb_data_event, acc_reg);
+ reg_ (acc_ev_delete, cb_data_event, acc_reg);
+ reg_ (acc_ev_alloc, cb_data_event, acc_reg);
+ reg_ (acc_ev_free, cb_data_event, acc_reg);
+ reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
+ reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
+ reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
+ reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
+ reg_ (acc_ev_update_start, cb_other_event, acc_reg);
+ reg_ (acc_ev_update_end, cb_other_event, acc_reg);
+ reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
+ reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
+ reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
+ reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
+ reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
+ reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
+ reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
+ reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
+ reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
+ reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
+}
+
+
+/* Basic struct. */
+typedef struct A
+{
+ int a;
+ int b;
+#define VALID_BYTES_A \
+ _ACC_PROF_VALID_BYTES_STRUCT (A, b, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (int))
+} A;
+
+/* Add a "char" field. */
+typedef struct B
+{
+ int a;
+ int b;
+ char c;
+#define VALID_BYTES_B \
+ _ACC_PROF_VALID_BYTES_STRUCT (B, c, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} B;
+
+/* Add another "char" field. */
+typedef struct C
+{
+ int a;
+ int b;
+ char c, d;
+#define VALID_BYTES_C \
+ _ACC_PROF_VALID_BYTES_STRUCT (C, d, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} C;
+
+/* Add two "void *" fields. */
+typedef struct D
+{
+ int a;
+ int b;
+ char c, d;
+ void *e;
+ void *f;
+#define VALID_BYTES_D \
+ _ACC_PROF_VALID_BYTES_STRUCT (D, f, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} D;
+
+/* Add another three "char" fields. */
+typedef struct E
+{
+ int a;
+ int b;
+ char c, d;
+ void *e;
+ void *f;
+ char g, h, i;
+#define VALID_BYTES_E \
+ _ACC_PROF_VALID_BYTES_STRUCT (E, i, \
+ _ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} E;
+
+
+int main()
+{
+ A A1;
+ DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
+ assert (VALID_BYTES_A <= sizeof A1);
+ DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
+ assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
+
+ B B1;
+ DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
+ assert (VALID_BYTES_B <= sizeof B1);
+ DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
+ assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
+
+ assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
+
+ C C1;
+ DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
+ assert (VALID_BYTES_C <= sizeof C1);
+ DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
+ assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
+
+ assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
+
+ D D1;
+ DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
+ assert (VALID_BYTES_D <= sizeof D1);
+ DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
+ assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
+
+ assert (VALID_BYTES_D > VALID_BYTES_C);
+
+ E E1;
+ DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
+ assert (VALID_BYTES_E <= sizeof E1);
+ DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
+ assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
+
+ assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
+
+ /* Trigger tests done in cb_* functions. */
+#pragma acc parallel
+ {
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,60 @@
+/* Test "acc_prof_info"'s "version" field. */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+ DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+ assert (prof_info->version == 201510);
+}
+
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+ DEBUG_printf ("%s\n", __FUNCTION__);
+
+ reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
+ reg_ (acc_ev_create, cb_any_event, acc_reg);
+ reg_ (acc_ev_delete, cb_any_event, acc_reg);
+ reg_ (acc_ev_alloc, cb_any_event, acc_reg);
+ reg_ (acc_ev_free, cb_any_event, acc_reg);
+ reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_update_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_update_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
+ reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
+ reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
+}
+
+
+int main()
+{
+ /* Trigger tests done in cb_* functions. */
+#pragma acc parallel
+ {
+ }
+
+ return 0;
+}