Message ID | 20131119095829.GA19301@msticlxl7.ims.intel.com |
---|---|
State | New |
Headers | show |
On Tue, Nov 19, 2013 at 10:58 AM, Ilya Tocar <tocarip.intel@gmail.com> wrote: > On 14 Nov 11:27, Richard Biener wrote: >> > + /* Set when symbol needs to be dumped for lto/offloading. */ >> > + unsigned need_dump : 1; >> > + >> >> That's very non-descriptive. What's "offloading"? But yes, something >> like this is what I was asking for. > > I've changed it into: > Set when symbol needs to be dumped into LTO bytecode for LTO, > or in pragma omp target case, for separate compilation targeting > a different architecture. > > Ok for gomp4 branch now? Works for me. I'll let branch maintainers decide if it follows whatever is done there (I haven't found time to follow stuff here). Richard. > 2013-11-19 Ilya Tocar <ilya.tocar@intel.com> > > * cgraph.h (symtab_node): Add need_dump. > * cgraphunit.c (ipa_passes): Run ipa_write_summaries for omp. > (compile): Intialize streamer for omp. > * ipa-inline-analysis.c (inline_generate_summary): Add flag_openmp. > * lto-cgraph.c (lto_set_symtab_encoder_in_partition): Respect > need_dump flag. > (select_what_to_dump): New. > * lto-streamer.c (section_name_prefix): New. > (lto_get_section_name): Use section_name_prefix. > (lto_streamer_init): Add flag_openmp. > * lto-streamer.h (OMP_SECTION_NAME_PREFIX): New. > (section_name_prefix): Ditto. > (select_what_to_dump): Ditto. > * lto/lto-partition.c (add_symbol_to_partition_1): Set need_dump. > (lto_promote_cross_file_statics): Dump everyhtinh. > * passes.c (ipa_write_summaries): Add parameter, > call select_what_to_dump. > * tree-pass.h (void ipa_write_summaries): Add parameter. > > > --- > gcc/cgraph.h | 5 +++++ > gcc/cgraphunit.c | 15 +++++++++++++-- > gcc/ipa-inline-analysis.c | 2 +- > gcc/lto-cgraph.c | 14 ++++++++++++++ > gcc/lto-streamer.c | 5 +++-- > gcc/lto-streamer.h | 6 ++++++ > gcc/lto/lto-partition.c | 3 +++ > gcc/passes.c | 6 ++++-- > gcc/tree-pass.h | 2 +- > 9 files changed, 50 insertions(+), 8 deletions(-) > > diff --git a/gcc/cgraph.h b/gcc/cgraph.h > index fb0fe93..9f799f4 100644 > --- a/gcc/cgraph.h > +++ b/gcc/cgraph.h > @@ -105,6 +105,11 @@ public: > /* Set when symbol has address taken. */ > unsigned address_taken : 1; > > + /* Set when symbol needs to be dumped into LTO bytecode for LTO, > + or in pragma omp target case, for separate compilation targeting > + a different architecture. */ > + unsigned need_dump : 1; > + > > /* Ordering of all symtab entries. */ > int order; > diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c > index c3a8967..53cd250 100644 > --- a/gcc/cgraphunit.c > +++ b/gcc/cgraphunit.c > @@ -2019,7 +2019,18 @@ ipa_passes (void) > passes->all_lto_gen_passes); > > if (!in_lto_p) > - ipa_write_summaries (); > + { > + if (flag_openmp) > + { > + section_name_prefix = OMP_SECTION_NAME_PREFIX; > + ipa_write_summaries (true); > + } > + if (flag_lto) > + { > + section_name_prefix = LTO_SECTION_NAME_PREFIX; > + ipa_write_summaries (false); > + } > + } > > if (flag_generate_lto) > targetm.asm_out.lto_end (); > @@ -2110,7 +2121,7 @@ compile (void) > cgraph_state = CGRAPH_STATE_IPA; > > /* If LTO is enabled, initialize the streamer hooks needed by GIMPLE. */ > - if (flag_lto) > + if (flag_lto || flag_openmp) > lto_streamer_hooks_init (); > > /* Don't run the IPA passes if there was any error or sorry messages. */ > diff --git a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c > index 4458723..62faa52 100644 > --- a/gcc/ipa-inline-analysis.c > +++ b/gcc/ipa-inline-analysis.c > @@ -3813,7 +3813,7 @@ inline_generate_summary (void) > > /* When not optimizing, do not bother to analyze. Inlining is still done > because edge redirection needs to happen there. */ > - if (!optimize && !flag_lto && !flag_wpa) > + if (!optimize && !flag_lto && !flag_wpa && !flag_openmp) > return; > > function_insertion_hook_holder = > diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c > index 6a52da8..697c069 100644 > --- a/gcc/lto-cgraph.c > +++ b/gcc/lto-cgraph.c > @@ -238,6 +238,9 @@ void > lto_set_symtab_encoder_in_partition (lto_symtab_encoder_t encoder, > symtab_node *node) > { > + /* Ignore not needed nodes. */ > + if (!node->need_dump) > + return; > int index = lto_symtab_encoder_encode (encoder, node); > encoder->nodes[index].in_partition = true; > } > @@ -751,6 +754,17 @@ add_references (lto_symtab_encoder_t encoder, > lto_symtab_encoder_encode (encoder, ref->referred); > } > > +/* Select what needs to be dumped. In lto case dump everything. > + In omp target case only dump stuff makrked with attribute. */ > +void > +select_what_to_dump (bool is_omp) > +{ > + struct symtab_node *snode; > + FOR_EACH_SYMBOL(snode) > + snode->need_dump = !is_omp || lookup_attribute ("omp declare target", > + DECL_ATTRIBUTES (snode->decl)); > +} > + > /* Find all symbols we want to stream into given partition and insert them > to encoders. > > diff --git a/gcc/lto-streamer.c b/gcc/lto-streamer.c > index 1540e4c..ffafb0e 100644 > --- a/gcc/lto-streamer.c > +++ b/gcc/lto-streamer.c > @@ -43,6 +43,7 @@ struct lto_stats_d lto_stats; > static bitmap_obstack lto_obstack; > static bool lto_obstack_initialized; > > +const char *section_name_prefix = LTO_SECTION_NAME_PREFIX; > > /* Return a string representing LTO tag TAG. */ > > @@ -172,7 +173,7 @@ lto_get_section_name (int section_type, const char *name, struct lto_file_decl_d > sprintf (post, "." HOST_WIDE_INT_PRINT_HEX_PURE, f->id); > else > sprintf (post, "." HOST_WIDE_INT_PRINT_HEX_PURE, get_random_seed (false)); > - return concat (LTO_SECTION_NAME_PREFIX, sep, add, post, NULL); > + return concat (section_name_prefix, sep, add, post, NULL); > } > > > @@ -310,7 +311,7 @@ lto_streamer_init (void) > bool > gate_lto_out (void) > { > - return ((flag_generate_lto || in_lto_p) > + return ((flag_generate_lto || in_lto_p || flag_openmp) > /* Don't bother doing anything if the program has errors. */ > && !seen_error ()); > } > diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h > index 797e92e..f4c46db 100644 > --- a/gcc/lto-streamer.h > +++ b/gcc/lto-streamer.h > @@ -139,6 +139,11 @@ along with GCC; see the file COPYING3. If not see > name for the functions and static_initializers. For other types of > sections a '.' and the section type are appended. */ > #define LTO_SECTION_NAME_PREFIX ".gnu.lto_" > +#define OMP_SECTION_NAME_PREFIX ".gnu.target_lto_" > + > +/* Can be either OMP_SECTION_NAME_PREFIX when we stream pragma omp target > + stuff, or LTO_SECTION_NAME_PREFIX for lto case. */ > +extern const char *section_name_prefix; > > #define LTO_major_version 2 > #define LTO_minor_version 2 > @@ -895,6 +900,7 @@ bool referenced_from_this_partition_p (struct ipa_ref_list *, > bool reachable_from_this_partition_p (struct cgraph_node *, > lto_symtab_encoder_t); > lto_symtab_encoder_t compute_ltrans_boundary (lto_symtab_encoder_t encoder); > +void select_what_to_dump (bool); > > > /* In lto-symtab.c. */ > diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c > index 6a3d881..2d2aa63 100644 > --- a/gcc/lto/lto-partition.c > +++ b/gcc/lto/lto-partition.c > @@ -190,6 +190,7 @@ add_symbol_to_partition_1 (ltrans_partition part, symtab_node *node) > gcc_assert (c != SYMBOL_EXTERNAL > && (c == SYMBOL_DUPLICATE || !symbol_partitioned_p (node))); > > + node->need_dump = true; > lto_set_symtab_encoder_in_partition (part->encoder, node); > > if (symbol_partitioned_p (node)) > @@ -917,6 +918,8 @@ lto_promote_cross_file_statics (void) > > gcc_assert (flag_wpa); > > + select_what_to_dump (false); > + > /* First compute boundaries. */ > n_sets = ltrans_partitions.length (); > for (i = 0; i < n_sets; i++) > diff --git a/gcc/passes.c b/gcc/passes.c > index 19e5869..88b1538 100644 > --- a/gcc/passes.c > +++ b/gcc/passes.c > @@ -2335,7 +2335,7 @@ ipa_write_summaries_1 (lto_symtab_encoder_t encoder) > /* Write out summaries for all the nodes in the callgraph. */ > > void > -ipa_write_summaries (void) > +ipa_write_summaries (bool is_omp) > { > lto_symtab_encoder_t encoder; > int i, order_pos; > @@ -2343,9 +2343,11 @@ ipa_write_summaries (void) > struct cgraph_node *node; > struct cgraph_node **order; > > - if (!flag_generate_lto || seen_error ()) > + if (!(flag_generate_lto || flag_openmp) || seen_error () ) > return; > > + select_what_to_dump (is_omp); > + > encoder = lto_symtab_encoder_new (false); > > /* Create the callgraph set in the same order used in > diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h > index fa403c7..8d51d80 100644 > --- a/gcc/tree-pass.h > +++ b/gcc/tree-pass.h > @@ -595,7 +595,7 @@ extern void pass_fini_dump_file (struct opt_pass *); > extern const char *get_current_pass_name (void); > extern void print_current_pass (FILE *); > extern void debug_pass (void); > -extern void ipa_write_summaries (void); > +extern void ipa_write_summaries (bool is_omp); > extern void ipa_write_optimization_summaries (struct lto_symtab_encoder_d *); > extern void ipa_read_summaries (void); > extern void ipa_read_optimization_summaries (void); > -- > 1.8.3.1 >
On Wed, Nov 20, 2013 at 10:34:30AM +0100, Richard Biener wrote: > On Tue, Nov 19, 2013 at 10:58 AM, Ilya Tocar <tocarip.intel@gmail.com> wrote: > > On 14 Nov 11:27, Richard Biener wrote: > >> > + /* Set when symbol needs to be dumped for lto/offloading. */ > >> > + unsigned need_dump : 1; > >> > + > >> > >> That's very non-descriptive. What's "offloading"? But yes, something > >> like this is what I was asking for. > > > > I've changed it into: > > Set when symbol needs to be dumped into LTO bytecode for LTO, > > or in pragma omp target case, for separate compilation targeting > > a different architecture. > > > > Ok for gomp4 branch now? > > Works for me. I'll let branch maintainers decide if it follows whatever > is done there (I haven't found time to follow stuff here). Ok then. Jakub
Hi! Just some suggestion related to terminology. On Tue, 19 Nov 2013 13:58:29 +0400, Ilya Tocar <tocarip.intel@gmail.com> wrote: > On 14 Nov 11:27, Richard Biener wrote: > > > + /* Set when symbol needs to be dumped for lto/offloading. */ > > > + unsigned need_dump : 1; > > > + > > > > That's very non-descriptive. What's "offloading"? But yes, something > > like this is what I was asking for. > > I've changed it into: > Set when symbol needs to be dumped into LTO bytecode for LTO, > or in pragma omp target case, for separate compilation targeting > a different architecture. Can we in fact agree to use the term "offload" to mean exactly that? We'll need this in other contexts, too, such as for configuring the "secondary" lto1 (which is, in fact, the one that will be processing the main GCC's "offloaded" code)? I'm happy to go looking for a proper section in GCC's (internal?) manual to document what "offloading" means in GCC's context, and I'm likewise happy to hear if there's any better term existing for describing basically the process of »separate compilation targeting a different architecture«? (I'm not a native speaker.) By the way, in this context, I like saing "offloading" better than "acceleration", because while we strive for acceleration, offloading is what we technically do. > diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c > index c3a8967..53cd250 100644 > --- a/gcc/cgraphunit.c > +++ b/gcc/cgraphunit.c > @@ -2019,7 +2019,18 @@ ipa_passes (void) > passes->all_lto_gen_passes); > > if (!in_lto_p) > - ipa_write_summaries (); > + { > + if (flag_openmp) The following comment applies to several more instances in this patch: after the front end's parsing stage, we should now basically everywhere treat flag_openacc and flag_openmp the same. The idea is that all the existing OpenMP omp_* infrastructure in the middle end and following is now applicable to not only OpenMP but also OpenACC and any other "acceleration" mechanisms. Again, is there a better term to use instead of "acceleration" for describing the union of Cilk+, OpenACC, OpenMP, and similar techniques? Also, would it then make sense to define a flag à la: #define flag_acceleration (flag_openacc | flag_openmp) ..., and begin using that everywhere after the front ends where flag_openmp is currently used? > +/* Select what needs to be dumped. In lto case dump everything. > + In omp target case only dump stuff makrked with attribute. */ > +void > +select_what_to_dump (bool is_omp) Likewise, while this obviously (and unsurprisingly) continues with the existing convention, I'd suggest that in the future we name such things more generically: is_offload or is_acceleration (or, again, any better term that is generically applicable). Or, going by what I've been told before by Jakub and Nathan in <http://news.gmane.org/find-root.php?message_id=%3C523AC6FF.6030007%40acm.org%3E>: »Names are sticky.«, should we instead continue to name all these new things omp_*, too, and declare that the "omp" tag is an artifact of history? (But it certainly is an obstacle to anyone new to the code. I realize that people are a bit tired of refactoring, which recently has been applied in other contexts, and has caused some "churn". But still, I'd personally go ahread and rename the current omp_* middle end bits to any new tag that we agree on, just to make things more clear to everyone new to the code. But, I don't insist on that (by now I manage to internally map omp_* to acceleration_* or similar), so you long-time contributors of course get to have the finaly say about this. I'm only commenting from still a new contributor's point of view.) > diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h > index 797e92e..f4c46db 100644 > --- a/gcc/lto-streamer.h > +++ b/gcc/lto-streamer.h > @@ -139,6 +139,11 @@ along with GCC; see the file COPYING3. If not see > name for the functions and static_initializers. For other types of > sections a '.' and the section type are appended. */ > #define LTO_SECTION_NAME_PREFIX ".gnu.lto_" > +#define OMP_SECTION_NAME_PREFIX ".gnu.target_lto_" Also here, the "target" tag is confusing in that "target" typically has a different meaning in the compiler context -- while this one here is used for implementing OpenMP's target construct, what it technically does should be described by .gnu.offload_gimple_ or somthing similar. (Note that also the LTO term is no longer really applicable, as we're not neccessarily doing link-time optimization here, but instead rather just use the GIMPLE dumping ("offloading") infrastructure that happens to already be implemented in GCC's LTO support. Again, all this doesn't matter to anyone who's already versed with the code, but it does matter to people who are new, and still have to learn all these fine details. (Of which there are many, many more. Yes, I know, life's tough, and I'm used to that, but still.) ;-) And I'm certainly not asking anyone to spend their own time with this refactoring work, but I'd just like to inquire the general permission, the option of allowing me (or anyone else, of course) to do this later on. Also, if you again say: »Names are sticky.«, then that's fine with me, too. Grüße, Thomas
On Mon, Nov 25, 2013 at 05:13:25PM +0100, Thomas Schwinge wrote: > > --- a/gcc/cgraphunit.c > > +++ b/gcc/cgraphunit.c > > @@ -2019,7 +2019,18 @@ ipa_passes (void) > > passes->all_lto_gen_passes); > > > > if (!in_lto_p) > > - ipa_write_summaries (); > > + { > > + if (flag_openmp) > > The following comment applies to several more instances in this patch: > after the front end's parsing stage, we should now basically everywhere > treat flag_openacc and flag_openmp the same. The idea is that all the > existing OpenMP omp_* infrastructure in the middle end and following is > now applicable to not only OpenMP but also OpenACC and any other > "acceleration" mechanisms. Again, is there a better term to use instead > of "acceleration" for describing the union of Cilk+, OpenACC, OpenMP, and > similar techniques? I don't think acceleration is a good term for everything say OpenMP does, and calling OpenMP clauses acceleration clauses just because some other standards decided to copy/modify a subset of the OpenMP syntax is weird. > > Also, would it then make sense to define a flag à la: > > #define flag_acceleration (flag_openacc | flag_openmp) > > ..., and begin using that everywhere after the front ends where > flag_openmp is currently used? We certainly can have some helper macros, but flag_openacc | flag_openmp isn't the right definition of all of them, right now we have flag_openmp, flag_enable_cilkplus (misnamed, should be really flag_cilkplus), flag_openacc and flag_openmp_simd. For some things (e.g. related to offloading, you want flag_openacc | flag_openmp, for others, e.g. SIMD, you want flag_openmp | flag_openmp_simd | flag_cilkplus, for others some different combination. So flag_acceleration would be certainly misleading. > Also here, the "target" tag is confusing in that "target" typically has a > different meaning in the compiler context -- while this one here is used I agree that target word there is weird. > for implementing OpenMP's target construct, what it technically does > should be described by .gnu.offload_gimple_ or somthing similar. (Note > that also the LTO term is no longer really applicable, as we're not > neccessarily doing link-time optimization here, but instead rather just > use the GIMPLE dumping ("offloading") infrastructure that happens to > already be implemented in GCC's LTO support. But LTO is right here IMHO, we are streaming the LTO bytecode there, not GIMPLE. Jakub
On 11/20/2013 10:36 AM, Jakub Jelinek wrote: > On Wed, Nov 20, 2013 at 10:34:30AM +0100, Richard Biener wrote: >> On Tue, Nov 19, 2013 at 10:58 AM, Ilya Tocar <tocarip.intel@gmail.com> wrote: >>> On 14 Nov 11:27, Richard Biener wrote: >>>>> + /* Set when symbol needs to be dumped for lto/offloading. */ >>>>> + unsigned need_dump : 1; >>>>> + >>>> >>>> That's very non-descriptive. What's "offloading"? But yes, something >>>> like this is what I was asking for. >>> >>> I've changed it into: >>> Set when symbol needs to be dumped into LTO bytecode for LTO, >>> or in pragma omp target case, for separate compilation targeting >>> a different architecture. >>> >>> Ok for gomp4 branch now? >> >> Works for me. I'll let branch maintainers decide if it follows whatever >> is done there (I haven't found time to follow stuff here). > > Ok then. We've been working on similar patches for our OpenACC project. The goal is to have functions generated during omp-low that will ultimately execute on a ptx target, write them out using LTO infrastructure and read them back in using a nvptx-none lto1. Unforunately, with multiple teams working in the same area there's obviously going to be some measure of duplication. What I'd like to do is to post a snapshot of what I currently have, to show the general ideas and hopefully get some discussion of what the final picture should look like. The next few mails in reply to this one will contain patches that work towards the following general outline. I've been trying to keep this flexible enough so that it won't be suitable just for the OpenACC work but for whatever else people want to achieve in this area. 1. New configure options are added, --enable-accelerator and --enable-as-accelerator-for. The names are certainly up for discussion. These allow the compiler to know which target combinations are available. The host compiler will be configured with --enable-accelerator, and the offload/accelerator compiler is configured with both options (mostly to ensure they both agree on the spelling of the accelerator target name). 2. Using --enable-as-accelerator-for= changes the install paths, so that the accelerator compilers end up in (for example) bin/x86_64-linux-gnu-accel-nvptx-gcc-4.9.0 libexec/x86_64-linux-gnu/accel/nvptx/4.9.0/lto1 which should keep them separate in case a target can be used both as a normal target and as an accelerator. 3. Some machinery is added to build the accelerator gcc directly in the same tree as the host compiler, in a separate "accel-gcc" subdir. This works for nvptx because that target doesn't even want to build a libgcc. It may not be suitable for other accelerators if they want to build target libraries, but otherwise I think it would be a nice convenience. However, building separately should work fine as well as long as the right options are used for configuring all the involved compilers. 4. We add a vector of target machines to the compiler. Normally this is just initialized to the single machine for which the compiler is configured, but when e.g. OpenACC with an accelerator is enabled, the accelerator machine is added to that list. It should cope fine with multiple different accelerator devices. 5. There's a new DECL_TARGET which refers to this list of target machines. It's set when creating a child function from e.g. "#pragma acc parallel" 6. ipa_write_summaries iterates over DECL_TARGET machines to write out LTO for each of them. LTO sections for a different target get a separate prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...". 7. lto_wrapper recognizes them and calls the various gcc drivers as needed. This is where the series ends, and this step is still incomplete. As mentioned, this patch series is still incomplete and has rough edges, but I hope it will generate discussion. Further details that will need to be addressed are (among others) option handling between compilers for different targets, and slightly rewriting the incoming gimple to be valid for the target (nvptx requires variables to go into various different address spaces). The patches I'll send assume that the present patch from this thread has been reverted, but otherwise they should apply to current gomp-4_0-branch. Thoughts, comments? Does anyone have a good name for these accelerator targets or output targets, something that avoids the overloaded word "target" (I was thinking "destination machine" maybe)? Bernd
On Fri, Nov 29, 2013 at 1:17 PM, Bernd Schmidt <bernds@codesourcery.com> wrote: > On 11/20/2013 10:36 AM, Jakub Jelinek wrote: >> On Wed, Nov 20, 2013 at 10:34:30AM +0100, Richard Biener wrote: >>> On Tue, Nov 19, 2013 at 10:58 AM, Ilya Tocar <tocarip.intel@gmail.com> wrote: >>>> On 14 Nov 11:27, Richard Biener wrote: >>>>>> + /* Set when symbol needs to be dumped for lto/offloading. */ >>>>>> + unsigned need_dump : 1; >>>>>> + >>>>> >>>>> That's very non-descriptive. What's "offloading"? But yes, something >>>>> like this is what I was asking for. >>>> >>>> I've changed it into: >>>> Set when symbol needs to be dumped into LTO bytecode for LTO, >>>> or in pragma omp target case, for separate compilation targeting >>>> a different architecture. >>>> >>>> Ok for gomp4 branch now? >>> >>> Works for me. I'll let branch maintainers decide if it follows whatever >>> is done there (I haven't found time to follow stuff here). >> >> Ok then. > > We've been working on similar patches for our OpenACC project. The goal > is to have functions generated during omp-low that will ultimately > execute on a ptx target, write them out using LTO infrastructure and > read them back in using a nvptx-none lto1. > > Unforunately, with multiple teams working in the same area there's > obviously going to be some measure of duplication. What I'd like to do > is to post a snapshot of what I currently have, to show the general > ideas and hopefully get some discussion of what the final picture should > look like. The next few mails in reply to this one will contain patches > that work towards the following general outline. I've been trying to > keep this flexible enough so that it won't be suitable just for the > OpenACC work but for whatever else people want to achieve in this area. > > 1. New configure options are added, --enable-accelerator and > --enable-as-accelerator-for. The names are certainly up for discussion. > These allow the compiler to know which target combinations are > available. The host compiler will be configured with > --enable-accelerator, and the offload/accelerator compiler is configured > with both options (mostly to ensure they both agree on the spelling of > the accelerator target name). > 2. Using --enable-as-accelerator-for= changes the install paths, so that > the accelerator compilers end up in (for example) > bin/x86_64-linux-gnu-accel-nvptx-gcc-4.9.0 > libexec/x86_64-linux-gnu/accel/nvptx/4.9.0/lto1 > which should keep them separate in case a target can be used both as a > normal target and as an accelerator. > 3. Some machinery is added to build the accelerator gcc directly in the > same tree as the host compiler, in a separate "accel-gcc" subdir. This > works for nvptx because that target doesn't even want to build a libgcc. > It may not be suitable for other accelerators if they want to build > target libraries, but otherwise I think it would be a nice convenience. > However, building separately should work fine as well as long as the > right options are used for configuring all the involved compilers. > 4. We add a vector of target machines to the compiler. Normally this is > just initialized to the single machine for which the compiler is > configured, but when e.g. OpenACC with an accelerator is enabled, the > accelerator machine is added to that list. It should cope fine with > multiple different accelerator devices. > 5. There's a new DECL_TARGET which refers to this list of target > machines. It's set when creating a child function from e.g. "#pragma acc > parallel" > 6. ipa_write_summaries iterates over DECL_TARGET machines to write out > LTO for each of them. LTO sections for a different target get a separate > prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...". > 7. lto_wrapper recognizes them and calls the various gcc drivers as > needed. This is where the series ends, and this step is still incomplete. > > As mentioned, this patch series is still incomplete and has rough edges, > but I hope it will generate discussion. Further details that will need > to be addressed are (among others) option handling between compilers for > different targets, and slightly rewriting the incoming gimple to be > valid for the target (nvptx requires variables to go into various > different address spaces). > > The patches I'll send assume that the present patch from this thread has > been reverted, but otherwise they should apply to current gomp-4_0-branch. > > Thoughts, comments? Does anyone have a good name for these accelerator > targets or output targets, something that avoids the overloaded word > "target" (I was thinking "destination machine" maybe)? Note that we (SUSE/AMD) sofar think we can go an easier route, not adding a real backend that targets HSAIL/BRIG but instead use a custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA based register allocator). Which if course simplifies driving this a bit as we don't need to write/read any GIMPLE. The idea is of course that the "highlevel" target languages, being it HSAIL/BRIG or PTX run through another compiler + optimizer anyway, so machine specific optimization is not necessary (fingers crossing...). Not sure if anybody announced it yet (but gcc-cvs readers may have noticed), there is a 'hsa' branch in svn covering work done sofar (see gcc/README.hsa for how to use it). Richard. > > Bernd >
On 11/29/2013 01:36 PM, Richard Biener wrote: > Note that we (SUSE/AMD) sofar think we can go an easier route, not > adding a real backend that targets HSAIL/BRIG but instead use a > custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA > based register allocator). Which if course simplifies driving this a bit > as we don't need to write/read any GIMPLE. > > The idea is of course that the "highlevel" target languages, being it > HSAIL/BRIG or PTX run through another compiler + optimizer anyway, > so machine specific optimization is not necessary (fingers crossing...). > > Not sure if anybody announced it yet (but gcc-cvs readers may have > noticed), there is a 'hsa' branch in svn covering work done sofar > (see gcc/README.hsa for how to use it). That's also an interesting idea. Did you resurrect the gimple-backend branch that I think existed a while ago? I'm not sure ptx is really high-level enough for that approach to work well though. And gimple looks different for x86 and ptx due to the use of address spaces, so I have doubts whether such an approach would be suitable. Bernd
On Fri, Nov 29, 2013 at 1:50 PM, Bernd Schmidt <bernds@codesourcery.com> wrote: > On 11/29/2013 01:36 PM, Richard Biener wrote: >> Note that we (SUSE/AMD) sofar think we can go an easier route, not >> adding a real backend that targets HSAIL/BRIG but instead use a >> custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA >> based register allocator). Which if course simplifies driving this a bit >> as we don't need to write/read any GIMPLE. >> >> The idea is of course that the "highlevel" target languages, being it >> HSAIL/BRIG or PTX run through another compiler + optimizer anyway, >> so machine specific optimization is not necessary (fingers crossing...). >> >> Not sure if anybody announced it yet (but gcc-cvs readers may have >> noticed), there is a 'hsa' branch in svn covering work done sofar >> (see gcc/README.hsa for how to use it). > > That's also an interesting idea. Did you resurrect the gimple-backend > branch that I think existed a while ago? No - I didn't even know there was one ;) I know of the gimple-frontend branch. > I'm not sure ptx is really high-level enough for that approach to work > well though. And gimple looks different for x86 and ptx due to the use > of address spaces, so I have doubts whether such an approach would be > suitable. We'll see - it's good to have both variants tried. I doubt HSAIL is more high-level than PTX though, but the unified address space you have with the HSA framework certainly simplifies things (not that there isn't talks about optional different memory spaces in the HSA specs ...). Richard. > > Bernd >
Hello Bernd, On 29 Nov 13:17, Bernd Schmidt wrote: > 5. There's a new DECL_TARGET which refers to this list of target > machines. It's set when creating a child function from e.g. "#pragma acc > parallel" Actually, I do not understand, what term `target machine' means here. Are you talking about to target toolchain (target compiler, assembler, linker, libraries etc)? > 6. ipa_write_summaries iterates over DECL_TARGET machines to write out > LTO for each of them. LTO sections for a different target get a separate > prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...". Why we want separate sections for different targets? As far as I understand this is going to be generic Gimple, which should be identical to PTX, MIC etc. We cannot use target built-ins inside such a common regions, right? I also think it worst saying that currently we're working on passing of omp_target sections to target compiler (we call it `streaming in') so we can produce target objects from lto sections containing IR marked to be `target'. Multiple targets are handled by means of dedicated targets descriptor, containing vector of target compilers which will be executed on given sections one-by-one producing set of objects for every target. This sections are not related on exact target, as I mentioned above. We're also working on generation of dedicated tables which will be needed for host<->target address mapping (see Jakub's mails on the subject). Hope to post initial versions nearest wws. -- Thanks, K
On 11/29/2013 02:05 PM, Kirill Yukhin wrote: > On 29 Nov 13:17, Bernd Schmidt wrote: >> 5. There's a new DECL_TARGET which refers to this list of target >> machines. It's set when creating a child function from e.g. "#pragma acc >> parallel" > Actually, I do not understand, what term `target machine' means here. > Are you talking about to target toolchain (target compiler, assembler, linker, > libraries etc)? The idea is that if an x86-linux toolchain is configured with --enable-accelerator=nvptx, there would be two machines in the list - the normal host, x86-linux, and nvptx. We can directly generate code for the normal host, and everything else goes through lto writeout -> lto_wrapper -> target gcc/lto1. >> 6. ipa_write_summaries iterates over DECL_TARGET machines to write out >> LTO for each of them. LTO sections for a different target get a separate >> prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...". > Why we want separate sections for different targets? As far as I understand > this is going to be generic Gimple, which should be identical to PTX, MIC etc. > We cannot use target built-ins inside such a common regions, right? But they need to be read in by a different compiler and fed to the appropriate nvptx-none lto1. I imagine the easiest way to do this is to encode the name in the section and then call multiple different gcc frontends from lto-wrapper, so that's what I've been working towards. > I also think it worst saying that currently we're working on passing of omp_target > sections to target compiler (we call it `streaming in') so we can produce target > objects from lto sections containing IR marked to be `target'. > Multiple targets are handled by means of dedicated targets descriptor, containing vector of > target compilers which will be executed on given sections one-by-one producing set > of objects for every target. > This sections are not related on exact target, as I mentioned above. None of this has been posted yet, correct? If it has, can you point me at the right place in the archive? Bernd
On Fri, Nov 29, 2013 at 01:36:48PM +0100, Richard Biener wrote: > > Thoughts, comments? Does anyone have a good name for these accelerator > > targets or output targets, something that avoids the overloaded word > > "target" (I was thinking "destination machine" maybe)? I think offload is best word here. > Note that we (SUSE/AMD) sofar think we can go an easier route, not > adding a real backend that targets HSAIL/BRIG but instead use a > custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA > based register allocator). Which if course simplifies driving this a bit > as we don't need to write/read any GIMPLE. > > The idea is of course that the "highlevel" target languages, being it > HSAIL/BRIG or PTX run through another compiler + optimizer anyway, > so machine specific optimization is not necessary (fingers crossing...). > > Not sure if anybody announced it yet (but gcc-cvs readers may have > noticed), there is a 'hsa' branch in svn covering work done sofar > (see gcc/README.hsa for how to use it). But you probably don't want to translate GIMPLE right out of IPA into HSAIL/BRIG, do you? And various further passes depend already (well, also the early ones a little bit, but that is something to fix) heavily on targetm.* and target macros, so do you plan to switch targetm to something else and compile again a subset of functions for the HSAIL target? Otherwise, how could you e.g. vectorize code (assuming HSAIL has vector support)? Jakub
On Fri, Nov 29, 2013 at 01:17:56PM +0100, Bernd Schmidt wrote: > We've been working on similar patches for our OpenACC project. The goal > is to have functions generated during omp-low that will ultimately > execute on a ptx target, write them out using LTO infrastructure and > read them back in using a nvptx-none lto1. Please see the past threads about this topic, e.g. "Questions about LTO infrastructure and pragma omp target" thread on gcc ml from August till now, also "Offloading Support in libgomp" and "Target compilation for offloading" It certainly doesn't make sense to invent different infrastructures for OpenMP offloading and for OpenACC offloading, after all, the current OpenACC code on gomp-4_0-branch I think meant to use the libgomp APIs. > 4. We add a vector of target machines to the compiler. Normally this is > just initialized to the single machine for which the compiler is > configured, but when e.g. OpenACC with an accelerator is enabled, the > accelerator machine is added to that list. It should cope fine with > multiple different accelerator devices. This was discussed that it would be nice to allow users during linking (or compilation already?) to choose for which offloading targets code should be compiled, and have a mechanism to use original non-target specific options + have a way to override those for the offloading target. > 5. There's a new DECL_TARGET which refers to this list of target > machines. It's set when creating a child function from e.g. "#pragma acc > parallel" > 6. ipa_write_summaries iterates over DECL_TARGET machines to write out Right now on gomp-4_0-branch a special attribute on the decls (VAR_DECL as well as FUNCTION_DECL) is used for these, but if there are spare bits, something else could be used instead. > LTO for each of them. LTO sections for a different target get a separate > prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...". As you want to dump the GIMPLE IL right out of ~ IPA stage, it should in theory be target independent, so it is undesirable to emit it several times for each offloading target. Instead just stream once and let during linking decide what to support. Jakub
On 11/29/2013 04:16 PM, Jakub Jelinek wrote: > As you want to dump the GIMPLE IL right out of ~ IPA stage, it should in > theory be target independent, so it is undesirable to emit it several times > for each offloading target. That's not what happens. It's just partitioned into disjoint sets, one for each target, which is then written out with the target name encoded in the section name. Bernd
On Fri, Nov 29, 2013 at 05:57:25PM +0100, Bernd Schmidt wrote: > On 11/29/2013 04:16 PM, Jakub Jelinek wrote: > > As you want to dump the GIMPLE IL right out of ~ IPA stage, it should in > > theory be target independent, so it is undesirable to emit it several times > > for each offloading target. > > That's not what happens. It's just partitioned into disjoint sets, one > for each target, which is then written out with the target name encoded > in the section name. But why? Does OpenACC have some way to say, this is to be offloaded for offloading target XYZ? In OpenMP 4.0, you have just host (initial device) code and then some code that is either offloaded if possible, or not (thus, either code/variables are emitted only for the primary target, no offloading, or both for primary target and offloading). But the tought was to stream the generic GIMPLE at IPA time for that (target independent) and only choose later on (during linking) for which offloading targets you actually want to compile it (if any). Jakub
On 11/29/2013 06:03 PM, Jakub Jelinek wrote: > On Fri, Nov 29, 2013 at 05:57:25PM +0100, Bernd Schmidt wrote: >> On 11/29/2013 04:16 PM, Jakub Jelinek wrote: >>> As you want to dump the GIMPLE IL right out of ~ IPA stage, it should in >>> theory be target independent, so it is undesirable to emit it several times >>> for each offloading target. >> >> That's not what happens. It's just partitioned into disjoint sets, one >> for each target, which is then written out with the target name encoded >> in the section name. > > But why? Does OpenACC have some way to say, this is to be offloaded for > offloading target XYZ? In OpenMP 4.0, you have just host (initial device) > code and then some code that is either offloaded if possible, or not > (thus, either code/variables are emitted only for the primary target, no > offloading, or both for primary target and offloading). But the tought was > to stream the generic GIMPLE at IPA time for that (target independent) and > only choose later on (during linking) for which offloading targets you > actually want to compile it (if any). By what mechanism do you choose? This is unclear to me from what I've seen. Does this involve user action, and what's the advantage of doing it this way? The model I was imagining is that we choose the OpenACC target at configure time and things happen automatically from there on. Bernd
On Fri, Nov 29, 2013 at 06:07:38PM +0100, Bernd Schmidt wrote: > By what mechanism do you choose? This is unclear to me from what I've > seen. Does this involve user action, and what's the advantage of doing > it this way? See the 3 threads I've mentioned. The compiler would know the list of available offloading targets (after all, it needs to build libgomp plugins for those targets), and that would be the default, and user could override that through link time command line options (say, ok, while gcc has been configured to support all of hsail-none, ptx-none and x86_64-k10m-linux offloading targets, I only want to support here one of those, and please use these additional options for compilation of that target). Jakub
On Fri, Nov 29, 2013 at 4:03 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Fri, Nov 29, 2013 at 01:36:48PM +0100, Richard Biener wrote: >> > Thoughts, comments? Does anyone have a good name for these accelerator >> > targets or output targets, something that avoids the overloaded word >> > "target" (I was thinking "destination machine" maybe)? > > I think offload is best word here. > >> Note that we (SUSE/AMD) sofar think we can go an easier route, not >> adding a real backend that targets HSAIL/BRIG but instead use a >> custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA >> based register allocator). Which if course simplifies driving this a bit >> as we don't need to write/read any GIMPLE. >> >> The idea is of course that the "highlevel" target languages, being it >> HSAIL/BRIG or PTX run through another compiler + optimizer anyway, >> so machine specific optimization is not necessary (fingers crossing...). >> >> Not sure if anybody announced it yet (but gcc-cvs readers may have >> noticed), there is a 'hsa' branch in svn covering work done sofar >> (see gcc/README.hsa for how to use it). > > But you probably don't want to translate GIMPLE right out of IPA into > HSAIL/BRIG, do you? Actually we do currently. > And various further passes depend already (well, also > the early ones a little bit, but that is something to fix) heavily on > targetm.* and target macros, so do you plan to switch targetm to something > else and compile again a subset of functions for the HSAIL target? > Otherwise, how could you e.g. vectorize code (assuming HSAIL has vector > support)? Yeah, we seem to run host specific passes before HSAIL/BRIG generation so we have to address this somehow. Richard. > > Jakub
On 11/29/2013 06:12 PM, Jakub Jelinek wrote: > On Fri, Nov 29, 2013 at 06:07:38PM +0100, Bernd Schmidt wrote: >> By what mechanism do you choose? This is unclear to me from what I've >> seen. Does this involve user action, and what's the advantage of doing >> it this way? > > See the 3 threads I've mentioned. The compiler would know the list of > available offloading targets (after all, it needs to build libgomp plugins > for those targets), and that would be the default, and user could override > that through link time command line options (say, ok, while gcc has been > configured to support all of hsail-none, ptx-none and x86_64-k10m-linux > offloading targets, I only want to support here one of those, and > please use these additional options for compilation of that target). Ok, IIUC the model is that we just compile all target code for all targets (or a subset of them). Is that correct? In that case I can see how the code that's on the branch now is sufficient; I'd assumed something more fine-grained would be desirable. It would be helpful to see the other pieces of this work if they already exist. Bernd
Hi Bernd, I am working on offloading support for OpenMP4, so I'll try to share my vision of how everything works and answer your questions. GCC compiles host version of the code (as usual) and dumps Gimple, as it does for LTO, but for offloading. Gimple IR is stored only for functions/variables that have target versions - for OpenMP that means the functions/variables under 'omp declare target' pragma and outlined functions from 'omp target' regions. Dumped gimple is then picked up by lto-plugin, which performs compilations for all specified targets (specified either in configure or by compilation flags). (Note that though we use lto-infrastructure, it possible that we don't do any link-time optimizations.) Compiled target images are then embedded into a data-sections of the host binary, and are registered in a global for all targets descriptor. This descriptor is basically an array of pointers to beginning of the images and its sizes. Along with pointers to images, the descriptor contains pointer to functions/globals tables and some other data. Function tables are used to find the correspondence between host and target versions of functions, that could be offloaded. To compile code for target, lto-wrapper calls target compiler, giving it fat-object with dumped gimple as an input. This target compiler could actually be anything that understands Gimple as its input and produces a target image. (Patch for this part hasn't been submitted yet, but I hope to do it in a near future). Another big part of the entire picture is libgomp - a runtime for supporting all this infrastructure. To support different targets, we added plugin-support to libgomp. Thus, libgomp operates in 'general' terms and calls plugin-hooks, when it needs to do anything target-specific (examples of such hooks are: device_run_function, device_init_device, etc.). So, for each target we need to have two important pieces: 1) compiler, that accepts gimple as its input and produces target image 2) plugin for libgomp, which will implement all device-specific part That's just a general overview, so I could explain some parts in more details if you need. Not everything has already been implemented yet, here is the current status (also, very general and maybe rough): - Plugins support in libgomp. Done in gomp4-branch. - Streaming gimple IR into dedicated sections for later use by target compilers. Done in gomp4-branch. - Invoking target compilers and embedding target images into host binary. Work in progress, hope to send the patch soon. - Configure options, compile options for specifying offloading targets. Work in progress. Thanks, Michael On 03 Dec 14:00, Bernd Schmidt wrote: > On 11/29/2013 06:12 PM, Jakub Jelinek wrote: > > On Fri, Nov 29, 2013 at 06:07:38PM +0100, Bernd Schmidt wrote: > >> By what mechanism do you choose? This is unclear to me from what I've > >> seen. Does this involve user action, and what's the advantage of doing > >> it this way? > > > > See the 3 threads I've mentioned. The compiler would know the list of > > available offloading targets (after all, it needs to build libgomp plugins > > for those targets), and that would be the default, and user could override > > that through link time command line options (say, ok, while gcc has been > > configured to support all of hsail-none, ptx-none and x86_64-k10m-linux > > offloading targets, I only want to support here one of those, and > > please use these additional options for compilation of that target). > > Ok, IIUC the model is that we just compile all target code for all > targets (or a subset of them). Is that correct? In that case I can see > how the code that's on the branch now is sufficient; I'd assumed > something more fine-grained would be desirable. > It would be helpful to see the other pieces of this work if they already > exist. > > > Bernd >
diff --git a/gcc/cgraph.h b/gcc/cgraph.h index fb0fe93..9f799f4 100644 --- a/gcc/cgraph.h +++ b/gcc/cgraph.h @@ -105,6 +105,11 @@ public: /* Set when symbol has address taken. */ unsigned address_taken : 1; + /* Set when symbol needs to be dumped into LTO bytecode for LTO, + or in pragma omp target case, for separate compilation targeting + a different architecture. */ + unsigned need_dump : 1; + /* Ordering of all symtab entries. */ int order; diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c index c3a8967..53cd250 100644 --- a/gcc/cgraphunit.c +++ b/gcc/cgraphunit.c @@ -2019,7 +2019,18 @@ ipa_passes (void) passes->all_lto_gen_passes); if (!in_lto_p) - ipa_write_summaries (); + { + if (flag_openmp) + { + section_name_prefix = OMP_SECTION_NAME_PREFIX; + ipa_write_summaries (true); + } + if (flag_lto) + { + section_name_prefix = LTO_SECTION_NAME_PREFIX; + ipa_write_summaries (false); + } + } if (flag_generate_lto) targetm.asm_out.lto_end (); @@ -2110,7 +2121,7 @@ compile (void) cgraph_state = CGRAPH_STATE_IPA; /* If LTO is enabled, initialize the streamer hooks needed by GIMPLE. */ - if (flag_lto) + if (flag_lto || flag_openmp) lto_streamer_hooks_init (); /* Don't run the IPA passes if there was any error or sorry messages. */ diff --git a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c index 4458723..62faa52 100644 --- a/gcc/ipa-inline-analysis.c +++ b/gcc/ipa-inline-analysis.c @@ -3813,7 +3813,7 @@ inline_generate_summary (void) /* When not optimizing, do not bother to analyze. Inlining is still done because edge redirection needs to happen there. */ - if (!optimize && !flag_lto && !flag_wpa) + if (!optimize && !flag_lto && !flag_wpa && !flag_openmp) return; function_insertion_hook_holder = diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c index 6a52da8..697c069 100644 --- a/gcc/lto-cgraph.c +++ b/gcc/lto-cgraph.c @@ -238,6 +238,9 @@ void lto_set_symtab_encoder_in_partition (lto_symtab_encoder_t encoder, symtab_node *node) { + /* Ignore not needed nodes. */ + if (!node->need_dump) + return; int index = lto_symtab_encoder_encode (encoder, node); encoder->nodes[index].in_partition = true; } @@ -751,6 +754,17 @@ add_references (lto_symtab_encoder_t encoder, lto_symtab_encoder_encode (encoder, ref->referred); } +/* Select what needs to be dumped. In lto case dump everything. + In omp target case only dump stuff makrked with attribute. */ +void +select_what_to_dump (bool is_omp) +{ + struct symtab_node *snode; + FOR_EACH_SYMBOL(snode) + snode->need_dump = !is_omp || lookup_attribute ("omp declare target", + DECL_ATTRIBUTES (snode->decl)); +} + /* Find all symbols we want to stream into given partition and insert them to encoders. diff --git a/gcc/lto-streamer.c b/gcc/lto-streamer.c index 1540e4c..ffafb0e 100644 --- a/gcc/lto-streamer.c +++ b/gcc/lto-streamer.c @@ -43,6 +43,7 @@ struct lto_stats_d lto_stats; static bitmap_obstack lto_obstack; static bool lto_obstack_initialized; +const char *section_name_prefix = LTO_SECTION_NAME_PREFIX; /* Return a string representing LTO tag TAG. */ @@ -172,7 +173,7 @@ lto_get_section_name (int section_type, const char *name, struct lto_file_decl_d sprintf (post, "." HOST_WIDE_INT_PRINT_HEX_PURE, f->id); else sprintf (post, "." HOST_WIDE_INT_PRINT_HEX_PURE, get_random_seed (false)); - return concat (LTO_SECTION_NAME_PREFIX, sep, add, post, NULL); + return concat (section_name_prefix, sep, add, post, NULL); } @@ -310,7 +311,7 @@ lto_streamer_init (void) bool gate_lto_out (void) { - return ((flag_generate_lto || in_lto_p) + return ((flag_generate_lto || in_lto_p || flag_openmp) /* Don't bother doing anything if the program has errors. */ && !seen_error ()); } diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h index 797e92e..f4c46db 100644 --- a/gcc/lto-streamer.h +++ b/gcc/lto-streamer.h @@ -139,6 +139,11 @@ along with GCC; see the file COPYING3. If not see name for the functions and static_initializers. For other types of sections a '.' and the section type are appended. */ #define LTO_SECTION_NAME_PREFIX ".gnu.lto_" +#define OMP_SECTION_NAME_PREFIX ".gnu.target_lto_" + +/* Can be either OMP_SECTION_NAME_PREFIX when we stream pragma omp target + stuff, or LTO_SECTION_NAME_PREFIX for lto case. */ +extern const char *section_name_prefix; #define LTO_major_version 2 #define LTO_minor_version 2 @@ -895,6 +900,7 @@ bool referenced_from_this_partition_p (struct ipa_ref_list *, bool reachable_from_this_partition_p (struct cgraph_node *, lto_symtab_encoder_t); lto_symtab_encoder_t compute_ltrans_boundary (lto_symtab_encoder_t encoder); +void select_what_to_dump (bool); /* In lto-symtab.c. */ diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c index 6a3d881..2d2aa63 100644 --- a/gcc/lto/lto-partition.c +++ b/gcc/lto/lto-partition.c @@ -190,6 +190,7 @@ add_symbol_to_partition_1 (ltrans_partition part, symtab_node *node) gcc_assert (c != SYMBOL_EXTERNAL && (c == SYMBOL_DUPLICATE || !symbol_partitioned_p (node))); + node->need_dump = true; lto_set_symtab_encoder_in_partition (part->encoder, node); if (symbol_partitioned_p (node)) @@ -917,6 +918,8 @@ lto_promote_cross_file_statics (void) gcc_assert (flag_wpa); + select_what_to_dump (false); + /* First compute boundaries. */ n_sets = ltrans_partitions.length (); for (i = 0; i < n_sets; i++) diff --git a/gcc/passes.c b/gcc/passes.c index 19e5869..88b1538 100644 --- a/gcc/passes.c +++ b/gcc/passes.c @@ -2335,7 +2335,7 @@ ipa_write_summaries_1 (lto_symtab_encoder_t encoder) /* Write out summaries for all the nodes in the callgraph. */ void -ipa_write_summaries (void) +ipa_write_summaries (bool is_omp) { lto_symtab_encoder_t encoder; int i, order_pos; @@ -2343,9 +2343,11 @@ ipa_write_summaries (void) struct cgraph_node *node; struct cgraph_node **order; - if (!flag_generate_lto || seen_error ()) + if (!(flag_generate_lto || flag_openmp) || seen_error () ) return; + select_what_to_dump (is_omp); + encoder = lto_symtab_encoder_new (false); /* Create the callgraph set in the same order used in diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index fa403c7..8d51d80 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -595,7 +595,7 @@ extern void pass_fini_dump_file (struct opt_pass *); extern const char *get_current_pass_name (void); extern void print_current_pass (FILE *); extern void debug_pass (void); -extern void ipa_write_summaries (void); +extern void ipa_write_summaries (bool is_omp); extern void ipa_write_optimization_summaries (struct lto_symtab_encoder_d *); extern void ipa_read_summaries (void); extern void ipa_read_optimization_summaries (void);