Message ID | Zc29VHpxUFCDu2Yq@tucnak |
---|---|
State | New |
Headers | show |
Series | icf: Reset SSA_NAME_{PTR,RANGE}_INFO in successfully merged functions [PR113907] | expand |
Hi! On Thu, Feb 15, 2024 at 08:29:24AM +0100, Jakub Jelinek wrote: > 2024-02-15 Jakub Jelinek <jakub@redhat.com> > > PR middle-end/113907 > * ipa-icf.cc (sem_item_optimizer::merge_classes): Reset > SSA_NAME_RANGE_INFO and SSA_NAME_PTR_INFO on successfully ICF merged > functions. > > * gcc.dg/pr113907.c: New test. I'd like to ping the https://gcc.gnu.org/pipermail/gcc-patches/2024-February/645644.html patch. After looking at this PR again yesterday, I'm convinced we don't need either this patch or some other patch to deal with the jump functions, but both, this patch to clear (or other variant would be to union them) SSA_NAME_RANGE_INFO in the bodies of IPA-ICF merged functions, and another patch that would maybe in sem_function::merge go through all the callees cgraph edges and for each of them attempt to merge (for value range union) at least the value range/pointer alignment information from the corresponding cgraph edge from alias for all jump functions of all the arguments. Bootstrapped/regtested again last night. > --- gcc/ipa-icf.cc.jj 2024-02-14 14:26:11.101933914 +0100 > +++ gcc/ipa-icf.cc 2024-02-14 16:49:35.141518117 +0100 > @@ -3396,6 +3397,7 @@ sem_item_optimizer::merge_classes (unsig > continue; > > sem_item *source = c->members[0]; > + bool this_merged_p = false; > > if (DECL_NAME (source->decl) > && MAIN_NAME_P (DECL_NAME (source->decl))) > @@ -3443,7 +3445,7 @@ sem_item_optimizer::merge_classes (unsig > if (dbg_cnt (merged_ipa_icf)) > { > bool merged = source->merge (alias); > - merged_p |= merged; > + this_merged_p |= merged; > > if (merged && alias->type == VAR) > { > @@ -3452,6 +3454,35 @@ sem_item_optimizer::merge_classes (unsig > } > } > } > + > + merged_p |= this_merged_p; > + if (this_merged_p > + && source->type == FUNC > + && (!flag_wpa || flag_checking)) > + { > + unsigned i; > + tree name; > + FOR_EACH_SSA_NAME (i, name, DECL_STRUCT_FUNCTION (source->decl)) > + { > + /* We need to either merge or reset SSA_NAME_*_INFO. > + For merging we don't preserve the mapping between > + original and alias SSA_NAMEs from successful equals > + calls. */ > + if (POINTER_TYPE_P (TREE_TYPE (name))) > + { > + if (SSA_NAME_PTR_INFO (name)) > + { > + gcc_checking_assert (!flag_wpa); > + SSA_NAME_PTR_INFO (name) = NULL; > + } > + } > + else if (SSA_NAME_RANGE_INFO (name)) > + { > + gcc_checking_assert (!flag_wpa); > + SSA_NAME_RANGE_INFO (name) = NULL; > + } > + } > + } > } > > if (!m_merged_variables.is_empty ()) > --- gcc/testsuite/gcc.dg/pr113907.c.jj 2024-02-14 16:13:48.486555159 +0100 > +++ gcc/testsuite/gcc.dg/pr113907.c 2024-02-14 16:13:29.198825045 +0100 > @@ -0,0 +1,49 @@ > +/* PR middle-end/113907 */ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > +/* { dg-additional-options "-minline-all-stringops" { target i?86-*-* x86_64-*-* } } */ > + > +static inline int > +foo (int len, void *indata, void *outdata) > +{ > + if (len < 0 || (len & 7) != 0) > + return 0; > + if (len != 0 && indata != outdata) > + __builtin_memcpy (outdata, indata, len); > + return len; > +} > + > +static inline int > +bar (int len, void *indata, void *outdata) > +{ > + if (len < 0 || (len & 1) != 0) > + return 0; > + if (len != 0 && indata != outdata) > + __builtin_memcpy (outdata, indata, len); > + return len; > +} > + > +int (*volatile p1) (int, void *, void *) = foo; > +int (*volatile p2) (int, void *, void *) = bar; > + > +__attribute__((noipa)) int > +baz (int len, void *indata, void *outdata) > +{ > + if ((len & 6) != 0) > + bar (len, indata, outdata); > + else > + foo (len, indata, outdata); > +} > + > +struct S { char buf[64]; } s __attribute__((aligned (8))); > + > +int > +main () > +{ > + for (int i = 0; i < 64; ++i) > + s.buf[i] = ' ' + i; > + p2 (2, s.buf, s.buf + 33); > + for (int i = 0; i < 64; ++i) > + if (s.buf[i] != ' ' + ((i >= 33 && i < 35) ? i - 33 : i)) > + __builtin_abort (); > +} Jakub
> Hi! Hi, > > On Thu, Feb 15, 2024 at 08:29:24AM +0100, Jakub Jelinek wrote: > > 2024-02-15 Jakub Jelinek <jakub@redhat.com> > > > > PR middle-end/113907 > > * ipa-icf.cc (sem_item_optimizer::merge_classes): Reset > > SSA_NAME_RANGE_INFO and SSA_NAME_PTR_INFO on successfully ICF merged > > functions. > > > > * gcc.dg/pr113907.c: New test. > > I'd like to ping the > https://gcc.gnu.org/pipermail/gcc-patches/2024-February/645644.html > patch. > After looking at this PR again yesterday, I'm convinced we don't need > either this patch or some other patch to deal with the jump functions, > but both, this patch to clear (or other variant would be to union them) > SSA_NAME_RANGE_INFO in the bodies of IPA-ICF merged functions, and another > patch that would maybe in sem_function::merge go through all the > callees cgraph edges and for each of them attempt to merge (for value > range union) at least the value range/pointer alignment information from > the corresponding cgraph edge from alias for all jump functions of all > the arguments. > > Bootstrapped/regtested again last night. I am sorry for delaying this. I made the variant that simply compares value range of functions and prevents merging if they diverge and wanted to make some bigger statistics. This made me notice some performance problems on clang performance and libstdc++ RB-trees which disrailed me from the original PR. I will finish the statistics today. For next stage1 we definitly want to move ahead with merging metadata (not only value ranges, but hopefully also TBAA). Currently the only thing we merge aggressively is the edge profile (and we have PR on that merging functions which differs only but likely/unlikely attribute). However for backportability and for this avoiding merging may be safer solution depending on the stats.A I was looking into ipa-vrp useability and there are several tests on Firefox talos where ipa-prop VRP makes difference. It boils down to propagating fact that parameter is alway snon-NULL. This is commonly tested in C++ casts. Honza
On Tue, Mar 12, 2024 at 10:46:42AM +0100, Jan Hubicka wrote: > I am sorry for delaying this. I made the variant that simply compares > value range of functions and prevents merging if they diverge and wanted > to make some bigger statistics. This made me notice some performance > problems on clang performance and libstdc++ RB-trees which disrailed me > from the original PR. I will finish the statistics today. With the posted patch, perhaps if we don't want to union jump_tables etc., all we could punt on is differences in the jump_table VRs rather than just any SSA_NAME_RANGE_INFO differences. But I agree that for GCC 15 we really want to merge rather than punt. > For next stage1 we definitly want to move ahead with merging metadata > (not only value ranges, but hopefully also TBAA). Currently the only > thing we merge aggressively is the edge profile (and we have PR on that > merging functions which differs only but likely/unlikely attribute). > However for backportability and for this avoiding merging may be safer > solution depending on the stats.A > > I was looking into ipa-vrp useability and there are several tests on > Firefox talos where ipa-prop VRP makes difference. It boils down to > propagating fact that parameter is alway snon-NULL. This is commonly > tested in C++ casts. Jakub
On Tue, Mar 12, 2024 at 05:21:58PM +0100, Jakub Jelinek wrote: > On Tue, Mar 12, 2024 at 10:46:42AM +0100, Jan Hubicka wrote: > > I am sorry for delaying this. I made the variant that simply compares > > value range of functions and prevents merging if they diverge and wanted > > to make some bigger statistics. This made me notice some performance > > problems on clang performance and libstdc++ RB-trees which disrailed me > > from the original PR. I will finish the statistics today. > > With the posted patch, perhaps if we don't want to union jump_tables etc., > all we could punt on is differences in the jump_table VRs rather than just > any SSA_NAME_RANGE_INFO differences. To expand on this, I think we need to either union or punt on jump_func differences in any case, because for LTO we can't really punt on SSA_NAME_RANGE_INFO differences given that we don't stream that out and in. So the ipa_jump_func are I think the only thing that actually can differ on the ICF merging candidates from value range POV. Jakub
On Tue, 12 Mar 2024, Jakub Jelinek wrote: > On Tue, Mar 12, 2024 at 05:21:58PM +0100, Jakub Jelinek wrote: > > On Tue, Mar 12, 2024 at 10:46:42AM +0100, Jan Hubicka wrote: > > > I am sorry for delaying this. I made the variant that simply compares > > > value range of functions and prevents merging if they diverge and wanted > > > to make some bigger statistics. This made me notice some performance > > > problems on clang performance and libstdc++ RB-trees which disrailed me > > > from the original PR. I will finish the statistics today. > > > > With the posted patch, perhaps if we don't want to union jump_tables etc., > > all we could punt on is differences in the jump_table VRs rather than just > > any SSA_NAME_RANGE_INFO differences. > > To expand on this, I think we need to either union or punt on jump_func > differences in any case, because for LTO we can't really punt on > SSA_NAME_RANGE_INFO differences given that we don't stream that out and in. > So the ipa_jump_func are I think the only thing that actually can differ > on the ICF merging candidates from value range POV. I agree. Btw, I would have approved the original patch in this thread that wipes SSA_NAME_INFO in merged bodies to mimic what LTO effectively does right now. That also looks most sensible to backport. But I'll defer to Honza in the end (but also want to point out we need something suitable for backporting). Richard.
> On Tue, 12 Mar 2024, Jakub Jelinek wrote: > > > On Tue, Mar 12, 2024 at 05:21:58PM +0100, Jakub Jelinek wrote: > > > On Tue, Mar 12, 2024 at 10:46:42AM +0100, Jan Hubicka wrote: > > > > I am sorry for delaying this. I made the variant that simply compares > > > > value range of functions and prevents merging if they diverge and wanted > > > > to make some bigger statistics. This made me notice some performance > > > > problems on clang performance and libstdc++ RB-trees which disrailed me > > > > from the original PR. I will finish the statistics today. > > > > > > With the posted patch, perhaps if we don't want to union jump_tables etc., > > > all we could punt on is differences in the jump_table VRs rather than just > > > any SSA_NAME_RANGE_INFO differences. > > > > To expand on this, I think we need to either union or punt on jump_func > > differences in any case, because for LTO we can't really punt on > > SSA_NAME_RANGE_INFO differences given that we don't stream that out and in. I noticed that yesterday too (I added my jump function testcase to testsuit and it fails with -flto, too). I implemented comparator for them too and run the stats. There was over 3000 functions in bootstrap where we run into differences in value-range and about 150k in LLVM build. Inspecting random examples shown that those are usually false positives (pair of functions that are different but triggers hash colision) caused by the fact that we do not hash PHI arguments, so code like int test (int a) { return a>0 ? CST1: CST2; } gets same hash value no matter what CST1/CST2 is. I added hasher and I am re-running stats. > > So the ipa_jump_func are I think the only thing that actually can differ > > on the ICF merging candidates from value range POV. > > I agree. Btw, I would have approved the original patch in this > thread that wipes SSA_NAME_INFO in merged bodies to mimic what LTO > effectively does right now. That also looks most sensible to > backport. > > But I'll defer to Honza in the end (but also want to point out we > need something suitable for backporting). My main worry here is that I tried to relax matching of IL metadata in the past and it triggers interesting problems. (I implemented TBAA merging and one needs to match additional things in summaries and loop structures) If value range differs at IPA analysis time, we need to be sure that we compare everything that possibly depends on it. So it is always safer to just compare more than try to merge. Which is what we do in all cases so far. Here, for the first time, is the problem is with LTO streaming missing the data though. Thinking more about it, I wonder if different value ranges can be exploited to cause different decisions about function being finite (confuse pure/const) or different outcome of alias analysis yielding to different aggregate jump functions (confusing ipa-prop). I will try to build testcases today. Honza > > Richard.
On Wed, 13 Mar 2024, Jan Hubicka wrote: > > On Tue, 12 Mar 2024, Jakub Jelinek wrote: > > > > > On Tue, Mar 12, 2024 at 05:21:58PM +0100, Jakub Jelinek wrote: > > > > On Tue, Mar 12, 2024 at 10:46:42AM +0100, Jan Hubicka wrote: > > > > > I am sorry for delaying this. I made the variant that simply compares > > > > > value range of functions and prevents merging if they diverge and wanted > > > > > to make some bigger statistics. This made me notice some performance > > > > > problems on clang performance and libstdc++ RB-trees which disrailed me > > > > > from the original PR. I will finish the statistics today. > > > > > > > > With the posted patch, perhaps if we don't want to union jump_tables etc., > > > > all we could punt on is differences in the jump_table VRs rather than just > > > > any SSA_NAME_RANGE_INFO differences. > > > > > > To expand on this, I think we need to either union or punt on jump_func > > > differences in any case, because for LTO we can't really punt on > > > SSA_NAME_RANGE_INFO differences given that we don't stream that out and in. > > I noticed that yesterday too (I added my jump function testcase to > testsuit and it fails with -flto, too). I implemented comparator for > them too and run the stats. There was over 3000 functions in bootstrap > where we run into differences in value-range and about 150k in LLVM > build. > > Inspecting random examples shown that those are usually false positives > (pair of functions that are different but triggers hash colision) caused > by the fact that we do not hash PHI arguments, so code like > > int test (int a) > { > return a>0 ? CST1: CST2; > } > > gets same hash value no matter what CST1/CST2 is. I added hasher and I > am re-running stats. The hash should be commutative here at least. > > > So the ipa_jump_func are I think the only thing that actually can differ > > > on the ICF merging candidates from value range POV. > > > > I agree. Btw, I would have approved the original patch in this > > thread that wipes SSA_NAME_INFO in merged bodies to mimic what LTO > > effectively does right now. That also looks most sensible to > > backport. > > > > But I'll defer to Honza in the end (but also want to point out we > > need something suitable for backporting). > > My main worry here is that I tried to relax matching of IL metadata in > the past and it triggers interesting problems. (I implemented TBAA > merging and one needs to match additional things in summaries and loop > structures) > > If value range differs at IPA analysis time, we need to be sure that we > compare everything that possibly depends on it. So it is always safer to > just compare more than try to merge. Which is what we do in all cases so > far. Here, for the first time, is the problem is with LTO streaming > missing the data though. > > Thinking more about it, I wonder if different value ranges can be > exploited to cause different decisions about function being finite > (confuse pure/const) or different outcome of alias analysis yielding to > different aggregate jump functions (confusing ipa-prop). The obvious thing would be range info making it possible to prove a stmt cannot trap, say for an array index or for arithmetic with -ftrapv. But I'm not sure that makes a differnce for pure/const-ness. Making something looping vs. non-looping should be easy though, just have range info for a loop with an unsigned IV that evolves like { 0, +, increment } and with a != exit condition where for some 'increment' we know we eventually reach equality but with some other we know we never do. But that just means pure/const state needs to be recomputed after merging? Or compared. Richard.
On Wed, Mar 13, 2024 at 10:55:07AM +0100, Jan Hubicka wrote: > > > So the ipa_jump_func are I think the only thing that actually can differ > > > on the ICF merging candidates from value range POV. > > > > I agree. Btw, I would have approved the original patch in this > > thread that wipes SSA_NAME_INFO in merged bodies to mimic what LTO > > effectively does right now. That also looks most sensible to > > backport. > > > > But I'll defer to Honza in the end (but also want to point out we > > need something suitable for backporting). > > My main worry here is that I tried to relax matching of IL metadata in > the past and it triggers interesting problems. (I implemented TBAA > merging and one needs to match additional things in summaries and loop > structures) The point of the patch is that it emulates what happens with LTO (though, does that only for successful ICF merges), because LTO streaming ignores SSA_NAME_{RANGE,PTR}_INFO. So, because with LTO all you have in the IL is the m_vr in jump_tables, pure/const analysis results, whatever else might be derived on the side from the range info, you need to punt or union all that information anyway, otherwise it will misbehave with LTO. So punting on SSA_NAME_{RANGE,PTR}_INFO differences instead of throwing it away means that non-LTO will get fewer ICF merges than LTO unnecessarily, it doesn't improve anything for the code correctness at least for the LTO case. Jakub
> On Wed, Mar 13, 2024 at 10:55:07AM +0100, Jan Hubicka wrote: > > > > So the ipa_jump_func are I think the only thing that actually can differ > > > > on the ICF merging candidates from value range POV. > > > > > > I agree. Btw, I would have approved the original patch in this > > > thread that wipes SSA_NAME_INFO in merged bodies to mimic what LTO > > > effectively does right now. That also looks most sensible to > > > backport. > > > > > > But I'll defer to Honza in the end (but also want to point out we > > > need something suitable for backporting). > > > > My main worry here is that I tried to relax matching of IL metadata in > > the past and it triggers interesting problems. (I implemented TBAA > > merging and one needs to match additional things in summaries and loop > > structures) > > The point of the patch is that it emulates what happens with LTO (though, > does that only for successful ICF merges), because LTO streaming ignores > SSA_NAME_{RANGE,PTR}_INFO. > So, because with LTO all you have in the IL is the m_vr in jump_tables, > pure/const analysis results, whatever else might be derived on the side > from the range info, you need to punt or union all that information anyway, > otherwise it will misbehave with LTO. > So punting on SSA_NAME_{RANGE,PTR}_INFO differences instead of throwing it > away means that non-LTO will get fewer ICF merges than LTO unnecessarily, > it doesn't improve anything for the code correctness at least for the LTO > case. We have wrong code with LTO, too. The problem is that IPA passes (and not only that, loop analysis too) does analysis at compile time (with value numbers in) and streams the info separately. Removal of value ranges (either by LTO or by your patch) happens between computing these summaries and using them, so this can be used to trigger wrong code, sadly. Honza > > Jakub >
On Wed, Mar 13, 2024 at 12:18:45PM +0100, Jan Hubicka wrote: > > On Wed, Mar 13, 2024 at 10:55:07AM +0100, Jan Hubicka wrote: > > > > > So the ipa_jump_func are I think the only thing that actually can differ > > > > > on the ICF merging candidates from value range POV. > > > > > > > > I agree. Btw, I would have approved the original patch in this > > > > thread that wipes SSA_NAME_INFO in merged bodies to mimic what LTO > > > > effectively does right now. That also looks most sensible to > > > > backport. > > > > > > > > But I'll defer to Honza in the end (but also want to point out we > > > > need something suitable for backporting). > > > > > > My main worry here is that I tried to relax matching of IL metadata in > > > the past and it triggers interesting problems. (I implemented TBAA > > > merging and one needs to match additional things in summaries and loop > > > structures) > > > > The point of the patch is that it emulates what happens with LTO (though, > > does that only for successful ICF merges), because LTO streaming ignores > > SSA_NAME_{RANGE,PTR}_INFO. > > So, because with LTO all you have in the IL is the m_vr in jump_tables, > > pure/const analysis results, whatever else might be derived on the side > > from the range info, you need to punt or union all that information anyway, > > otherwise it will misbehave with LTO. > > So punting on SSA_NAME_{RANGE,PTR}_INFO differences instead of throwing it > > away means that non-LTO will get fewer ICF merges than LTO unnecessarily, > > it doesn't improve anything for the code correctness at least for the LTO > > case. > > We have wrong code with LTO, too. I know. > The problem is that IPA passes (and > not only that, loop analysis too) does analysis at compile time (with > value numbers in) and streams the info separately. And that is desirable, because otherwise it simply couldn't derive any ranges. > Removal of value ranges > (either by LTO or by your patch) happens between computing these > summaries and using them, so this can be used to trigger wrong code, > sadly. Yes. But with LTO, I don't see how the IPA ICF comparison whether two functions are the same or not could be done with the SSA_NAME_{RANGE,PTR}_INFO in, otherwise it could only ICF merge functions from the same TUs. So the comparison IMHO (and the assert checks in my patch prove that) is done when the SSA_NAME_{RANGE,PTR}_INFO aren't in anymore. So, one just needs to compare and punt or union whatever is or could be influenced in the IPA streamed data from the ranges etc. And because one has to do it for LTO, doing it for non-LTO should be sufficient too. Jakub
> > int test (int a) > > { > > return a>0 ? CST1: CST2; > > } > > > > gets same hash value no matter what CST1/CST2 is. I added hasher and I > > am re-running stats. > > The hash should be commutative here at least. It needs to match what comparator is doing later, and sadly it does not try to find the right order of basic blocks. I added TODO and will fix it next stage1. With the attached patch we have no comparsion with mismatching VRP ranges with non-LTO bootstrap and there are 4 instances of mismatched jump function with LTO bootstrap. I checked them and these are functions that are still different and would not be unified anyway. In testsuite we have gcc.c-torture/execute/builtin-prefetch-4.c this matches [irange] long unsigned int [0, 2147483647][18446744071562067968, +INF] [irange] long unsigned int [0, 2147483646][18446744071562067968, +INF] in postdec_glob_idx/22new vs predec_glob_idx/20 The functions are different, but have precisely same statements, just differ by SSA names that we do not hash. c-c++-common/builtins.c [irange] long unsigned int [2, 2147483647] MASK 0x7ffffffe VALUE 0x0 [irange] long unsigned int [8, 2147483647] MASK 0x7ffffff8 VALUE 0x0 in bar.part.0/10new foo.part.0/9 This is a real mismatch 23_containers/vector/cons/2.cc [irange] long unsigned int [1, 9223372036854775807] MASK 0x7fffffffffffffff VALUE 0x0 [irange] long unsigned int [1, 9223372036854775807] MASK 0xfffffffffffffff8 VALUE 0x0 static std::vector<_Tp, _Alloc>::pointer std::vector<_Tp, _Alloc>::_S_do_relocate(pointer, pointer, pointer, _Tp_alloc_type&, std::true_type) [with _Tp = A<B>; _Alloc = std::allocator<A<B> >]/3482 static std::vector<_Tp, _Alloc>::pointer std::vector<_Tp, _Alloc>::_S_do_relocate(pointer, pointer, pointer, _Tp_alloc_type&, std::true_type) [with _Tp = double; _Alloc = std::allocator<double>]/3372 Here funtions are different initially but are optimized to same body while we keep info about ranges from optimized out code. I tried to make easy testcase, but int a; a = (char)a; is not optimized away, I wonder why, when for all valid values this is noop? 25_algorithms/lexicographical_compare/uchar.cc [irange] long unsigned int [8, +INF] MASK 0xfffffffffffffff8 VALUE 0x0 [irange] long unsigned int [4, +INF] MASK 0xfffffffffffffffc VALUE 0x0 static constexpr bool std::__equal<true>::equal(const _Tp*, const _Tp*, const _Tp*) [with _Tp = long int]/13669 static constexpr bool std::__equal<true>::equal(const _Tp*, const _Tp*, const _Tp*) [with _Tp = int]/13663 std/ranges/conv/1.cc [irange] long unsigned int [8, +INF] MASK 0xfffffffffffffff8 VALUE 0x0 [irange] long unsigned int [4, +INF] MASK 0xfffffffffffffffc VALUE 0x0 static constexpr bool std::__equal<true>::equal(const _Tp*, const _Tp*, const _Tp*) [with _Tp = long int]/13757 static constexpr bool std::__equal<true>::equal(const _Tp*, const _Tp*, const _Tp*) [with _Tp = int]/13751 Those two are also happening in llvm. We try to merge two functions which take pointer parameters of different type. boolD.2783 std::__equal<true>::equal<int>D.426577 (const intD.9 * __first1D.433380, const intD.9 * __last1D.433381, const intD.9 * __first2D.433382) { const intD.9 * __first1_4(D) = __first1D.433380; const intD.9 * __last1_3(D) = __last1D.433381; const intD.9 * __first2_6(D) = __first2D.433382; long intD.12 _1; boolD.2783 _2; long unsigned intD.16 _7; boolD.2783 _8; intD.9 _9; _1 = __last1_3(D) - __first1_4(D); if (_1 != 0) goto <bb 3>; [50.00%] else goto <bb 4>; [50.00%] # RANGE [irange] long unsigned int [4, +INF] MASK 0xfffffffffffffffc VALUE 0x0 _7 = (long unsigned intD.16) _1; Compared to boolD.2783 std::__equal<true>::equal<long int>D.426685 (const long intD.12 * __first1D.433472, const long intD.12 * __last1D.433473, const long intD.12 * __first2D.433474) { const long intD.12 * __first1_4(D) = __first1D.433472; const long intD.12 * __last1_3(D) = __last1D.433473; const long intD.12 * __first2_6(D) = __first2D.433474; long intD.12 _1; boolD.2783 _2; long unsigned intD.16 _7; boolD.2783 _8; intD.9 _9; _1 = __last1_3(D) - __first1_4(D); if (_1 != 0) goto <bb 3>; [50.00%] else goto <bb 4>; [50.00%] # RANGE [irange] long unsigned int [8, +INF] MASK 0xfffffffffffffff8 VALUE 0x0 _7 = (long unsigned intD.16) _1; This looks like potentially dangerous situation, since if we can derive range from pointed-to type, then ICF needs to match them too. However with #include <stdlib.h> size_t diff (long *a, long *b) { long int d = (b - a) * sizeof (long); if (!d) abort (); return d; } size_t diff2 (int *a, int *b) { long int d = (b - a) * sizeof (int); if (!d) abort (); return d; } I get range [1, +INF] What seems to be happening in the testcase is that we inline intD.9 std::__memcmp<long int, long int>D.433477 (const long intD.12 * __first1D.438292, const long intD.12 * __first2D.438293, size_tD.2817 __numD.438294) { # PT = nonlocal null const long intD.12 * __first1_1(D) = __first1D.438292; # PT = nonlocal null const long intD.12 * __first2_2(D) = __first2D.438293; size_tD.2817 __num_3(D) = __numD.438294; long unsigned intD.16 _5; intD.9 _6; ;; basic block 2, loop depth 0, count 1073741824 (estimated locally, freq 1.0000), maybe hot ;; prev block 0, next block 1, flags: (NEW, VISITED) ;; pred: ENTRY [always] count:1073741824 (estimated locally, freq 1.0000) (FALLTHRU,EXECUTABLE) # RANGE [irange] long unsigned int [0, 0][8, +INF] MASK 0xfffffffffffffff8 VALUE 0x0 _5 = __num_3(D) * 8; So the range here is determined from multiplication by 8 and while this multiplication is later optimized out after inlining range remains. So VRP should be unable to determine the range again after ICF. > > The obvious thing would be range info making it possible to prove > a stmt cannot trap, say for an array index or for arithmetic with > -ftrapv. But I'm not sure that makes a differnce for pure/const-ness. I am looking into this. > > Making something looping vs. non-looping should be easy though, > just have range info for a loop with an unsigned IV that evolves > like { 0, +, increment } and with a != exit condition where for some > 'increment' we know we eventually reach equality but with some > other we know we never do. Yep, but here we seem to be safe since finite_p flag of every loop is compared. > > But that just means pure/const state needs to be recomputed after > merging? Or compared. Recomputing summaries is also possible (by triggering node removal and node insertion pair), but I would rather stay with oriignal summaries if possible. This is the patch for hashing PHI nodes. It can be abused to trigger quadratic behaviour, so I plan to commit it. Bootstrapped/regtested x86_64-linux. gcc/ChangeLog: * ipa-icf.cc (sem_function::init): Hash PHI operands. (sem_function::compare_phi_node): Add comment. diff --git a/gcc/ipa-icf.cc b/gcc/ipa-icf.cc index 5d5a42f9c6c..a0266755992 100644 --- a/gcc/ipa-icf.cc +++ b/gcc/ipa-icf.cc @@ -1387,6 +1387,23 @@ sem_function::init (ipa_icf_gimple::func_checker *checker) cfg_checksum = iterative_hash_host_wide_int (e->flags, cfg_checksum); + /* TODO: We should be able to match PHIs with different order of + parameters. This needs to be also updated in + sem_function::compare_phi_node. */ + gphi_iterator si; + for (si = gsi_start_nonvirtual_phis (bb); !gsi_end_p (si); + gsi_next_nonvirtual_phi (&si)) + { + hstate.add_int (GIMPLE_PHI); + gphi *phi = si.phi (); + m_checker->hash_operand (gimple_phi_result (phi), hstate, 0, + func_checker::OP_NORMAL); + hstate.add_int (gimple_phi_num_args (phi)); + for (unsigned int i = 0; i < gimple_phi_num_args (phi); i++) + m_checker->hash_operand (gimple_phi_arg_def (phi, i), + hstate, 0, func_checker::OP_NORMAL); + } + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) { @@ -1579,6 +1596,8 @@ sem_function::compare_phi_node (basic_block bb1, basic_block bb2) if (size1 != size2) return return_false (); + /* TODO: We should be able to match PHIs with different order of + parameters. This needs to be also updated in sem_function::init. */ for (i = 0; i < size1; ++i) { t1 = gimple_phi_arg (phi1, i)->def;
> > We have wrong code with LTO, too. > > I know. > > > The problem is that IPA passes (and > > not only that, loop analysis too) does analysis at compile time (with > > value numbers in) and streams the info separately. > > And that is desirable, because otherwise it simply couldn't derive any > ranges. > > > Removal of value ranges > > (either by LTO or by your patch) happens between computing these > > summaries and using them, so this can be used to trigger wrong code, > > sadly. > > Yes. But with LTO, I don't see how the IPA ICF comparison whether > two functions are the same or not could be done with the > SSA_NAME_{RANGE,PTR}_INFO in, otherwise it could only ICF merge functions > from the same TUs. So the comparison IMHO (and the assert checks in my > patch prove that) is done when the SSA_NAME_{RANGE,PTR}_INFO aren't in > anymore. So, one just needs to compare and punt or union whatever > is or could be influenced in the IPA streamed data from the ranges etc. > And because one has to do it for LTO, doing it for non-LTO should be > sufficient too. Sorry, this was bit of a misunderstanding: I tought you still considered the original patch to be full fix, while I tought I should look into it more and dig out more issues. This is bit of can of worms. Overall I think the plan is: This stage4 1) fix VR divergences by either comparing or droping them 2) fix jump function differences by comparing them (I just constructed a testcase showing that jump functions can diverge for other reasons than just VR, so this is deeper problem, too) 3) try to construct aditional wrong code testcases (finite_p divergences, trapping) Next stage1 4) implement VR and PTR info streaming 5) make ICF to compare VRs and punt otherwise 6) implement optimize_size feature to ICF that will not punt on diverging TBAA or value ranges and do merging instead. We need some extra infrastructure for that, being able to keep the maps between basic blocks and SSA names from comparsion stage to merging stage 7) measure how effective ICF becomes in optimize_size mode and implement heuristics on how much metadata merging we want to do with -O2/-O3 as well. Based on older data Martin collected for his thesis, ICF used to be must more effective on libreoffice then it is today, so hopefully we can recover 10-15% binary size improvmeents here. 8) General ICF TLC. There are many half finished things for a while in this pass (such as merging with different BB or stmt orders). I am attaching the compare patch which also fixes the original wrong code. If you preffer your version, just go ahead to commit it. Otherwise I will add your testcase for this patch and commit this one. Statistically we almost never merge functions with different value ranges (three in testsuite, 0 during bootstrap, 1 during LTO bootstrap and probably few in LLVM build - there are 15 cases reported, but some are false positives caused by hash function conflicts). Honza gcc/ChangeLog: * ipa-icf-gimple.cc (func_checker::compare_ssa_name): Compare value ranges. diff --git a/gcc/ipa-icf-gimple.cc b/gcc/ipa-icf-gimple.cc index 8c2df7a354e..37c416d777d 100644 --- a/gcc/ipa-icf-gimple.cc +++ b/gcc/ipa-icf-gimple.cc @@ -39,9 +39,11 @@ along with GCC; see the file COPYING3. If not see #include "cfgloop.h" #include "attribs.h" #include "gimple-walk.h" +#include "value-query.h" +#include "value-range-storage.h" #include "tree-ssa-alias-compare.h" #include "ipa-icf-gimple.h" namespace ipa_icf_gimple { @@ -109,6 +116,35 @@ func_checker::compare_ssa_name (const_tree t1, const_tree t2) else if (m_target_ssa_names[i2] != (int) i1) return false; + if (POINTER_TYPE_P (TREE_TYPE (t1))) + { + if (SSA_NAME_PTR_INFO (t1)) + { + if (!SSA_NAME_PTR_INFO (t2) + || SSA_NAME_PTR_INFO (t1)->align != SSA_NAME_PTR_INFO (t2)->align + || SSA_NAME_PTR_INFO (t1)->misalign != SSA_NAME_PTR_INFO (t2)->misalign) + return false; + } + else if (SSA_NAME_PTR_INFO (t2)) + return false; + } + else + { + if (SSA_NAME_RANGE_INFO (t1)) + { + if (!SSA_NAME_RANGE_INFO (t2)) + return false; + Value_Range r1 (TREE_TYPE (t1)); + Value_Range r2 (TREE_TYPE (t2)); + SSA_NAME_RANGE_INFO (t1)->get_vrange (r1, TREE_TYPE (t1)); + SSA_NAME_RANGE_INFO (t2)->get_vrange (r2, TREE_TYPE (t2)); + if (r1 != r2) + return false; + } + else if (SSA_NAME_RANGE_INFO (t2)) + return false; + } + if (SSA_NAME_IS_DEFAULT_DEF (t1)) { tree b1 = SSA_NAME_VAR (t1);
On Thu, Mar 14, 2024 at 05:16:59PM +0100, Jan Hubicka wrote: > Sorry, this was bit of a misunderstanding: I tought you still considered > the original patch to be full fix, while I tought I should look into it > more and dig out more issues. This is bit of can of worms. Overall I > think the plan is: > > This stage4 > 1) fix VR divergences by either comparing or droping them > 2) fix jump function differences by comparing them > (I just constructed a testcase showing that jump functions can > diverge for other reasons than just VR, so this is deeper problem, > too) > 3) try to construct aditional wrong code testcases (finite_p > divergences, trapping) > Next stage1 > 4) implement VR and PTR info streaming > 5) make ICF to compare VRs and punt otherwise > 6) implement optimize_size feature to ICF that will not punt on > diverging TBAA or value ranges and do merging instead. > We need some extra infrastructure for that, being able to keep the > maps between basic blocks and SSA names from comparsion stage to > merging stage > 7) measure how effective ICF becomes in optimize_size mode and implement > heuristics on how much metadata merging we want to do with -O2/-O3 as > well. > Based on older data Martin collected for his thesis, ICF used to be > must more effective on libreoffice then it is today, so hopefully we > can recover 10-15% binary size improvmeents here. > 8) General ICF TLC. There are many half finished things for a while in > this pass (such as merging with different BB or stmt orders). Agreed. > I am attaching the compare patch which also fixes the original wrong > code. If you preffer your version, just go ahead to commit it. Seems both patches are the same size (at least when looking at number of added lines). I think I prefer my patch because it will make the LTO and non-LTO cases more similar which IMHO helps maintainance of the release branches. So, e.g. for all the other wrong-code issues like https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113907#c54 or https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113907#c58 one can actually trigger them with both non-LTO and LTO, rather than only with LTO and for non-LTO not trigger because there are SSA_NAME_RANGE_INFO differences that prevent ICF. If we start streaming SSA_NAME_RANGE_INFO in GCC 15, that changes things and then we can decide what to do for differences, 5) or 6) or combinations thereof (e.g. consider not just if the ranges are different, but how much too or if one is a subset or superset of the other to decide between punting and unioning for non-Os). > Otherwise > I will add your testcase for this patch and commit this one. > Statistically we almost never merge functions with different value > ranges (three in testsuite, 0 during bootstrap, 1 during LTO bootstrap > and probably few in LLVM build - there are 15 cases reported, but some > are false positives caused by hash function conflicts). It is mostly the fnsplit functions, sure, because we don't really drop the __builtin_unreachable checks before IPA and so the if (cond) __builtin_unreachable (); style assertions or [[assume (cond)]]; still result in ICF failures. Jakub
> > Otherwise > > I will add your testcase for this patch and commit this one. > > Statistically we almost never merge functions with different value > > ranges (three in testsuite, 0 during bootstrap, 1 during LTO bootstrap > > and probably few in LLVM build - there are 15 cases reported, but some > > are false positives caused by hash function conflicts). > > It is mostly the fnsplit functions, sure, because we don't really drop > the __builtin_unreachable checks before IPA and so the if (cond) > __builtin_unreachable (); style assertions or [[assume (cond)]]; still > result in ICF failures. Actually on llvm they are not split functions, but functions where value ranges were determined by early VRP based on code optimized out by time we reach ipa-icf (the equal thingy from libstdc++ I wrote about in previous email) Honza
On Thu, 14 Mar 2024, Jan Hubicka wrote: > > > We have wrong code with LTO, too. > > > > I know. > > > > > The problem is that IPA passes (and > > > not only that, loop analysis too) does analysis at compile time (with > > > value numbers in) and streams the info separately. > > > > And that is desirable, because otherwise it simply couldn't derive any > > ranges. > > > > > Removal of value ranges > > > (either by LTO or by your patch) happens between computing these > > > summaries and using them, so this can be used to trigger wrong code, > > > sadly. > > > > Yes. But with LTO, I don't see how the IPA ICF comparison whether > > two functions are the same or not could be done with the > > SSA_NAME_{RANGE,PTR}_INFO in, otherwise it could only ICF merge functions > > from the same TUs. So the comparison IMHO (and the assert checks in my > > patch prove that) is done when the SSA_NAME_{RANGE,PTR}_INFO aren't in > > anymore. So, one just needs to compare and punt or union whatever > > is or could be influenced in the IPA streamed data from the ranges etc. > > And because one has to do it for LTO, doing it for non-LTO should be > > sufficient too. > > Sorry, this was bit of a misunderstanding: I tought you still considered > the original patch to be full fix, while I tought I should look into it > more and dig out more issues. This is bit of can of worms. Overall I > think the plan is: > > This stage4 > 1) fix VR divergences by either comparing or droping them > 2) fix jump function differences by comparing them > (I just constructed a testcase showing that jump functions can > diverge for other reasons than just VR, so this is deeper problem, > too) > 3) try to construct aditional wrong code testcases (finite_p > divergences, trapping) > Next stage1 > 4) implement VR and PTR info streaming > 5) make ICF to compare VRs and punt otherwise > 6) implement optimize_size feature to ICF that will not punt on > diverging TBAA or value ranges and do merging instead. > We need some extra infrastructure for that, being able to keep the > maps between basic blocks and SSA names from comparsion stage to > merging stage > 7) measure how effective ICF becomes in optimize_size mode and implement > heuristics on how much metadata merging we want to do with -O2/-O3 as > well. > Based on older data Martin collected for his thesis, ICF used to be > must more effective on libreoffice then it is today, so hopefully we > can recover 10-15% binary size improvmeents here. > 8) General ICF TLC. There are many half finished things for a while in > this pass (such as merging with different BB or stmt orders). > > I am attaching the compare patch which also fixes the original wrong > code. If you preffer your version, just go ahead to commit it. Otherwise > I will add your testcase for this patch and commit this one. > Statistically we almost never merge functions with different value > ranges (three in testsuite, 0 during bootstrap, 1 during LTO bootstrap > and probably few in LLVM build - there are 15 cases reported, but some > are false positives caused by hash function conflicts). > > Honza > > gcc/ChangeLog: > > * ipa-icf-gimple.cc (func_checker::compare_ssa_name): Compare value ranges. > > diff --git a/gcc/ipa-icf-gimple.cc b/gcc/ipa-icf-gimple.cc > index 8c2df7a354e..37c416d777d 100644 > --- a/gcc/ipa-icf-gimple.cc > +++ b/gcc/ipa-icf-gimple.cc > @@ -39,9 +39,11 @@ along with GCC; see the file COPYING3. If not see > #include "cfgloop.h" > #include "attribs.h" > #include "gimple-walk.h" > +#include "value-query.h" > +#include "value-range-storage.h" > > #include "tree-ssa-alias-compare.h" > #include "ipa-icf-gimple.h" > > namespace ipa_icf_gimple { > > @@ -109,6 +116,35 @@ func_checker::compare_ssa_name (const_tree t1, const_tree t2) > else if (m_target_ssa_names[i2] != (int) i1) > return false; > > + if (POINTER_TYPE_P (TREE_TYPE (t1))) > + { > + if (SSA_NAME_PTR_INFO (t1)) > + { > + if (!SSA_NAME_PTR_INFO (t2) > + || SSA_NAME_PTR_INFO (t1)->align != SSA_NAME_PTR_INFO (t2)->align > + || SSA_NAME_PTR_INFO (t1)->misalign != SSA_NAME_PTR_INFO (t2)->misalign) You want to compare SSA_NAME_PTR_INFO ()->pt.zero as well I think since we store pointer non-null-ness from VRP there. You are not comparing the actual points-to info - but of course I'd expect that to differ as soon as local decls are involved. Still looks like a hole to me. > + return false; > + } > + else if (SSA_NAME_PTR_INFO (t2)) > + return false; > + } > + else > + { > + if (SSA_NAME_RANGE_INFO (t1)) > + { > + if (!SSA_NAME_RANGE_INFO (t2)) > + return false; > + Value_Range r1 (TREE_TYPE (t1)); > + Value_Range r2 (TREE_TYPE (t2)); > + SSA_NAME_RANGE_INFO (t1)->get_vrange (r1, TREE_TYPE (t1)); > + SSA_NAME_RANGE_INFO (t2)->get_vrange (r2, TREE_TYPE (t2)); > + if (r1 != r2) > + return false; > + } > + else if (SSA_NAME_RANGE_INFO (t2)) > + return false; > + } > + > if (SSA_NAME_IS_DEFAULT_DEF (t1)) > { > tree b1 = SSA_NAME_VAR (t1); >
> > + if (POINTER_TYPE_P (TREE_TYPE (t1))) > > + { > > + if (SSA_NAME_PTR_INFO (t1)) > > + { > > + if (!SSA_NAME_PTR_INFO (t2) > > + || SSA_NAME_PTR_INFO (t1)->align != SSA_NAME_PTR_INFO (t2)->align > > + || SSA_NAME_PTR_INFO (t1)->misalign != SSA_NAME_PTR_INFO (t2)->misalign) > > You want to compare SSA_NAME_PTR_INFO ()->pt.zero as well I think since > we store pointer non-null-ness from VRP there. > > You are not comparing the actual points-to info - but of course I'd > expect that to differ as soon as local decls are involved. Still looks > like a hole to me. I convinced myself that we don't need to do that since we recompute PTA after IPA stage: unlike value ranges it is thrown away and recomputed rather then just refined from prevoius solution. But indeed if parts are persistent, we need to compare it (and should stream it to LTO I guess). Honza
--- gcc/ipa-icf.cc.jj 2024-02-14 14:26:11.101933914 +0100 +++ gcc/ipa-icf.cc 2024-02-14 16:49:35.141518117 +0100 @@ -3396,6 +3397,7 @@ sem_item_optimizer::merge_classes (unsig continue; sem_item *source = c->members[0]; + bool this_merged_p = false; if (DECL_NAME (source->decl) && MAIN_NAME_P (DECL_NAME (source->decl))) @@ -3443,7 +3445,7 @@ sem_item_optimizer::merge_classes (unsig if (dbg_cnt (merged_ipa_icf)) { bool merged = source->merge (alias); - merged_p |= merged; + this_merged_p |= merged; if (merged && alias->type == VAR) { @@ -3452,6 +3454,35 @@ sem_item_optimizer::merge_classes (unsig } } } + + merged_p |= this_merged_p; + if (this_merged_p + && source->type == FUNC + && (!flag_wpa || flag_checking)) + { + unsigned i; + tree name; + FOR_EACH_SSA_NAME (i, name, DECL_STRUCT_FUNCTION (source->decl)) + { + /* We need to either merge or reset SSA_NAME_*_INFO. + For merging we don't preserve the mapping between + original and alias SSA_NAMEs from successful equals + calls. */ + if (POINTER_TYPE_P (TREE_TYPE (name))) + { + if (SSA_NAME_PTR_INFO (name)) + { + gcc_checking_assert (!flag_wpa); + SSA_NAME_PTR_INFO (name) = NULL; + } + } + else if (SSA_NAME_RANGE_INFO (name)) + { + gcc_checking_assert (!flag_wpa); + SSA_NAME_RANGE_INFO (name) = NULL; + } + } + } } if (!m_merged_variables.is_empty ()) --- gcc/testsuite/gcc.dg/pr113907.c.jj 2024-02-14 16:13:48.486555159 +0100 +++ gcc/testsuite/gcc.dg/pr113907.c 2024-02-14 16:13:29.198825045 +0100 @@ -0,0 +1,49 @@ +/* PR middle-end/113907 */ +/* { dg-do run } */ +/* { dg-options "-O2" } */ +/* { dg-additional-options "-minline-all-stringops" { target i?86-*-* x86_64-*-* } } */ + +static inline int +foo (int len, void *indata, void *outdata) +{ + if (len < 0 || (len & 7) != 0) + return 0; + if (len != 0 && indata != outdata) + __builtin_memcpy (outdata, indata, len); + return len; +} + +static inline int +bar (int len, void *indata, void *outdata) +{ + if (len < 0 || (len & 1) != 0) + return 0; + if (len != 0 && indata != outdata) + __builtin_memcpy (outdata, indata, len); + return len; +} + +int (*volatile p1) (int, void *, void *) = foo; +int (*volatile p2) (int, void *, void *) = bar; + +__attribute__((noipa)) int +baz (int len, void *indata, void *outdata) +{ + if ((len & 6) != 0) + bar (len, indata, outdata); + else + foo (len, indata, outdata); +} + +struct S { char buf[64]; } s __attribute__((aligned (8))); + +int +main () +{ + for (int i = 0; i < 64; ++i) + s.buf[i] = ' ' + i; + p2 (2, s.buf, s.buf + 33); + for (int i = 0; i < 64; ++i) + if (s.buf[i] != ' ' + ((i >= 33 && i < 35) ? i - 33 : i)) + __builtin_abort (); +}