diff mbox series

AArch64 fix regexp for live_1.c sve test

Message ID patch-17577-tamar@arm.com
State New
Headers show
Series AArch64 fix regexp for live_1.c sve test | expand

Commit Message

Tamar Christina July 18, 2023, 2:43 p.m. UTC
Hi All,

The resulting predicate register of a whilelo is not
restricted to the lower half of the predicate register file.

As such these tests started failing after recent changes
because the whilelo outside the loop is getting assigned p15.

This widens the regexp.

Tested on aarch64-none-linux-gnu and passes again.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/live_1.c: Update assembly.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
index 80ee176d1807bf628ad47551d69ff5d84deda79e..2db6c3c209a9514646e92628f3d2dd58d466539c 100644




--
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
index 80ee176d1807bf628ad47551d69ff5d84deda79e..2db6c3c209a9514646e92628f3d2dd58d466539c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
@@ -27,10 +27,10 @@
 
 TEST_ALL (EXTRACT_LAST)
 
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].b, } 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].h, } 4 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].s, } 4 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].d, } 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.b, } 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.h, } 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.s, } 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.d, } 4 } } */
 
 /* { dg-final { scan-assembler-times {\tlastb\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tlastb\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */

Comments

Richard Sandiford July 20, 2023, 5:44 a.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> The resulting predicate register of a whilelo is not
> restricted to the lower half of the predicate register file.
>
> As such these tests started failing after recent changes
> because the whilelo outside the loop is getting assigned p15.

It's the whilelo in the loop for me.  We go from:

.L3:
        ld1b    z31.b, p7/z, [x4, x3]
        movprfx z30, z31
        mul     z30.b, p5/m, z30.b, z29.b
        st1b    z30.b, p7, [x4, x3]
        mov     p6.b, p7.b
        add     x3, x3, x0
        whilelo p7.b, w3, w1
        b.any   .L3

to:

.L3:
        ld1b    z31.b, p7/z, [x3, x2]
        movprfx z29, z31
        mul     z29.b, p6/m, z29.b, z30.b
        st1b    z29.b, p7, [x3, x2]
        add     x2, x2, x0
        whilelo p15.b, w2, w1
        b.any   .L4
        [...]
        .p2align 2,,3
.L4:
        mov     p7.b, p15.b
        b       .L3

This adds an extra (admittedly unconditional) branch to every non-final
vector iteration, which seems unfortunate.  I don't think we'd see
p8-p15 otherwise, since the result of the whilelo is used as a
governing predicate by the next iteration of the loop.

This happens because the scalar loop is given an 89% chance of iterating.
Previously we gave the vector loop an 83.33% chance of iterating, whereas
after 061f74c06735e1fa35b910ae we give it a 12% chance.  0.89^16 == 15.50%,
so the new probabilities definitely preserve the original probabilities
more closely.  But for purely heuristic probabilities like these, I'm
not sure we should lean so heavily into the idea that the vector
latch is unlikely.

Honza, Richi, any thoughts?  Just wanted to double-check that this
was operating as expected before making the tests accept the (arguably)
less efficient code.  It looks like the commit was more aimed at fixing
the profile counts for the epilogues, rather than the main loop.

Thanks,
Richard

> This widens the regexp.
>
> Tested on aarch64-none-linux-gnu and passes again.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/live_1.c: Update assembly.
>
> --- inline copy of patch -- 
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> index 80ee176d1807bf628ad47551d69ff5d84deda79e..2db6c3c209a9514646e92628f3d2dd58d466539c 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> @@ -27,10 +27,10 @@
>  
>  TEST_ALL (EXTRACT_LAST)
>  
> -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].b, } 2 } } */
> -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].h, } 4 } } */
> -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].s, } 4 } } */
> -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].d, } 4 } } */
> +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.b, } 2 } } */
> +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.h, } 4 } } */
> +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.s, } 4 } } */
> +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.d, } 4 } } */
>  
>  /* { dg-final { scan-assembler-times {\tlastb\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
>  /* { dg-final { scan-assembler-times {\tlastb\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
Richard Biener July 20, 2023, 7:20 a.m. UTC | #2
On Thu, 20 Jul 2023, Richard Sandiford wrote:

> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > The resulting predicate register of a whilelo is not
> > restricted to the lower half of the predicate register file.
> >
> > As such these tests started failing after recent changes
> > because the whilelo outside the loop is getting assigned p15.
> 
> It's the whilelo in the loop for me.  We go from:
> 
> .L3:
>         ld1b    z31.b, p7/z, [x4, x3]
>         movprfx z30, z31
>         mul     z30.b, p5/m, z30.b, z29.b
>         st1b    z30.b, p7, [x4, x3]
>         mov     p6.b, p7.b
>         add     x3, x3, x0
>         whilelo p7.b, w3, w1
>         b.any   .L3
> 
> to:
> 
> .L3:
>         ld1b    z31.b, p7/z, [x3, x2]
>         movprfx z29, z31
>         mul     z29.b, p6/m, z29.b, z30.b
>         st1b    z29.b, p7, [x3, x2]
>         add     x2, x2, x0
>         whilelo p15.b, w2, w1
>         b.any   .L4
>         [...]
>         .p2align 2,,3
> .L4:
>         mov     p7.b, p15.b
>         b       .L3
> 
> This adds an extra (admittedly unconditional) branch to every non-final
> vector iteration, which seems unfortunate.  I don't think we'd see
> p8-p15 otherwise, since the result of the whilelo is used as a
> governing predicate by the next iteration of the loop.
> 
> This happens because the scalar loop is given an 89% chance of iterating.
> Previously we gave the vector loop an 83.33% chance of iterating, whereas
> after 061f74c06735e1fa35b910ae we give it a 12% chance.  0.89^16 == 15.50%,
> so the new probabilities definitely preserve the original probabilities
> more closely.  But for purely heuristic probabilities like these, I'm
> not sure we should lean so heavily into the idea that the vector
> latch is unlikely.
> 
> Honza, Richi, any thoughts?  Just wanted to double-check that this
> was operating as expected before making the tests accept the (arguably)
> less efficient code.  It looks like the commit was more aimed at fixing
> the profile counts for the epilogues, rather than the main loop.

The above looks like a failed coalescing, can you track down where
that happens and why?

And yes, the profile counts were supposed to be fixed, but not only
for the epilog but for header copying also for the main loop.  Not
sure if anything goes wrong here though - for estimates of course
it's only estimates and IIRC we estimate a loop to iterate 4 times
when we don't know better.

Richard.

> Thanks,
> Richard
> 
> > This widens the regexp.
> >
> > Tested on aarch64-none-linux-gnu and passes again.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/sve/live_1.c: Update assembly.
> >
> > --- inline copy of patch -- 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> > index 80ee176d1807bf628ad47551d69ff5d84deda79e..2db6c3c209a9514646e92628f3d2dd58d466539c 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> > @@ -27,10 +27,10 @@
> >  
> >  TEST_ALL (EXTRACT_LAST)
> >  
> > -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].b, } 2 } } */
> > -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].h, } 4 } } */
> > -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].s, } 4 } } */
> > -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].d, } 4 } } */
> > +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.b, } 2 } } */
> > +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.h, } 4 } } */
> > +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.s, } 4 } } */
> > +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.d, } 4 } } */
> >  
> >  /* { dg-final { scan-assembler-times {\tlastb\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
> >  /* { dg-final { scan-assembler-times {\tlastb\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
>
Jan Hubicka July 20, 2023, 8:26 a.m. UTC | #3
> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > The resulting predicate register of a whilelo is not
> > restricted to the lower half of the predicate register file.
> >
> > As such these tests started failing after recent changes
> > because the whilelo outside the loop is getting assigned p15.
> 
> It's the whilelo in the loop for me.  We go from:
> 
> .L3:
>         ld1b    z31.b, p7/z, [x4, x3]
>         movprfx z30, z31
>         mul     z30.b, p5/m, z30.b, z29.b
>         st1b    z30.b, p7, [x4, x3]
>         mov     p6.b, p7.b
>         add     x3, x3, x0
>         whilelo p7.b, w3, w1
>         b.any   .L3
> 
> to:
> 
> .L3:
>         ld1b    z31.b, p7/z, [x3, x2]
>         movprfx z29, z31
>         mul     z29.b, p6/m, z29.b, z30.b
>         st1b    z29.b, p7, [x3, x2]
>         add     x2, x2, x0
>         whilelo p15.b, w2, w1
>         b.any   .L4
>         [...]
>         .p2align 2,,3
> .L4:
>         mov     p7.b, p15.b
>         b       .L3
> 
> This adds an extra (admittedly unconditional) branch to every non-final
> vector iteration, which seems unfortunate.  I don't think we'd see
> p8-p15 otherwise, since the result of the whilelo is used as a
> governing predicate by the next iteration of the loop.
> 
> This happens because the scalar loop is given an 89% chance of iterating.
> Previously we gave the vector loop an 83.33% chance of iterating, whereas
> after 061f74c06735e1fa35b910ae we give it a 12% chance.  0.89^16 == 15.50%,
> so the new probabilities definitely preserve the original probabilities
> more closely.  But for purely heuristic probabilities like these, I'm
> not sure we should lean so heavily into the idea that the vector
> latch is unlikely.
> 
> Honza, Richi, any thoughts?  Just wanted to double-check that this
> was operating as expected before making the tests accept the (arguably)
> less efficient code.  It looks like the commit was more aimed at fixing
> the profile counts for the epilogues, rather than the main loop.

You are right that we should not scale down static profiles in case they
are artificially flat. It is nice to have an actual testcase.
Old code used to test:

  /* Without profile feedback, loops for which we do not know a better estimate
     are assumed to roll 10 times.  When we unroll such loop, it appears to
     roll too little, and it may even seem to be cold.  To avoid this, we
     ensure that the created loop appears to roll at least 5 times (but at
     most as many times as before unrolling).  Don't do adjustment if profile
     feedback is present.  */
  if (new_est_niter < 5 && !profile_p)
    {
      if (est_niter < 5)
        new_est_niter = est_niter;
      else 
        new_est_niter = 5;
    } 

This is not right when profile feedback is around, and also when we
managed to determine the precise number of iterations at branch prediction
time and did not cap.

So I replaced it with the test that the adjusted header count is not smaller
than the preheader edge count.  However, this will happily get the loop
iteration count close to 0.

It is a bit hard to figure out if the profile is realistic:

Sometimes we do
   profile_status_for_fn (cfun) != PROFILE_READ
I am trying to get rid of this test.  With LTO or when comdat profile is
lost we inline together functions with and without profile.

We can test whether the quality of the loop header count is precise or adjusted.
However, at the time the vectorizer is modifying the loop profile we have already
adjusted it for the initial conditional for the profitability threshold and
dropped it to GUESSED.  Even with profile feedback we do not know the outcome
probability of that one (Ondrej Kubanek's histograms will help here).

So I think we want to check if we have a loop iteration estimate recorded
(that should be true for both profile feedback and loops with known trip
count) and, if so, compare it with what the profile says; if they more or
less match, consider the profile realistic.  This needs to be done before
the vectorizer starts tampering with the loop.

I will try to make a patch for that.
Honza
> 
> Thanks,
> Richard
> 
> > This widens the regexp.
> >
> > Tested on aarch64-none-linux-gnu and passes again.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/sve/live_1.c: Update assembly.
> >
> > --- inline copy of patch -- 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> > index 80ee176d1807bf628ad47551d69ff5d84deda79e..2db6c3c209a9514646e92628f3d2dd58d466539c 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
> > @@ -27,10 +27,10 @@
> >  
> >  TEST_ALL (EXTRACT_LAST)
> >  
> > -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].b, } 2 } } */
> > -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].h, } 4 } } */
> > -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].s, } 4 } } */
> > -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].d, } 4 } } */
> > +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.b, } 2 } } */
> > +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.h, } 4 } } */
> > +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.s, } 4 } } */
> > +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.d, } 4 } } */
> >  
> >  /* { dg-final { scan-assembler-times {\tlastb\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
> >  /* { dg-final { scan-assembler-times {\tlastb\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
Richard Sandiford July 20, 2023, 9:14 a.m. UTC | #4
Richard Biener <rguenther@suse.de> writes:
> On Thu, 20 Jul 2023, Richard Sandiford wrote:
>
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > Hi All,
>> >
>> > The resulting predicate register of a whilelo is not
>> > restricted to the lower half of the predicate register file.
>> >
>> > As such these tests started failing after recent changes
>> > because the whilelo outside the loop is getting assigned p15.
>> 
>> It's the whilelo in the loop for me.  We go from:
>> 
>> .L3:
>>         ld1b    z31.b, p7/z, [x4, x3]
>>         movprfx z30, z31
>>         mul     z30.b, p5/m, z30.b, z29.b
>>         st1b    z30.b, p7, [x4, x3]
>>         mov     p6.b, p7.b
>>         add     x3, x3, x0
>>         whilelo p7.b, w3, w1
>>         b.any   .L3
>> 
>> to:
>> 
>> .L3:
>>         ld1b    z31.b, p7/z, [x3, x2]
>>         movprfx z29, z31
>>         mul     z29.b, p6/m, z29.b, z30.b
>>         st1b    z29.b, p7, [x3, x2]
>>         add     x2, x2, x0
>>         whilelo p15.b, w2, w1
>>         b.any   .L4
>>         [...]
>>         .p2align 2,,3
>> .L4:
>>         mov     p7.b, p15.b
>>         b       .L3
>> 
>> This adds an extra (admittedly unconditional) branch to every non-final
>> vector iteration, which seems unfortunate.  I don't think we'd see
>> p8-p15 otherwise, since the result of the whilelo is used as a
>> governing predicate by the next iteration of the loop.
>> 
>> This happens because the scalar loop is given an 89% chance of iterating.
>> Previously we gave the vector loop an 83.33% chance of iterating, whereas
>> after 061f74c06735e1fa35b910ae we give it a 12% chance.  0.89^16 == 15.50%,
>> so the new probabilities definitely preserve the original probabilities
>> more closely.  But for purely heuristic probabilities like these, I'm
>> not sure we should lean so heavily into the idea that the vector
>> latch is unlikely.
>> 
>> Honza, Richi, any thoughts?  Just wanted to double-check that this
>> was operating as expected before making the tests accept the (arguably)
>> less efficient code.  It looks like the commit was more aimed at fixing
>> the profile counts for the epilogues, rather than the main loop.
>
> The above looks like a failed coalescing, can you track down where
> that happens and why?

Ah, sorry, I shouldn't have trimmed the context.  The previous predicate
(p6 in the original code) is live on exit from the loop, while the
whilelo result is live on the latch edge.  So I think a move is needed
somewhere.

Thanks,
Richard
Richard Sandiford July 20, 2023, 9:16 a.m. UTC | #5
Jan Hubicka <hubicka@ucw.cz> writes:
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > Hi All,
>> >
>> > The resulting predicate register of a whilelo is not
>> > restricted to the lower half of the predicate register file.
>> >
>> > As such these tests started failing after recent changes
>> > because the whilelo outside the loop is getting assigned p15.
>> 
>> It's the whilelo in the loop for me.  We go from:
>> 
>> .L3:
>>         ld1b    z31.b, p7/z, [x4, x3]
>>         movprfx z30, z31
>>         mul     z30.b, p5/m, z30.b, z29.b
>>         st1b    z30.b, p7, [x4, x3]
>>         mov     p6.b, p7.b
>>         add     x3, x3, x0
>>         whilelo p7.b, w3, w1
>>         b.any   .L3
>> 
>> to:
>> 
>> .L3:
>>         ld1b    z31.b, p7/z, [x3, x2]
>>         movprfx z29, z31
>>         mul     z29.b, p6/m, z29.b, z30.b
>>         st1b    z29.b, p7, [x3, x2]
>>         add     x2, x2, x0
>>         whilelo p15.b, w2, w1
>>         b.any   .L4
>>         [...]
>>         .p2align 2,,3
>> .L4:
>>         mov     p7.b, p15.b
>>         b       .L3
>> 
>> This adds an extra (admittedly unconditional) branch to every non-final
>> vector iteration, which seems unfortunate.  I don't think we'd see
>> p8-p15 otherwise, since the result of the whilelo is used as a
>> governing predicate by the next iteration of the loop.
>> 
>> This happens because the scalar loop is given an 89% chance of iterating.
>> Previously we gave the vector loop an 83.33% chance of iterating, whereas
>> after 061f74c06735e1fa35b910ae we give it a 12% chance.  0.89^16 == 15.50%,
>> so the new probabilities definitely preserve the original probabilities
>> more closely.  But for purely heuristic probabilities like these, I'm
>> not sure we should lean so heavily into the idea that the vector
>> latch is unlikely.
>> 
>> Honza, Richi, any thoughts?  Just wanted to double-check that this
>> was operating as expected before making the tests accept the (arguably)
>> less efficient code.  It looks like the commit was more aimed at fixing
>> the profile counts for the epilogues, rather than the main loop.
>
> You are right that we shold not scale down static profiles in case they
> are artifically flat. It is nice to have actual testcase.
> Old code used to test:
>
>   /* Without profile feedback, loops for which we do not know a better estimate
>      are assumed to roll 10 times.  When we unroll such loop, it appears to
>      roll too little, and it may even seem to be cold.  To avoid this, we
>      ensure that the created loop appears to roll at least 5 times (but at
>      most as many times as before unrolling).  Don't do adjustment if profile
>      feedback is present.  */
>   if (new_est_niter < 5 && !profile_p)
>     {
>       if (est_niter < 5)
>         new_est_niter = est_niter;
>       else 
>         new_est_niter = 5;
>     } 
>
> This is not right when profile feedback is around and also when we
> managed to determine precise #of itrations at branch prediction time and
> did not cap.
>
> So I replaced it iwht the test that adjusted header count is not smaller
> than the preheader edge count.  However this will happily get loop
> iteration count close to 0.
>
> It is bit hard to figure out if profile is realistic:
>
> Sometimes we do
>    profile_status_for_fn (cfun) != PROFILE_READ
> I am trying to get rid of this test.  With LTO or when comdat profile is
> lost we inline together functions with and without profile.
>
> We can test for quality of loop header count to be precise or adjusted.
> However at the time vectorizer is modifying loop profile we already
> adjusted it for the initial conditional for profitability threshold and
> drop it to GUESSED.Even with profile feedback we do not know outcome
> probability of that one (Ondrej Kubanek's histograms will help here).

Ah, yeah, hadn't thought about that.

> So I think we want to check if we have loop iteration estimate recorded
> (that should be true for both profile feedback and loops with known trip
> count) and if so compare it what profile says and it is more or less in
> match consider profile realistic.  This needs to be done before
> vectorizer starts tampering with the loop.
>
> I will try to make patch for that.

Thanks!

Richard
Jan Hubicka July 21, 2023, 5:10 p.m. UTC | #6
Avoid scaling flat loop profiles of vectorized loops

As discussed, when vectorizing a loop with a static profile, it is not always a good idea
to divide the header frequency by the vectorization factor because the profile may
not realistically represent the expected number of iterations.  Since in such cases
we default to relatively low iteration counts (based on the average for spec2k17), this
will make the vectorized loop body look cold.

This patch makes the vectorizer look for flat profiles and, for those, at most cap the
profile by the known upper bound on the iteration count instead of scaling it down.

Bootstrap/regtest on x86_64-linux is in progress. I intend to commit this after
testers pick up the other profile-related changes from today.
Tamar, Richard, it would be nice to know if it fixes the testcase you were looking at,
and could you possibly turn it into a testcase?

gcc/ChangeLog:

	* tree-vect-loop.cc (scale_profile_for_vect_loop): Avoid scaling flat
	profiles by vectorization factor.
	(vect_transform_loop): Check for flat profiles.

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b44fb9c7712..d036a7d4480 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10837,11 +10837,25 @@ vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
 }
 
 /* Scale profiling counters by estimation for LOOP which is vectorized
-   by factor VF.  */
+   by factor VF.
+   If FLAT is true, the loop we started with had unrealistically flat
+   profile.  */
 
 static void
-scale_profile_for_vect_loop (class loop *loop, unsigned vf)
+scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
 {
+  /* For flat profiles do not scale down proportionally by VF and only
+     cap by known iteration count bounds.  */
+  if (flat)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file,
+		 "Vectorized loop profile seems flat; not scaling iteration "
+		 "count down by the vectorization factor %i\n", vf);
+      scale_loop_profile (loop, profile_probability::always (),
+			  get_likely_max_loop_iterations_int (loop));
+      return;
+    }
   /* Loop body executes VF fewer times and exit increases VF times.  */
   edge exit_e = single_exit (loop);
   profile_count entry_count = loop_preheader_edge (loop)->count ();
@@ -10852,7 +10866,13 @@ scale_profile_for_vect_loop (class loop *loop, unsigned vf)
   while (vf > 1
 	 && loop->header->count > entry_count
 	 && loop->header->count < entry_count * vf)
-    vf /= 2;
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file,
+		 "Vectorization factor %i seems too large for profile "
+		 "prevoiusly believed to be consistent; reducing.\n", vf);
+      vf /= 2;
+    }
 
   if (entry_count.nonzero_p ())
     set_edge_probability_and_rescale_others
@@ -11184,6 +11204,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   gimple *stmt;
   bool check_profitability = false;
   unsigned int th;
+  bool flat = maybe_flat_loop_profile (loop);
 
   DUMP_VECT_SCOPE ("vec_transform_loop");
 
@@ -11252,7 +11273,6 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 			      &step_vector, &niters_vector_mult_vf, th,
 			      check_profitability, niters_no_overflow,
 			      &advance);
-
   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
@@ -11545,7 +11565,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 			  assumed_vf) - 1
 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
 			   assumed_vf) - 1);
-  scale_profile_for_vect_loop (loop, assumed_vf);
+  scale_profile_for_vect_loop (loop, assumed_vf, flat);
 
   if (dump_enabled_p ())
     {
Richard Sandiford July 21, 2023, 5:19 p.m. UTC | #7
Jan Hubicka <hubicka@ucw.cz> writes:
> Avoid scaling flat loop profiles of vectorized loops
>
> As discussed, when vectorizing loop with static profile, it is not always good idea
> to divide the header frequency by vectorization factor because the profile may
> not realistically represent the expected number of iterations.  Since in such cases
> we default to relatively low iteration counts (based on average for spec2k17), this
> will make vectorized loop body look cold.
>
> This patch makes vectorizer to look for flat profiles and only possibly reduce the
> profile by known upper bound on iteration counts.
>
> Bootstrapp/regtested of x86_64-linux in progress. I intend to commit this after
> testers pick other profile related changes from today.
> Tamar, Richard, it would be nice to know if it fixes the testcase you was looking at
> and possibly turn it into a testcase?

Yeah, it does!  Thanks for the quick fix.

The test was gcc.target/aarch64/sve/live_1.c.  Although it wasn't
originally a profile test, I think it should still be a relatively good
way of testing that the latch is treated as more likely than the exit,
without needing to check for that explicitly.

Richard

>
> gcc/ChangeLog:
>
> 	* tree-vect-loop.cc (scale_profile_for_vect_loop): Avoid scaling flat
> 	profiles by vectorization factor.
> 	(vect_transform_loop): Check for flat profiles.
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index b44fb9c7712..d036a7d4480 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10837,11 +10837,25 @@ vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
>  }
>  
>  /* Scale profiling counters by estimation for LOOP which is vectorized
> -   by factor VF.  */
> +   by factor VF.
> +   If FLAT is true, the loop we started with had unrealistically flat
> +   profile.  */
>  
>  static void
> -scale_profile_for_vect_loop (class loop *loop, unsigned vf)
> +scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
>  {
> +  /* For flat profiles do not scale down proportionally by VF and only
> +     cap by known iteration count bounds.  */
> +  if (flat)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file,
> +		 "Vectorized loop profile seems flat; not scaling iteration "
> +		 "count down by the vectorization factor %i\n", vf);
> +      scale_loop_profile (loop, profile_probability::always (),
> +			  get_likely_max_loop_iterations_int (loop));
> +      return;
> +    }
>    /* Loop body executes VF fewer times and exit increases VF times.  */
>    edge exit_e = single_exit (loop);
>    profile_count entry_count = loop_preheader_edge (loop)->count ();
> @@ -10852,7 +10866,13 @@ scale_profile_for_vect_loop (class loop *loop, unsigned vf)
>    while (vf > 1
>  	 && loop->header->count > entry_count
>  	 && loop->header->count < entry_count * vf)
> -    vf /= 2;
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file,
> +		 "Vectorization factor %i seems too large for profile "
> +		 "prevoiusly believed to be consistent; reducing.\n", vf);
> +      vf /= 2;
> +    }
>  
>    if (entry_count.nonzero_p ())
>      set_edge_probability_and_rescale_others
> @@ -11184,6 +11204,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
>    gimple *stmt;
>    bool check_profitability = false;
>    unsigned int th;
> +  bool flat = maybe_flat_loop_profile (loop);
>  
>    DUMP_VECT_SCOPE ("vec_transform_loop");
>  
> @@ -11252,7 +11273,6 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
>  			      &step_vector, &niters_vector_mult_vf, th,
>  			      check_profitability, niters_no_overflow,
>  			      &advance);
> -
>    if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
>        && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
>      scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
> @@ -11545,7 +11565,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
>  			  assumed_vf) - 1
>  	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
>  			   assumed_vf) - 1);
> -  scale_profile_for_vect_loop (loop, assumed_vf);
> +  scale_profile_for_vect_loop (loop, assumed_vf, flat);
>  
>    if (dump_enabled_p ())
>      {
diff mbox series

Patch

--- a/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/live_1.c
@@ -27,10 +27,10 @@ 
 
 TEST_ALL (EXTRACT_LAST)
 
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].b, } 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].h, } 4 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].s, } 4 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].d, } 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.b, } 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.h, } 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.s, } 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.d, } 4 } } */
 
 /* { dg-final { scan-assembler-times {\tlastb\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tlastb\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */