diff mbox series

zlib: Fix big performance regression

Message ID 9756b956da953de2c32278a81f711cf3ac281bad.1719476684.git.christophe.leroy@csgroup.eu
State Superseded
Delegated to: Tom Rini
Headers show
Series zlib: Fix big performance regression | expand

Commit Message

Christophe Leroy June 27, 2024, 8:25 a.m. UTC
Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
brings a big performance regression in inflate_fast(), which leads
to watchdog timer reset on powerpc 8xx.

It looks like that commit does more than what it describe, it
especially removed an important optimisation that was doing copies
using halfwords instead of bytes. That unexpected change multiplied
by almost 4 the time spent in inflate_fast() and increased by 40%
the overall time needed to uncompress linux kernel image.

So partially revert that commit but keep post incrementation as it
is the initial purpose of said commit.

Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 lib/zlib/inffast.c | 51 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 11 deletions(-)

Comments

Tom Rini June 27, 2024, 1:57 p.m. UTC | #1
On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:

> Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> brings a big performance regression in inflate_fast(), which leads
> to watchdog timer reset on powerpc 8xx.
> 
> It looks like that commit does more than what it describe, it
> especially removed an important optimisation that was doing copies
> using halfwords instead of bytes. That unexpected change multiplied
> by almost 4 the time spent in inflate_fast() and increased by 40%
> the overall time needed to uncompress linux kernel image.
> 
> So partially revert that commit but keep post incrementation as it
> is the initial purpose of said commit.
> 
> Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>

Good work.

Reviewed-by: Tom Rini <trini@konsulko.com>

And can you please head over to https://github.com/madler/zlib and file
an issue, or pull request with your changes explaining why? I would hope
they're interested in performance regressions on slower parts still.
Thanks!
Joakim Tjernlund June 27, 2024, 2:41 p.m. UTC | #2
On Thu, 2024-06-27 at 10:25 +0200, Christophe Leroy wrote:
> Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> brings a big performance regression in inflate_fast(), which leads
> to watchdog timer reset on powerpc 8xx.
> 
> It looks like that commit does more than what it describe, it
> especially removed an important optimisation that was doing copies
> using halfwords instead of bytes. That unexpected change multiplied
> by almost 4 the time spent in inflate_fast() and increased by 40%
> the overall time needed to uncompress linux kernel image.
> 
> So partially revert that commit but keep post incrementation as it
> is the initial purpose of said commit.
> 
> Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---

Thanks for looking after my old optimization :)

 Jocke
Tom Rini June 27, 2024, 7:34 p.m. UTC | #3
On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:

> Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> brings a big performance regression in inflate_fast(), which leads
> to watchdog timer reset on powerpc 8xx.
> 
> It looks like that commit does more than what it describe, it
> especially removed an important optimisation that was doing copies
> using halfwords instead of bytes. That unexpected change multiplied
> by almost 4 the time spent in inflate_fast() and increased by 40%
> the overall time needed to uncompress linux kernel image.
> 
> So partially revert that commit but keep post incrementation as it
> is the initial purpose of said commit.
> 
> Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  lib/zlib/inffast.c | 51 ++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 40 insertions(+), 11 deletions(-)

Both this, and my mostly revert lead to CI failures around compression
tests:
https://source.denx.de/u-boot/u-boot/-/jobs/859329

> 
> diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
> index 5e2a65ad4d..c4b1d12258 100644
> --- a/lib/zlib/inffast.c
> +++ b/lib/zlib/inffast.c
> @@ -236,18 +236,47 @@ unsigned start;         /* inflate()'s starting value for strm->avail_out */
>                      }
>                  }
>                  else {
> +		    unsigned short *sout;
> +		    unsigned long loops;
> +
>                      from = out - dist;          /* copy direct from output */
> -                    do {                        /* minimum length is three */
> -                        *out++ = *from++;
> -                        *out++ = *from++;
> -                        *out++ = *from++;
> -                        len -= 3;
> -                    } while (len > 2);
> -                    if (len) {
> -                        *out++ = *from++;
> -                        if (len > 1)
> -                            *out++ = *from++;
> -                    }
> +                    /* minimum length is three */
> +		    /* Align out addr */
> +		    if (!((long)(out - 1) & 1)) {
> +			*out++ = *from++;
> +			len--;
> +		    }
> +		    sout = (unsigned short *)out;
> +		    if (dist > 2 ) {
> +			unsigned short *sfrom;
> +
> +			sfrom = (unsigned short *)from;
> +			loops = len >> 1;
> +			do
> +			    *sout++ = get_unaligned(++sfrom);
> +			while (--loops);
> +			out = (unsigned char *)sout;
> +			from = (unsigned char *)sfrom;
> +		    } else { /* dist == 1 or dist == 2 */
> +			unsigned short pat16;
> +
> +			pat16 = *(sout-2);
> +			if (dist == 1)
> +#if defined(__BIG_ENDIAN)
> +			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
> +#elif defined(__LITTLE_ENDIAN)
> +			    pat16 = (pat16 & 0xff00) | ((pat16 & 0xff00 ) >> 8);
> +#else
> +#error __BIG_ENDIAN nor __LITTLE_ENDIAN is defined
> +#endif
> +			loops = len >> 1;
> +			do
> +			    *sout++ = pat16;
> +			while (--loops);
> +			out = (unsigned char *)sout;
> +		    }
> +		    if (len & 1)
> +			*out++ = *from++;
>                  }
>              }
>              else if ((op & 64) == 0) {          /* 2nd level distance code */
> -- 
> 2.44.0
>
Tom Rini June 27, 2024, 7:40 p.m. UTC | #4
On Thu, Jun 27, 2024 at 01:34:55PM -0600, Tom Rini wrote:
> On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:
> 
> > Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> > brings a big performance regression in inflate_fast(), which leads
> > to watchdog timer reset on powerpc 8xx.
> > 
> > It looks like that commit does more than what it describe, it
> > especially removed an important optimisation that was doing copies
> > using halfwords instead of bytes. That unexpected change multiplied
> > by almost 4 the time spent in inflate_fast() and increased by 40%
> > the overall time needed to uncompress linux kernel image.
> > 
> > So partially revert that commit but keep post incrementation as it
> > is the initial purpose of said commit.
> > 
> > Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> > ---
> >  lib/zlib/inffast.c | 51 ++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 40 insertions(+), 11 deletions(-)
> 
> Both this, and my mostly revert lead to CI failures around compression
> tests:
> https://source.denx.de/u-boot/u-boot/-/jobs/859329

A full revert however, is fine.
Christophe Leroy June 30, 2024, 8:47 a.m. UTC | #5
Le 27/06/2024 à 21:40, Tom Rini a écrit :
> On Thu, Jun 27, 2024 at 01:34:55PM -0600, Tom Rini wrote:
>> On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:
>>
>>> Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
>>> brings a big performance regression in inflate_fast(), which leads
>>> to watchdog timer reset on powerpc 8xx.
>>>
>>> It looks like that commit does more than what it describe, it
>>> especially removed an important optimisation that was doing copies
>>> using halfwords instead of bytes. That unexpected change multiplied
>>> by almost 4 the time spent in inflate_fast() and increased by 40%
>>> the overall time needed to uncompress linux kernel image.
>>>
>>> So partially revert that commit but keep post incrementation as it
>>> is the initial purpose of said commit.
>>>
>>> Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>> ---
>>>   lib/zlib/inffast.c | 51 ++++++++++++++++++++++++++++++++++++----------
>>>   1 file changed, 40 insertions(+), 11 deletions(-)
>>
>> Both this, and my mostly revert lead to CI failures around compression
>> tests:
>> https://source.denx.de/u-boot/u-boot/-/jobs/859329
> 
> A full revert however, is fine.
> 

Is it that ? :

FAILED 
test/py/tests/test_ut.py::test_ut[ut_compression_compression_test_gzip]

It seems like when the optimisation was added by commit cd514aeb996e 
("zlib: Optimize decompression"), only the pre-increment implementation 
was available.

When POSTINC was added by commit e89516f031db ("zlib: split up to match 
original source tree"), I guess it was not verified because POSTINC is 
#undef by zlib.h.

With the following change I don't get the FAILED 
ut_compression_compression_test_gzip CI anymore 
(https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/jobs/859937)

diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
index c271d85ea1..5dc0574202 100644
--- a/lib/zlib/inffast.c
+++ b/lib/zlib/inffast.c
@@ -257,7 +257,7 @@ void inflate_fast(z_streamp strm, unsigned start)
                         sfrom = (unsigned short *)(from - OFF);
                         loops = len >> 1;
                         do
-                           PUP(sout) = get_unaligned(++sfrom);
+                           PUP(sout) = get_unaligned(sfrom++);
                         while (--loops);
                         out = (unsigned char *)sout + OFF;
                         from = (unsigned char *)sfrom + OFF;


Christophe
Tom Rini June 30, 2024, 2:54 p.m. UTC | #6
On Sun, Jun 30, 2024 at 10:47:06AM +0200, Christophe Leroy wrote:
> 
> 
> Le 27/06/2024 à 21:40, Tom Rini a écrit :
> > On Thu, Jun 27, 2024 at 01:34:55PM -0600, Tom Rini wrote:
> > > On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:
> > > 
> > > > Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> > > > brings a big performance regression in inflate_fast(), which leads
> > > > to watchdog timer reset on powerpc 8xx.
> > > > 
> > > > It looks like that commit does more than what it describe, it
> > > > especially removed an important optimisation that was doing copies
> > > > using halfwords instead of bytes. That unexpected change multiplied
> > > > by almost 4 the time spent in inflate_fast() and increased by 40%
> > > > the overall time needed to uncompress linux kernel image.
> > > > 
> > > > So partially revert that commit but keep post incrementation as it
> > > > is the initial purpose of said commit.
> > > > 
> > > > Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> > > > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> > > > ---
> > > >   lib/zlib/inffast.c | 51 ++++++++++++++++++++++++++++++++++++----------
> > > >   1 file changed, 40 insertions(+), 11 deletions(-)
> > > 
> > > Both this, and my mostly revert lead to CI failures around compression
> > > tests:
> > > https://source.denx.de/u-boot/u-boot/-/jobs/859329
> > 
> > A full revert however, is fine.
> > 
> 
> Is it that ? :
> 
> FAILED
> test/py/tests/test_ut.py::test_ut[ut_compression_compression_test_gzip]
> 
> It seems like when the optimisation was added by commit cd514aeb996e ("zlib:
> Optimize decompression"), only the pre-increment implementation was
> available.
> 
> When POSTINC was added by commit e89516f031db ("zlib: split up to match
> original source tree"), I guess it was not verified because POSTINC is
> #undef by zlib.h.
> 
> With the following change I don't get the FAILED
> ut_compression_compression_test_gzip CI anymore
> (https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/jobs/859937)
> 
> diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
> index c271d85ea1..5dc0574202 100644
> --- a/lib/zlib/inffast.c
> +++ b/lib/zlib/inffast.c
> @@ -257,7 +257,7 @@ void inflate_fast(z_streamp strm, unsigned start)
>                         sfrom = (unsigned short *)(from - OFF);
>                         loops = len >> 1;
>                         do
> -                           PUP(sout) = get_unaligned(++sfrom);
> +                           PUP(sout) = get_unaligned(sfrom++);
>                         while (--loops);
>                         out = (unsigned char *)sout + OFF;
>                         from = (unsigned char *)sfrom + OFF;

Ah, thanks! Testing again now.
Christophe Leroy June 30, 2024, 6:28 p.m. UTC | #7
Le 30/06/2024 à 16:54, Tom Rini a écrit :
> On Sun, Jun 30, 2024 at 10:47:06AM +0200, Christophe Leroy wrote:
>>
>>
>> Le 27/06/2024 à 21:40, Tom Rini a écrit :
>>> On Thu, Jun 27, 2024 at 01:34:55PM -0600, Tom Rini wrote:
>>>> On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:
>>>>
>>>>> Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
>>>>> brings a big performance regression in inflate_fast(), which leads
>>>>> to watchdog timer reset on powerpc 8xx.
>>>>>
>>>>> It looks like that commit does more than what it describe, it
>>>>> especially removed an important optimisation that was doing copies
>>>>> using halfwords instead of bytes. That unexpected change multiplied
>>>>> by almost 4 the time spent in inflate_fast() and increased by 40%
>>>>> the overall time needed to uncompress linux kernel image.
>>>>>
>>>>> So partially revert that commit but keep post incrementation as it
>>>>> is the initial purpose of said commit.
>>>>>
>>>>> Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
>>>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>>>> ---
>>>>>    lib/zlib/inffast.c | 51 ++++++++++++++++++++++++++++++++++++----------
>>>>>    1 file changed, 40 insertions(+), 11 deletions(-)
>>>>
>>>> Both this, and my mostly revert lead to CI failures around compression
>>>> tests:
>>>> https://source.denx.de/u-boot/u-boot/-/jobs/859329
>>>
>>> A full revert however, is fine.
>>>
>>
>> Is it that ? :
>>
>> FAILED
>> test/py/tests/test_ut.py::test_ut[ut_compression_compression_test_gzip]
>>
>> It seems like when the optimisation was added by commit cd514aeb996e ("zlib:
>> Optimize decompression"), only the pre-increment implementation was
>> available.
>>
>> When POSTINC was added by commit e89516f031db ("zlib: split up to match
>> original source tree"), I guess it was not verified because POSTINC is
>> #undef by zlib.h.
>>
>> With the following change I don't get the FAILED
>> ut_compression_compression_test_gzip CI anymore
>> (https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/jobs/859937)
>>
>> diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
>> index c271d85ea1..5dc0574202 100644
>> --- a/lib/zlib/inffast.c
>> +++ b/lib/zlib/inffast.c
>> @@ -257,7 +257,7 @@ void inflate_fast(z_streamp strm, unsigned start)
>>                          sfrom = (unsigned short *)(from - OFF);
>>                          loops = len >> 1;
>>                          do
>> -                           PUP(sout) = get_unaligned(++sfrom);
>> +                           PUP(sout) = get_unaligned(sfrom++);
>>                          while (--loops);
>>                          out = (unsigned char *)sout + OFF;
>>                          from = (unsigned char *)sfrom + OFF;
> 
> Ah, thanks! Testing again now.
> 

That's probably not enough. I thought other failures were unrelated but 
I gave it a try with a full revert and I get no failure at all with that 
so there must be other things.

I find that pat16 = *(sout-2+2*OFF) suspicious, sout being a short it 
means -4 bytes which is not what is expected I think.
Christophe Leroy June 30, 2024, 6:49 p.m. UTC | #8
Le 30/06/2024 à 20:28, Christophe Leroy a écrit :
> 
> 
> Le 30/06/2024 à 16:54, Tom Rini a écrit :
>> On Sun, Jun 30, 2024 at 10:47:06AM +0200, Christophe Leroy wrote:
>>>
>>>
>>> Le 27/06/2024 à 21:40, Tom Rini a écrit :
>>>> On Thu, Jun 27, 2024 at 01:34:55PM -0600, Tom Rini wrote:
>>>>> On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:
>>>>>
>>>>>> Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
>>>>>> brings a big performance regression in inflate_fast(), which leads
>>>>>> to watchdog timer reset on powerpc 8xx.
>>>>>>
>>>>>> It looks like that commit does more than what it describe, it
>>>>>> especially removed an important optimisation that was doing copies
>>>>>> using halfwords instead of bytes. That unexpected change multiplied
>>>>>> by almost 4 the time spent in inflate_fast() and increased by 40%
>>>>>> the overall time needed to uncompress linux kernel image.
>>>>>>
>>>>>> So partially revert that commit but keep post incrementation as it
>>>>>> is the initial purpose of said commit.
>>>>>>
>>>>>> Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
>>>>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>>>>> ---
>>>>>>    lib/zlib/inffast.c | 51 
>>>>>> ++++++++++++++++++++++++++++++++++++----------
>>>>>>    1 file changed, 40 insertions(+), 11 deletions(-)
>>>>>
>>>>> Both this, and my mostly revert lead to CI failures around compression
>>>>> tests:
>>>>> https://source.denx.de/u-boot/u-boot/-/jobs/859329
>>>>
>>>> A full revert however, is fine.
>>>>
>>>
>>> Is it that ? :
>>>
>>> FAILED
>>> test/py/tests/test_ut.py::test_ut[ut_compression_compression_test_gzip]
>>>
>>> It seems like when the optimisation was added by commit cd514aeb996e 
>>> ("zlib:
>>> Optimize decompression"), only the pre-increment implementation was
>>> available.
>>>
>>> When POSTINC was added by commit e89516f031db ("zlib: split up to match
>>> original source tree"), I guess it was not verified because POSTINC is
>>> #undef by zlib.h.
>>>
>>> With the following change I don't get the FAILED
>>> ut_compression_compression_test_gzip CI anymore
>>> (https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/jobs/859937)
>>>
>>> diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
>>> index c271d85ea1..5dc0574202 100644
>>> --- a/lib/zlib/inffast.c
>>> +++ b/lib/zlib/inffast.c
>>> @@ -257,7 +257,7 @@ void inflate_fast(z_streamp strm, unsigned start)
>>>                          sfrom = (unsigned short *)(from - OFF);
>>>                          loops = len >> 1;
>>>                          do
>>> -                           PUP(sout) = get_unaligned(++sfrom);
>>> +                           PUP(sout) = get_unaligned(sfrom++);
>>>                          while (--loops);
>>>                          out = (unsigned char *)sout + OFF;
>>>                          from = (unsigned char *)sfrom + OFF;
>>
>> Ah, thanks! Testing again now.
>>
> 
> That's probably not enough. I thought other failures were unrelated but 
> I gave it a try with a full revert and I get no failure at all with that 
> so there must be other things.
> 
> I find that pat16 = *(sout-2+2*OFF) suspicious, sout being a short it 
> means -4 bytes which is not what is expected I think.

That's it. I get all green with the following, see 
https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/pipelines/21391

diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
index c271d85ea1..69268caee8 100644
--- a/lib/zlib/inffast.c
+++ b/lib/zlib/inffast.c
@@ -257,14 +257,14 @@ void inflate_fast(z_streamp strm, unsigned start)
  			sfrom = (unsigned short *)(from - OFF);
  			loops = len >> 1;
  			do
-			    PUP(sout) = get_unaligned(++sfrom);
+			    PUP(sout) = get_unaligned(sfrom++);
  			while (--loops);
  			out = (unsigned char *)sout + OFF;
  			from = (unsigned char *)sfrom + OFF;
  		    } else { /* dist == 1 or dist == 2 */
  			unsigned short pat16;

-			pat16 = *(sout-2+2*OFF);
+			pat16 = *(sout-1+OFF);
  			if (dist == 1)
  #if defined(__BIG_ENDIAN)
  			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);

Christophe
Tom Rini July 1, 2024, 1:06 a.m. UTC | #9
On Sun, Jun 30, 2024 at 08:49:08PM +0200, Christophe Leroy wrote:
> 
> 
> Le 30/06/2024 à 20:28, Christophe Leroy a écrit :
> > 
> > 
> > Le 30/06/2024 à 16:54, Tom Rini a écrit :
> > > On Sun, Jun 30, 2024 at 10:47:06AM +0200, Christophe Leroy wrote:
> > > > 
> > > > 
> > > > Le 27/06/2024 à 21:40, Tom Rini a écrit :
> > > > > On Thu, Jun 27, 2024 at 01:34:55PM -0600, Tom Rini wrote:
> > > > > > On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:
> > > > > > 
> > > > > > > Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> > > > > > > brings a big performance regression in inflate_fast(), which leads
> > > > > > > to watchdog timer reset on powerpc 8xx.
> > > > > > > 
> > > > > > > It looks like that commit does more than what it describe, it
> > > > > > > especially removed an important optimisation that was doing copies
> > > > > > > using halfwords instead of bytes. That unexpected change multiplied
> > > > > > > by almost 4 the time spent in inflate_fast() and increased by 40%
> > > > > > > the overall time needed to uncompress linux kernel image.
> > > > > > > 
> > > > > > > So partially revert that commit but keep post incrementation as it
> > > > > > > is the initial purpose of said commit.
> > > > > > > 
> > > > > > > Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> > > > > > > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> > > > > > > ---
> > > > > > >    lib/zlib/inffast.c | 51
> > > > > > > ++++++++++++++++++++++++++++++++++++----------
> > > > > > >    1 file changed, 40 insertions(+), 11 deletions(-)
> > > > > > 
> > > > > > Both this, and my mostly revert lead to CI failures around compression
> > > > > > tests:
> > > > > > https://source.denx.de/u-boot/u-boot/-/jobs/859329
> > > > > 
> > > > > A full revert however, is fine.
> > > > > 
> > > > 
> > > > Is it that ? :
> > > > 
> > > > FAILED
> > > > test/py/tests/test_ut.py::test_ut[ut_compression_compression_test_gzip]
> > > > 
> > > > It seems like when the optimisation was added by commit
> > > > cd514aeb996e ("zlib:
> > > > Optimize decompression"), only the pre-increment implementation was
> > > > available.
> > > > 
> > > > When POSTINC was added by commit e89516f031db ("zlib: split up to match
> > > > original source tree"), I guess it was not verified because POSTINC is
> > > > #undef by zlib.h.
> > > > 
> > > > With the following change I don't get the FAILED
> > > > ut_compression_compression_test_gzip CI anymore
> > > > (https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/jobs/859937)
> > > > 
> > > > diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
> > > > index c271d85ea1..5dc0574202 100644
> > > > --- a/lib/zlib/inffast.c
> > > > +++ b/lib/zlib/inffast.c
> > > > @@ -257,7 +257,7 @@ void inflate_fast(z_streamp strm, unsigned start)
> > > >                          sfrom = (unsigned short *)(from - OFF);
> > > >                          loops = len >> 1;
> > > >                          do
> > > > -                           PUP(sout) = get_unaligned(++sfrom);
> > > > +                           PUP(sout) = get_unaligned(sfrom++);
> > > >                          while (--loops);
> > > >                          out = (unsigned char *)sout + OFF;
> > > >                          from = (unsigned char *)sfrom + OFF;
> > > 
> > > Ah, thanks! Testing again now.
> > > 
> > 
> > That's probably not enough. I thought other failures were unrelated but
> > I gave it a try with a full revert and I get no failure at all with that
> > so there must be other things.
> > 
> > I find that pat16 = *(sout-2+2*OFF) suspicious, sout being a short it
> > means -4 bytes which is not what is expected I think.
> 
> That's it. I get all green with the following, see
> https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/pipelines/21391
> 
> diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
> index c271d85ea1..69268caee8 100644
> --- a/lib/zlib/inffast.c
> +++ b/lib/zlib/inffast.c
> @@ -257,14 +257,14 @@ void inflate_fast(z_streamp strm, unsigned start)
>  			sfrom = (unsigned short *)(from - OFF);
>  			loops = len >> 1;
>  			do
> -			    PUP(sout) = get_unaligned(++sfrom);
> +			    PUP(sout) = get_unaligned(sfrom++);
>  			while (--loops);
>  			out = (unsigned char *)sout + OFF;
>  			from = (unsigned char *)sfrom + OFF;
>  		    } else { /* dist == 1 or dist == 2 */
>  			unsigned short pat16;
> 
> -			pat16 = *(sout-2+2*OFF);
> +			pat16 = *(sout-1+OFF);
>  			if (dist == 1)
>  #if defined(__BIG_ENDIAN)
>  			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
> 

OK, so at this point we're changing a lot of common paths, to things
that haven't been tested before. I suspect it's all fine, but for the
release on July 1 I'm going to revert the zlib update, and then we can
perform these corrections and re-introduce the optimization after
release, when they'll get the most amount of testing.
Tom Rini July 5, 2024, 3:47 p.m. UTC | #10
On Sun, Jun 30, 2024 at 08:49:08PM +0200, Christophe Leroy wrote:
> 
> 
> Le 30/06/2024 à 20:28, Christophe Leroy a écrit :
> > 
> > 
> > Le 30/06/2024 à 16:54, Tom Rini a écrit :
> > > On Sun, Jun 30, 2024 at 10:47:06AM +0200, Christophe Leroy wrote:
> > > > 
> > > > 
> > > > Le 27/06/2024 à 21:40, Tom Rini a écrit :
> > > > > On Thu, Jun 27, 2024 at 01:34:55PM -0600, Tom Rini wrote:
> > > > > > On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:
> > > > > > 
> > > > > > > Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> > > > > > > brings a big performance regression in inflate_fast(), which leads
> > > > > > > to watchdog timer reset on powerpc 8xx.
> > > > > > > 
> > > > > > > It looks like that commit does more than what it describe, it
> > > > > > > especially removed an important optimisation that was doing copies
> > > > > > > using halfwords instead of bytes. That unexpected change multiplied
> > > > > > > by almost 4 the time spent in inflate_fast() and increased by 40%
> > > > > > > the overall time needed to uncompress linux kernel image.
> > > > > > > 
> > > > > > > So partially revert that commit but keep post incrementation as it
> > > > > > > is the initial purpose of said commit.
> > > > > > > 
> > > > > > > Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
> > > > > > > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> > > > > > > ---
> > > > > > >    lib/zlib/inffast.c | 51
> > > > > > > ++++++++++++++++++++++++++++++++++++----------
> > > > > > >    1 file changed, 40 insertions(+), 11 deletions(-)
> > > > > > 
> > > > > > Both this, and my mostly revert lead to CI failures around compression
> > > > > > tests:
> > > > > > https://source.denx.de/u-boot/u-boot/-/jobs/859329
> > > > > 
> > > > > A full revert however, is fine.
> > > > > 
> > > > 
> > > > Is it that ? :
> > > > 
> > > > FAILED
> > > > test/py/tests/test_ut.py::test_ut[ut_compression_compression_test_gzip]
> > > > 
> > > > It seems like when the optimisation was added by commit
> > > > cd514aeb996e ("zlib:
> > > > Optimize decompression"), only the pre-increment implementation was
> > > > available.
> > > > 
> > > > When POSTINC was added by commit e89516f031db ("zlib: split up to match
> > > > original source tree"), I guess it was not verified because POSTINC is
> > > > #undef by zlib.h.
> > > > 
> > > > With the following change I don't get the FAILED
> > > > ut_compression_compression_test_gzip CI anymore
> > > > (https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/jobs/859937)
> > > > 
> > > > diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
> > > > index c271d85ea1..5dc0574202 100644
> > > > --- a/lib/zlib/inffast.c
> > > > +++ b/lib/zlib/inffast.c
> > > > @@ -257,7 +257,7 @@ void inflate_fast(z_streamp strm, unsigned start)
> > > >                          sfrom = (unsigned short *)(from - OFF);
> > > >                          loops = len >> 1;
> > > >                          do
> > > > -                           PUP(sout) = get_unaligned(++sfrom);
> > > > +                           PUP(sout) = get_unaligned(sfrom++);
> > > >                          while (--loops);
> > > >                          out = (unsigned char *)sout + OFF;
> > > >                          from = (unsigned char *)sfrom + OFF;
> > > 
> > > Ah, thanks! Testing again now.
> > > 
> > 
> > That's probably not enough. I thought other failures were unrelated but
> > I gave it a try with a full revert and I get no failure at all with that
> > so there must be other things.
> > 
> > I find that pat16 = *(sout-2+2*OFF) suspicious, sout being a short it
> > means -4 bytes which is not what is expected I think.
> 
> That's it. I get all green with the following, see
> https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/pipelines/21391
> 
> diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
> index c271d85ea1..69268caee8 100644
> --- a/lib/zlib/inffast.c
> +++ b/lib/zlib/inffast.c
> @@ -257,14 +257,14 @@ void inflate_fast(z_streamp strm, unsigned start)
>  			sfrom = (unsigned short *)(from - OFF);
>  			loops = len >> 1;
>  			do
> -			    PUP(sout) = get_unaligned(++sfrom);
> +			    PUP(sout) = get_unaligned(sfrom++);
>  			while (--loops);
>  			out = (unsigned char *)sout + OFF;
>  			from = (unsigned char *)sfrom + OFF;
>  		    } else { /* dist == 1 or dist == 2 */
>  			unsigned short pat16;
> 
> -			pat16 = *(sout-2+2*OFF);
> +			pat16 = *(sout-1+OFF);
>  			if (dist == 1)
>  #if defined(__BIG_ENDIAN)
>  			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);

Now that the release is out, can you please do a new patch to
re-introduce the optimization required on your platform? A revert of
"bbacdd3ef7762fbdeab43ceea5205d1fd0f25bbd" is required first (and
separate) and I'll push that shortly. If you don't have time currently,
I can try and v2 your original patch with these changes included, thanks!
Christophe Leroy July 8, 2024, 6:44 a.m. UTC | #11
Le 05/07/2024 à 17:47, Tom Rini a écrit :
> On Sun, Jun 30, 2024 at 08:49:08PM +0200, Christophe Leroy wrote:
>>
>>
>> Le 30/06/2024 à 20:28, Christophe Leroy a écrit :
>>>
>>>
>>> Le 30/06/2024 à 16:54, Tom Rini a écrit :
>>>> On Sun, Jun 30, 2024 at 10:47:06AM +0200, Christophe Leroy wrote:
>>>>>
>>>>>
>>>>> Le 27/06/2024 à 21:40, Tom Rini a écrit :
>>>>>> On Thu, Jun 27, 2024 at 01:34:55PM -0600, Tom Rini wrote:
>>>>>>> On Thu, Jun 27, 2024 at 10:25:21AM +0200, Christophe Leroy wrote:
>>>>>>>
>>>>>>>> Commit 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
>>>>>>>> brings a big performance regression in inflate_fast(), which leads
>>>>>>>> to watchdog timer reset on powerpc 8xx.
>>>>>>>>
>>>>>>>> It looks like that commit does more than what it describe, it
>>>>>>>> especially removed an important optimisation that was doing copies
>>>>>>>> using halfwords instead of bytes. That unexpected change multiplied
>>>>>>>> by almost 4 the time spent in inflate_fast() and increased by 40%
>>>>>>>> the overall time needed to uncompress linux kernel image.
>>>>>>>>
>>>>>>>> So partially revert that commit but keep post incrementation as it
>>>>>>>> is the initial purpose of said commit.
>>>>>>>>
>>>>>>>> Fixes: 340fdf1303dc ("zlib: Port fix for CVE-2016-9841 to U-Boot")
>>>>>>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>>>>>>> ---
>>>>>>>>     lib/zlib/inffast.c | 51
>>>>>>>> ++++++++++++++++++++++++++++++++++++----------
>>>>>>>>     1 file changed, 40 insertions(+), 11 deletions(-)
>>>>>>>
>>>>>>> Both this, and my mostly revert lead to CI failures around compression
>>>>>>> tests:
>>>>>>> https://source.denx.de/u-boot/u-boot/-/jobs/859329
>>>>>>
>>>>>> A full revert however, is fine.
>>>>>>
>>>>>
>>>>> Is it that ? :
>>>>>
>>>>> FAILED
>>>>> test/py/tests/test_ut.py::test_ut[ut_compression_compression_test_gzip]
>>>>>
>>>>> It seems like when the optimisation was added by commit
>>>>> cd514aeb996e ("zlib:
>>>>> Optimize decompression"), only the pre-increment implementation was
>>>>> available.
>>>>>
>>>>> When POSTINC was added by commit e89516f031db ("zlib: split up to match
>>>>> original source tree"), I guess it was not verified because POSTINC is
>>>>> #undef by zlib.h.
>>>>>
>>>>> With the following change I don't get the FAILED
>>>>> ut_compression_compression_test_gzip CI anymore
>>>>> (https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/jobs/859937)
>>>>>
>>>>> diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
>>>>> index c271d85ea1..5dc0574202 100644
>>>>> --- a/lib/zlib/inffast.c
>>>>> +++ b/lib/zlib/inffast.c
>>>>> @@ -257,7 +257,7 @@ void inflate_fast(z_streamp strm, unsigned start)
>>>>>                           sfrom = (unsigned short *)(from - OFF);
>>>>>                           loops = len >> 1;
>>>>>                           do
>>>>> -                           PUP(sout) = get_unaligned(++sfrom);
>>>>> +                           PUP(sout) = get_unaligned(sfrom++);
>>>>>                           while (--loops);
>>>>>                           out = (unsigned char *)sout + OFF;
>>>>>                           from = (unsigned char *)sfrom + OFF;
>>>>
>>>> Ah, thanks! Testing again now.
>>>>
>>>
>>> That's probably not enough. I thought other failures were unrelated but
>>> I gave it a try with a full revert and I get no failure at all with that
>>> so there must be other things.
>>>
>>> I find that pat16 = *(sout-2+2*OFF) suspicious, sout being a short it
>>> means -4 bytes which is not what is expected I think.
>>
>> That's it. I get all green with the following, see
>> https://source.denx.de/u-boot/custodians/u-boot-mpc8xx/-/pipelines/21391
>>
>> diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
>> index c271d85ea1..69268caee8 100644
>> --- a/lib/zlib/inffast.c
>> +++ b/lib/zlib/inffast.c
>> @@ -257,14 +257,14 @@ void inflate_fast(z_streamp strm, unsigned start)
>>   			sfrom = (unsigned short *)(from - OFF);
>>   			loops = len >> 1;
>>   			do
>> -			    PUP(sout) = get_unaligned(++sfrom);
>> +			    PUP(sout) = get_unaligned(sfrom++);
>>   			while (--loops);
>>   			out = (unsigned char *)sout + OFF;
>>   			from = (unsigned char *)sfrom + OFF;
>>   		    } else { /* dist == 1 or dist == 2 */
>>   			unsigned short pat16;
>>
>> -			pat16 = *(sout-2+2*OFF);
>> +			pat16 = *(sout-1+OFF);
>>   			if (dist == 1)
>>   #if defined(__BIG_ENDIAN)
>>   			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
> 
> Now that the release is out, can you please do a new patch to
> re-introduce the optimization required on your platform? A revert of
> "bbacdd3ef7762fbdeab43ceea5205d1fd0f25bbd" is required first (and
> separate) and I'll push that shortly. If you don't have time currently,
> I can try and v2 your original patch with these changes included, thanks!
> 

I will try to send something tomorrow.
diff mbox series

Patch

diff --git a/lib/zlib/inffast.c b/lib/zlib/inffast.c
index 5e2a65ad4d..c4b1d12258 100644
--- a/lib/zlib/inffast.c
+++ b/lib/zlib/inffast.c
@@ -236,18 +236,47 @@  unsigned start;         /* inflate()'s starting value for strm->avail_out */
                     }
                 }
                 else {
+		    unsigned short *sout;
+		    unsigned long loops;
+
                     from = out - dist;          /* copy direct from output */
-                    do {                        /* minimum length is three */
-                        *out++ = *from++;
-                        *out++ = *from++;
-                        *out++ = *from++;
-                        len -= 3;
-                    } while (len > 2);
-                    if (len) {
-                        *out++ = *from++;
-                        if (len > 1)
-                            *out++ = *from++;
-                    }
+                    /* minimum length is three */
+		    /* Align out addr */
+		    if (!((long)(out - 1) & 1)) {
+			*out++ = *from++;
+			len--;
+		    }
+		    sout = (unsigned short *)out;
+		    if (dist > 2 ) {
+			unsigned short *sfrom;
+
+			sfrom = (unsigned short *)from;
+			loops = len >> 1;
+			do
+			    *sout++ = get_unaligned(++sfrom);
+			while (--loops);
+			out = (unsigned char *)sout;
+			from = (unsigned char *)sfrom;
+		    } else { /* dist == 1 or dist == 2 */
+			unsigned short pat16;
+
+			pat16 = *(sout-2);
+			if (dist == 1)
+#if defined(__BIG_ENDIAN)
+			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
+#elif defined(__LITTLE_ENDIAN)
+			    pat16 = (pat16 & 0xff00) | ((pat16 & 0xff00 ) >> 8);
+#else
+#error __BIG_ENDIAN nor __LITTLE_ENDIAN is defined
+#endif
+			loops = len >> 1;
+			do
+			    *sout++ = pat16;
+			while (--loops);
+			out = (unsigned char *)sout;
+		    }
+		    if (len & 1)
+			*out++ = *from++;
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */