Message ID | 20220226070551.9833-3-jnhuang95@gmail.com |
---|---|
State | Accepted |
Commit | 26c7fdadcb7b829bc033ec8d0148385dd407f67b |
Delegated to: | Tom Rini |
Headers | show |
Series | fs/erofs: new filesystem | expand |
On Sat, Feb 26, 2022 at 03:05:48PM +0800, Huang Jianan wrote: > Update the LZ4 compression module based on LZ4 v1.8.3 in order to > use the newest LZ4_decompress_safe_partial() which can now decode > exactly the nb of bytes requested. > > Signed-off-by: Huang Jianan <jnhuang95@gmail.com> Applied to u-boot/next, thanks!
Hi Jianan, On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuang95@gmail.com> wrote: > > Update the LZ4 compression module based on LZ4 v1.8.3 in order to > use the newest LZ4_decompress_safe_partial() which can now decode > exactly the nb of bytes requested. > > Signed-off-by: Huang Jianan <jnhuang95@gmail.com> I noticed after this commit LZ4 decompression is slower. ulz4fn function call takes 1.209670 seconds with this commit. After reverting this commit, the ulz4fn function call takes 0.587032 seconds. I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4 using -9 option for maximum compression) on RK3399. Any ideas why it is slower with this commit and how the performance regression can be fixed? Thanks. Regards, Jonathan
Hi, On 2024/5/24 22:26, Jonathan Liu wrote: > Hi Jianan, > > On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuang95@gmail.com> wrote: >> >> Update the LZ4 compression module based on LZ4 v1.8.3 in order to >> use the newest LZ4_decompress_safe_partial() which can now decode >> exactly the nb of bytes requested. >> >> Signed-off-by: Huang Jianan <jnhuang95@gmail.com> > > I noticed after this commit LZ4 decompression is slower. > ulz4fn function call takes 1.209670 seconds with this commit. > After reverting this commit, the ulz4fn function call takes 0.587032 seconds. > > I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4 > using -9 option for maximum compression) on RK3399. > > Any ideas why it is slower with this commit and how the performance > regression can be fixed? Just the quick glance, I think the issue may be due to memcpy/memmove since it seems the main difference between these two codebases (I'm not sure which LZ4 version the old codebase was based on) and the new version mainly relies on memcpy/memmove instead of its own versions. Would you mind to check the assembly how memcpy/memset is generated on your platform? Thanks, Gao Xiang > > Thanks. > > Regards, > Jonathan
Hi Gao, On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiangkao@linux.alibaba.com> wrote: > > Hi, > > On 2024/5/24 22:26, Jonathan Liu wrote: > > Hi Jianan, > > > > On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuang95@gmail.com> wrote: > >> > >> Update the LZ4 compression module based on LZ4 v1.8.3 in order to > >> use the newest LZ4_decompress_safe_partial() which can now decode > >> exactly the nb of bytes requested. > >> > >> Signed-off-by: Huang Jianan <jnhuang95@gmail.com> > > > > I noticed after this commit LZ4 decompression is slower. > > ulz4fn function call takes 1.209670 seconds with this commit. > > After reverting this commit, the ulz4fn function call takes 0.587032 seconds. > > > > I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4 > > using -9 option for maximum compression) on RK3399. > > > > Any ideas why it is slower with this commit and how the performance > > regression can be fixed? > > Just the quick glance, I think the issue may be due to memcpy/memmove > since it seems the main difference between these two codebases > (I'm not sure which LZ4 version the old codebase was based on) and > the new version mainly relies on memcpy/memmove instead of its own > versions. > > Would you mind to check the assembly how memcpy/memset is generated > on your platform? 
Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto): 000000000028220c <memset>: #if !CONFIG_IS_ENABLED(TINY_MEMSET) unsigned long cl = 0; int i; /* do it one word at a time (32 bits or 64 bits) while possible */ if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) { 28220c: f2400803 ands x3, x0, #0x7 282210: 540002c1 b.ne 282268 <memset+0x5c> // b.any for (i = 0; i < sizeof(*sl); i++) { cl <<= 8; cl |= c & 0xff; 282214: 92401c26 and x6, x1, #0xff unsigned long cl = 0; 282218: d2800004 mov x4, #0x0 // #0 28221c: 52800105 mov w5, #0x8 // #8 cl |= c & 0xff; 282220: aa0420c4 orr x4, x6, x4, lsl #8 for (i = 0; i < sizeof(*sl); i++) { 282224: 710004a5 subs w5, w5, #0x1 282228: 54ffffc1 b.ne 282220 <memset+0x14> // b.any } while (count >= sizeof(*sl)) { 28222c: cb030045 sub x5, x2, x3 282230: f1001cbf cmp x5, #0x7 282234: 54000148 b.hi 28225c <memset+0x50> // b.pmore 282238: d343fc43 lsr x3, x2, #3 28223c: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 282240: 9b047c63 mul x3, x3, x4 282244: 8b030042 add x2, x2, x3 282248: cb030003 sub x3, x0, x3 unsigned long *sl = (unsigned long *) s; 28224c: d2800004 mov x4, #0x0 // #0 count -= sizeof(*sl); } } #endif /* fill 8 bits at a time */ s8 = (char *)sl; while (count--) 282250: eb04005f cmp x2, x4 282254: 540000e1 b.ne 282270 <memset+0x64> // b.any *s8++ = c; return s; } 282258: d65f03c0 ret *sl++ = cl; 28225c: f8236804 str x4, [x0, x3] count -= sizeof(*sl); 282260: 91002063 add x3, x3, #0x8 282264: 17fffff2 b 28222c <memset+0x20> unsigned long *sl = (unsigned long *) s; 282268: aa0003e3 mov x3, x0 28226c: 17fffff8 b 28224c <memset+0x40> *s8++ = c; 282270: 38246861 strb w1, [x3, x4] 282274: 91000484 add x4, x4, #0x1 282278: 17fffff6 b 282250 <memset+0x44> 000000000028227c <memcpy>: __used void * memcpy(void *dest, const void *src, size_t count) { unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src; char *d8, *s8; if (src == dest) 28227c: eb01001f cmp x0, x1 282280: 54000100 b.eq 2822a0 <memcpy+0x24> 
// b.none return dest; /* while all data is aligned (common case), copy a word at a time */ if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) { 282284: aa010003 orr x3, x0, x1 282288: f2400863 ands x3, x3, #0x7 28228c: 54000120 b.eq 2822b0 <memcpy+0x34> // b.none 282290: aa0003e4 mov x4, x0 282294: d2800003 mov x3, #0x0 // #0 } } /* copy the reset one byte at a time */ d8 = (char *)dl; s8 = (char *)sl; while (count--) 282298: eb03005f cmp x2, x3 28229c: 540001e1 b.ne 2822d8 <memcpy+0x5c> // b.any *d8++ = *s8++; return dest; } 2822a0: d65f03c0 ret *dl++ = *sl++; 2822a4: f8636824 ldr x4, [x1, x3] 2822a8: f8236804 str x4, [x0, x3] count -= sizeof(*dl); 2822ac: 91002063 add x3, x3, #0x8 while (count >= sizeof(*dl)) { 2822b0: cb030044 sub x4, x2, x3 2822b4: f1001c9f cmp x4, #0x7 2822b8: 54ffff68 b.hi 2822a4 <memcpy+0x28> // b.pmore 2822bc: d343fc43 lsr x3, x2, #3 2822c0: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 2822c4: 9b047c63 mul x3, x3, x4 2822c8: 8b030042 add x2, x2, x3 2822cc: cb030004 sub x4, x0, x3 2822d0: cb030021 sub x1, x1, x3 2822d4: 17fffff0 b 282294 <memcpy+0x18> *d8++ = *s8++; 2822d8: 38636825 ldrb w5, [x1, x3] 2822dc: 38236885 strb w5, [x4, x3] 2822e0: 91000463 add x3, x3, #0x1 2822e4: 17ffffed b 282298 <memcpy+0x1c> I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in the .config (but leaving it disabled in SPL/TPL) and it results in Synchronous Abort: U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000) Trying to boot from MMC1 NOTICE: BL31: v1.3(release):8f40012ab NOTICE: BL31: Built : 14:20:53, Feb 16 2023 NOTICE: BL31: Rockchip release version: v1.1 INFO: GICv3 with legacy support detected. ARM GICV3 driver initialized in EL3 INFO: Using opteed sec cpu_context! INFO: boot cpu mask: 0 INFO: plat_rockchip_pmu_init(1203): pd status 3e INFO: BL31: Initializing runtime services WARNING: No OPTEE provided by BL2 boot loader, Booting device without OPTEE initialization. 
SMC`s destined for OPTEE will return SMC_UNK ERROR: Error initializing runtime service opteed_fast INFO: BL31: Preparing for EL3 exit to normal world INFO: Entry point address = 0x200000 INFO: SPSR = 0x3c9 "Synchronous Abort" handler, esr 0x96000021, far 0x2957e1 elr: 000000000020233c lr : 000000000026a388 x0 : 00000000002fbf38 x1 : 00000000002957e1 x2 : 0000000000000008 x3 : 0000000000000065 x4 : 00000000002957e9 x5 : 00000000002fbf40 x6 : 0000000000000065 x7 : 0000000000000003 x8 : 00000000002c7960 x9 : 000000000000ffd0 x10: 00000000002fbc5c x11: 00000000000132e8 x12: 00000000002fbce8 x13: 00000000002c7960 x14: 00000000002c7960 x15: 0000000000000000 x16: 0000000000000000 x17: 0000000000000000 x18: 00000000002fbe30 x19: 0000000000000007 x20: 00000000002957d8 x21: 0000000000000009 x22: 000000000029d189 x23: 0000000000000020 x24: 00000000002fbf38 x25: 00000000002957e7 x26: 00000000002957b2 x27: 0000000000007fff x28: 0000000000000000 x29: 00000000002fbcc0 Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026) Resetting CPU ... resetting ... > > Thanks, > Gao Xiang Regards, Jonathan
Hi Jonathan,
Could you please try the following patch? It replaces all memcpy() calls
in lz4 with __builtin_memcpy().
diff --git a/lib/lz4.c b/lib/lz4.c
index d365dc727c..2afe31c1c3 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -34,6 +34,8 @@
#include <asm/unaligned.h>
#include <u-boot/lz4.h>
+#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
+
#define FORCE_INLINE inline __attribute__((always_inline))
static FORCE_INLINE u16 LZ4_readLE16(const void *src)
@@ -215,7 +217,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
&& likely((endOnInput ? ip < shortiend : 1) &
(op <= shortoend))) {
/* Copy the literals */
- memcpy(op, ip, endOnInput ? 16 : 8);
+ LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
op += length; ip += length;
/*
@@ -234,9 +236,9 @@ static FORCE_INLINE int LZ4_decompress_generic(
(offset >= 8) &&
(dict == withPrefix64k || match >= lowPrefix)) {
/* Copy the match. */
- memcpy(op + 0, match + 0, 8);
- memcpy(op + 8, match + 8, 8);
- memcpy(op + 16, match + 16, 2);
+ LZ4_memcpy(op + 0, match + 0, 8);
+ LZ4_memcpy(op + 8, match + 8, 8);
+ LZ4_memcpy(op + 16, match + 16, 2);
op += length + MINMATCH;
/* Both stages worked, load the next token. */
continue;
@@ -416,7 +418,7 @@ _copy_match:
size_t const copySize = (size_t)(lowPrefix - match);
size_t const restSize = length - copySize;
- memcpy(op, dictEnd - copySize, copySize);
+ LZ4_memcpy(op, dictEnd - copySize, copySize);
op += copySize;
if (restSize > (size_t)(op - lowPrefix)) {
/* overlap copy */
@@ -426,7 +428,7 @@ _copy_match:
while (op < endOfMatch)
*op++ = *copyFrom++;
} else {
- memcpy(op, lowPrefix, restSize);
+ LZ4_memcpy(op, lowPrefix, restSize);
op += restSize;
}
}
@@ -452,7 +454,7 @@ _copy_match:
while (op < copyEnd)
*op++ = *match++;
} else {
- memcpy(op, match, mlen);
+ LZ4_memcpy(op, match, mlen);
}
op = copyEnd;
if (op == oend)
@@ -466,7 +468,7 @@ _copy_match:
op[2] = match[2];
op[3] = match[3];
match += inc32table[offset];
- memcpy(op + 4, match, 4);
+ LZ4_memcpy(op + 4, match, 4);
match -= dec64table[offset];
} else {
LZ4_copy8(op, match);
diff --git a/lib/lz4_wrapper.c b/lib/lz4_wrapper.c
index 4d48e7b0e8..e09c8d7057 100644
--- a/lib/lz4_wrapper.c
+++ b/lib/lz4_wrapper.c
@@ -80,7 +80,7 @@ int ulz4fn(const void *src, size_t srcn, void *dst,
size_t *dstn)
if (block_header & LZ4F_BLOCKUNCOMPRESSED_FLAG) {
size_t size = min((ptrdiff_t)block_size, (ptrdiff_t)(end - out));
- memcpy(out, in, size);
+ LZ4_memcpy(out, in, size);
out += size;
if (size < block_size) {
ret = -ENOBUFS; /* output overrun */
Thanks,
Jianan
On 2024/5/26 16:06, Jonathan Liu wrote:
Hi Gao,
On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiangkao@linux.alibaba.com>
<hsiangkao@linux.alibaba.com> wrote:
Hi,
On 2024/5/24 22:26, Jonathan Liu wrote:
Hi Jianan,
On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuang95@gmail.com>
<jnhuang95@gmail.com> wrote:
Update the LZ4 compression module based on LZ4 v1.8.3 in order to
use the newest LZ4_decompress_safe_partial() which can now decode
exactly the nb of bytes requested.
Signed-off-by: Huang Jianan <jnhuang95@gmail.com> <jnhuang95@gmail.com>
I noticed after this commit LZ4 decompression is slower.
ulz4fn function call takes 1.209670 seconds with this commit.
After reverting this commit, the ulz4fn function call takes 0.587032 seconds.
I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4
using -9 option for maximum compression) on RK3399.
Any ideas why it is slower with this commit and how the performance
regression can be fixed?
Just the quick glance, I think the issue may be due to memcpy/memmove
since it seems the main difference between these two codebases
(I'm not sure which LZ4 version the old codebase was based on) and
the new version mainly relies on memcpy/memmove instead of its own
versions.
Would you mind to check the assembly how memcpy/memset is generated
on your platform?
Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto):
000000000028220c <memset>:
#if !CONFIG_IS_ENABLED(TINY_MEMSET)
unsigned long cl = 0;
int i;
/* do it one word at a time (32 bits or 64 bits) while possible */
if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) {
28220c: f2400803 ands x3, x0, #0x7
282210: 540002c1 b.ne 282268 <memset+0x5c> // b.any
for (i = 0; i < sizeof(*sl); i++) {
cl <<= 8;
cl |= c & 0xff;
282214: 92401c26 and x6, x1, #0xff
unsigned long cl = 0;
282218: d2800004 mov x4, #0x0 // #0
28221c: 52800105 mov w5, #0x8 // #8
cl |= c & 0xff;
282220: aa0420c4 orr x4, x6, x4, lsl #8
for (i = 0; i < sizeof(*sl); i++) {
282224: 710004a5 subs w5, w5, #0x1
282228: 54ffffc1 b.ne 282220 <memset+0x14> // b.any
}
while (count >= sizeof(*sl)) {
28222c: cb030045 sub x5, x2, x3
282230: f1001cbf cmp x5, #0x7
282234: 54000148 b.hi 28225c <memset+0x50> // b.pmore
282238: d343fc43 lsr x3, x2, #3
28223c: 928000e4 mov x4, #0xfffffffffffffff8 // #-8
282240: 9b047c63 mul x3, x3, x4
282244: 8b030042 add x2, x2, x3
282248: cb030003 sub x3, x0, x3
unsigned long *sl = (unsigned long *) s;
28224c: d2800004 mov x4, #0x0 // #0
count -= sizeof(*sl);
}
}
#endif /* fill 8 bits at a time */
s8 = (char *)sl;
while (count--)
282250: eb04005f cmp x2, x4
282254: 540000e1 b.ne 282270 <memset+0x64> // b.any
*s8++ = c;
return s;
}
282258: d65f03c0 ret
*sl++ = cl;
28225c: f8236804 str x4, [x0, x3]
count -= sizeof(*sl);
282260: 91002063 add x3, x3, #0x8
282264: 17fffff2 b 28222c <memset+0x20>
unsigned long *sl = (unsigned long *) s;
282268: aa0003e3 mov x3, x0
28226c: 17fffff8 b 28224c <memset+0x40>
*s8++ = c;
282270: 38246861 strb w1, [x3, x4]
282274: 91000484 add x4, x4, #0x1
282278: 17fffff6 b 282250 <memset+0x44>
000000000028227c <memcpy>:
__used void * memcpy(void *dest, const void *src, size_t count)
{
unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src;
char *d8, *s8;
if (src == dest)
28227c: eb01001f cmp x0, x1
282280: 54000100 b.eq 2822a0 <memcpy+0x24> // b.none
return dest;
/* while all data is aligned (common case), copy a word at a time */
if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) {
282284: aa010003 orr x3, x0, x1
282288: f2400863 ands x3, x3, #0x7
28228c: 54000120 b.eq 2822b0 <memcpy+0x34> // b.none
282290: aa0003e4 mov x4, x0
282294: d2800003 mov x3, #0x0 // #0
}
}
/* copy the reset one byte at a time */
d8 = (char *)dl;
s8 = (char *)sl;
while (count--)
282298: eb03005f cmp x2, x3
28229c: 540001e1 b.ne 2822d8 <memcpy+0x5c> // b.any
*d8++ = *s8++;
return dest;
}
2822a0: d65f03c0 ret
*dl++ = *sl++;
2822a4: f8636824 ldr x4, [x1, x3]
2822a8: f8236804 str x4, [x0, x3]
count -= sizeof(*dl);
2822ac: 91002063 add x3, x3, #0x8
while (count >= sizeof(*dl)) {
2822b0: cb030044 sub x4, x2, x3
2822b4: f1001c9f cmp x4, #0x7
2822b8: 54ffff68 b.hi 2822a4 <memcpy+0x28> // b.pmore
2822bc: d343fc43 lsr x3, x2, #3
2822c0: 928000e4 mov x4, #0xfffffffffffffff8 // #-8
2822c4: 9b047c63 mul x3, x3, x4
2822c8: 8b030042 add x2, x2, x3
2822cc: cb030004 sub x4, x0, x3
2822d0: cb030021 sub x1, x1, x3
2822d4: 17fffff0 b 282294 <memcpy+0x18>
*d8++ = *s8++;
2822d8: 38636825 ldrb w5, [x1, x3]
2822dc: 38236885 strb w5, [x4, x3]
2822e0: 91000463 add x3, x3, #0x1
2822e4: 17ffffed b 282298 <memcpy+0x1c>
I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in
the .config (but leaving it disabled in SPL/TPL) and it results in
Synchronous Abort:
U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000)
Trying to boot from MMC1
NOTICE: BL31: v1.3(release):8f40012ab
NOTICE: BL31: Built : 14:20:53, Feb 16 2023
NOTICE: BL31: Rockchip release version: v1.1
INFO: GICv3 with legacy support detected. ARM GICV3 driver initialized in EL3
INFO: Using opteed sec cpu_context!
INFO: boot cpu mask: 0
INFO: plat_rockchip_pmu_init(1203): pd status 3e
INFO: BL31: Initializing runtime services
WARNING: No OPTEE provided by BL2 boot loader, Booting device without
OPTEE initialization. SMC`s destined for OPTEE will return SMC_UNK
ERROR: Error initializing runtime service opteed_fast
INFO: BL31: Preparing for EL3 exit to normal world
INFO: Entry point address = 0x200000
INFO: SPSR = 0x3c9
"Synchronous Abort" handler, esr 0x96000021, far 0x2957e1
elr: 000000000020233c lr : 000000000026a388
x0 : 00000000002fbf38 x1 : 00000000002957e1
x2 : 0000000000000008 x3 : 0000000000000065
x4 : 00000000002957e9 x5 : 00000000002fbf40
x6 : 0000000000000065 x7 : 0000000000000003
x8 : 00000000002c7960 x9 : 000000000000ffd0
x10: 00000000002fbc5c x11: 00000000000132e8
x12: 00000000002fbce8 x13: 00000000002c7960
x14: 00000000002c7960 x15: 0000000000000000
x16: 0000000000000000 x17: 0000000000000000
x18: 00000000002fbe30 x19: 0000000000000007
x20: 00000000002957d8 x21: 0000000000000009
x22: 000000000029d189 x23: 0000000000000020
x24: 00000000002fbf38 x25: 00000000002957e7
x26: 00000000002957b2 x27: 0000000000007fff
x28: 0000000000000000 x29: 00000000002fbcc0
Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026)
Resetting CPU ...
resetting ...
Thanks,
Gao Xiang
Regards,
Jonathan
Hi Jianan, Here are the LZ4 decompression times I got running "time unlz4 ..." on Rock 4 SE with RK3399 CPU. v2024.04: 1.329 seconds v2024.04 with 26c7fdadcb7 ("lib/lz4: update LZ4 decompressor module") reverted: 0.665 seconds v2024.04 with your patch: 1.216 seconds I managed to get better performance by optimizing it myself. v2024.04 with my patch: 0.785 seconds With my patch, it makes no difference if I use __builtin_memcpy or memcpy in lz4.c and lz4_wrapper.c so I just left it using memcpy. It is still slower than reverting the LZ4 update though. My patch: --- a/lib/lz4.c +++ b/lib/lz4.c @@ -41,6 +41,16 @@ static FORCE_INLINE u16 LZ4_readLE16(const void *src) return get_unaligned_le16(src); } +static FORCE_INLINE void LZ4_copy2(void *dst, const void *src) +{ + put_unaligned(get_unaligned((const u16 *)src), (u16 *)dst); +} + +static FORCE_INLINE void LZ4_copy4(void *dst, const void *src) +{ + put_unaligned(get_unaligned((const u32 *)src), (u32 *)dst); +} + static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { put_unaligned(get_unaligned((const u64 *)src), (u64 *)dst); @@ -215,7 +225,10 @@ static FORCE_INLINE int LZ4_decompress_generic( && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend))) { /* Copy the literals */ - memcpy(op, ip, endOnInput ? 16 : 8); + LZ4_copy8(op, ip); + if (endOnInput) + LZ4_copy8(op + 8, ip + 8); + op += length; ip += length; /* @@ -234,9 +247,9 @@ static FORCE_INLINE int LZ4_decompress_generic( (offset >= 8) && (dict == withPrefix64k || match >= lowPrefix)) { /* Copy the match. */ - memcpy(op + 0, match + 0, 8); - memcpy(op + 8, match + 8, 8); - memcpy(op + 16, match + 16, 2); + LZ4_copy8(op, match); + LZ4_copy8(op + 8, match + 8); + LZ4_copy2(op + 16, match + 16); op += length + MINMATCH; /* Both stages worked, load the next token. 
*/ continue; @@ -466,7 +479,7 @@ _copy_match: op[2] = match[2]; op[3] = match[3]; match += inc32table[offset]; - memcpy(op + 4, match, 4); + LZ4_copy4(op + 4, match); match -= dec64table[offset]; } else { LZ4_copy8(op, match); Let me know if you have any further suggestions. Thanks. Regards, Jonathan On Sun, 26 May 2024 at 22:18, Jianan Huang <jnhuang95@gmail.com> wrote: > > Hi Jonathan, > > Could you please try the following patch ? It replaces all memcpy() calls in lz4 with __builtin_memcpy(). > > diff --git a/lib/lz4.c b/lib/lz4.c > index d365dc727c..2afe31c1c3 100644 > --- a/lib/lz4.c > +++ b/lib/lz4.c > @@ -34,6 +34,8 @@ > #include <asm/unaligned.h> > #include <u-boot/lz4.h> > > +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) > + > #define FORCE_INLINE inline __attribute__((always_inline)) > > static FORCE_INLINE u16 LZ4_readLE16(const void *src) > @@ -215,7 +217,7 @@ static FORCE_INLINE int LZ4_decompress_generic( > && likely((endOnInput ? ip < shortiend : 1) & > (op <= shortoend))) { > /* Copy the literals */ > - memcpy(op, ip, endOnInput ? 16 : 8); > + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); > op += length; ip += length; > > /* > @@ -234,9 +236,9 @@ static FORCE_INLINE int LZ4_decompress_generic( > (offset >= 8) && > (dict == withPrefix64k || match >= lowPrefix)) { > /* Copy the match. */ > - memcpy(op + 0, match + 0, 8); > - memcpy(op + 8, match + 8, 8); > - memcpy(op + 16, match + 16, 2); > + LZ4_memcpy(op + 0, match + 0, 8); > + LZ4_memcpy(op + 8, match + 8, 8); > + LZ4_memcpy(op + 16, match + 16, 2); > op += length + MINMATCH; > /* Both stages worked, load the next token. 
*/ > continue; > @@ -416,7 +418,7 @@ _copy_match: > size_t const copySize = (size_t)(lowPrefix - match); > size_t const restSize = length - copySize; > > - memcpy(op, dictEnd - copySize, copySize); > + LZ4_memcpy(op, dictEnd - copySize, copySize); > op += copySize; > if (restSize > (size_t)(op - lowPrefix)) { > /* overlap copy */ > @@ -426,7 +428,7 @@ _copy_match: > while (op < endOfMatch) > *op++ = *copyFrom++; > } else { > - memcpy(op, lowPrefix, restSize); > + LZ4_memcpy(op, lowPrefix, restSize); > op += restSize; > } > } > @@ -452,7 +454,7 @@ _copy_match: > while (op < copyEnd) > *op++ = *match++; > } else { > - memcpy(op, match, mlen); > + LZ4_memcpy(op, match, mlen); > } > op = copyEnd; > if (op == oend) > @@ -466,7 +468,7 @@ _copy_match: > op[2] = match[2]; > op[3] = match[3]; > match += inc32table[offset]; > - memcpy(op + 4, match, 4); > + LZ4_memcpy(op + 4, match, 4); > match -= dec64table[offset]; > } else { > LZ4_copy8(op, match); > diff --git a/lib/lz4_wrapper.c b/lib/lz4_wrapper.c > index 4d48e7b0e8..e09c8d7057 100644 > --- a/lib/lz4_wrapper.c > +++ b/lib/lz4_wrapper.c > @@ -80,7 +80,7 @@ int ulz4fn(const void *src, size_t srcn, void *dst, size_t *dstn) > > if (block_header & LZ4F_BLOCKUNCOMPRESSED_FLAG) { > size_t size = min((ptrdiff_t)block_size, (ptrdiff_t)(end - out)); > - memcpy(out, in, size); > + LZ4_memcpy(out, in, size); > out += size; > if (size < block_size) { > ret = -ENOBUFS; /* output overrun */ > > > Thanks, > > Jianan > > On 2024/5/26 16:06, Jonathan Liu wrote: > > Hi Gao, > > On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiangkao@linux.alibaba.com> wrote: > > Hi, > > On 2024/5/24 22:26, Jonathan Liu wrote: > > Hi Jianan, > > On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuang95@gmail.com> wrote: > > Update the LZ4 compression module based on LZ4 v1.8.3 in order to > use the newest LZ4_decompress_safe_partial() which can now decode > exactly the nb of bytes requested. 
> > Signed-off-by: Huang Jianan <jnhuang95@gmail.com> > > I noticed after this commit LZ4 decompression is slower. > ulz4fn function call takes 1.209670 seconds with this commit. > After reverting this commit, the ulz4fn function call takes 0.587032 seconds. > > I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4 > using -9 option for maximum compression) on RK3399. > > Any ideas why it is slower with this commit and how the performance > regression can be fixed? > > Just the quick glance, I think the issue may be due to memcpy/memmove > since it seems the main difference between these two codebases > (I'm not sure which LZ4 version the old codebase was based on) and > the new version mainly relies on memcpy/memmove instead of its own > versions. > > Would you mind to check the assembly how memcpy/memset is generated > on your platform? > > Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto): > 000000000028220c <memset>: > #if !CONFIG_IS_ENABLED(TINY_MEMSET) > unsigned long cl = 0; > int i; > > /* do it one word at a time (32 bits or 64 bits) while possible */ > if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) { > 28220c: f2400803 ands x3, x0, #0x7 > 282210: 540002c1 b.ne 282268 <memset+0x5c> // b.any > for (i = 0; i < sizeof(*sl); i++) { > cl <<= 8; > cl |= c & 0xff; > 282214: 92401c26 and x6, x1, #0xff > unsigned long cl = 0; > 282218: d2800004 mov x4, #0x0 // #0 > 28221c: 52800105 mov w5, #0x8 // #8 > cl |= c & 0xff; > 282220: aa0420c4 orr x4, x6, x4, lsl #8 > for (i = 0; i < sizeof(*sl); i++) { > 282224: 710004a5 subs w5, w5, #0x1 > 282228: 54ffffc1 b.ne 282220 <memset+0x14> // b.any > } > while (count >= sizeof(*sl)) { > 28222c: cb030045 sub x5, x2, x3 > 282230: f1001cbf cmp x5, #0x7 > 282234: 54000148 b.hi 28225c <memset+0x50> // b.pmore > 282238: d343fc43 lsr x3, x2, #3 > 28223c: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 > 282240: 9b047c63 mul x3, x3, x4 > 282244: 8b030042 add x2, x2, x3 > 282248: cb030003 sub x3, x0, 
x3 > unsigned long *sl = (unsigned long *) s; > 28224c: d2800004 mov x4, #0x0 // #0 > count -= sizeof(*sl); > } > } > #endif /* fill 8 bits at a time */ > s8 = (char *)sl; > while (count--) > 282250: eb04005f cmp x2, x4 > 282254: 540000e1 b.ne 282270 <memset+0x64> // b.any > *s8++ = c; > > return s; > } > 282258: d65f03c0 ret > *sl++ = cl; > 28225c: f8236804 str x4, [x0, x3] > count -= sizeof(*sl); > 282260: 91002063 add x3, x3, #0x8 > 282264: 17fffff2 b 28222c <memset+0x20> > unsigned long *sl = (unsigned long *) s; > 282268: aa0003e3 mov x3, x0 > 28226c: 17fffff8 b 28224c <memset+0x40> > *s8++ = c; > 282270: 38246861 strb w1, [x3, x4] > 282274: 91000484 add x4, x4, #0x1 > 282278: 17fffff6 b 282250 <memset+0x44> > > 000000000028227c <memcpy>: > __used void * memcpy(void *dest, const void *src, size_t count) > { > unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src; > char *d8, *s8; > > if (src == dest) > 28227c: eb01001f cmp x0, x1 > 282280: 54000100 b.eq 2822a0 <memcpy+0x24> // b.none > return dest; > > /* while all data is aligned (common case), copy a word at a time */ > if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) { > 282284: aa010003 orr x3, x0, x1 > 282288: f2400863 ands x3, x3, #0x7 > 28228c: 54000120 b.eq 2822b0 <memcpy+0x34> // b.none > 282290: aa0003e4 mov x4, x0 > 282294: d2800003 mov x3, #0x0 // #0 > } > } > /* copy the reset one byte at a time */ > d8 = (char *)dl; > s8 = (char *)sl; > while (count--) > 282298: eb03005f cmp x2, x3 > 28229c: 540001e1 b.ne 2822d8 <memcpy+0x5c> // b.any > *d8++ = *s8++; > > return dest; > } > 2822a0: d65f03c0 ret > *dl++ = *sl++; > 2822a4: f8636824 ldr x4, [x1, x3] > 2822a8: f8236804 str x4, [x0, x3] > count -= sizeof(*dl); > 2822ac: 91002063 add x3, x3, #0x8 > while (count >= sizeof(*dl)) { > 2822b0: cb030044 sub x4, x2, x3 > 2822b4: f1001c9f cmp x4, #0x7 > 2822b8: 54ffff68 b.hi 2822a4 <memcpy+0x28> // b.pmore > 2822bc: d343fc43 lsr x3, x2, #3 > 2822c0: 928000e4 mov x4, 
#0xfffffffffffffff8 // #-8 > 2822c4: 9b047c63 mul x3, x3, x4 > 2822c8: 8b030042 add x2, x2, x3 > 2822cc: cb030004 sub x4, x0, x3 > 2822d0: cb030021 sub x1, x1, x3 > 2822d4: 17fffff0 b 282294 <memcpy+0x18> > *d8++ = *s8++; > 2822d8: 38636825 ldrb w5, [x1, x3] > 2822dc: 38236885 strb w5, [x4, x3] > 2822e0: 91000463 add x3, x3, #0x1 > 2822e4: 17ffffed b 282298 <memcpy+0x1c> > > > I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in > the .config (but leaving it disabled in SPL/TPL) and it results in > Synchronous Abort: > U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000) > Trying to boot from MMC1 > NOTICE: BL31: v1.3(release):8f40012ab > NOTICE: BL31: Built : 14:20:53, Feb 16 2023 > NOTICE: BL31: Rockchip release version: v1.1 > INFO: GICv3 with legacy support detected. ARM GICV3 driver initialized in EL3 > INFO: Using opteed sec cpu_context! > INFO: boot cpu mask: 0 > INFO: plat_rockchip_pmu_init(1203): pd status 3e > INFO: BL31: Initializing runtime services > WARNING: No OPTEE provided by BL2 boot loader, Booting device without > OPTEE initialization. 
SMC`s destined for OPTEE will return SMC_UNK > ERROR: Error initializing runtime service opteed_fast > INFO: BL31: Preparing for EL3 exit to normal world > INFO: Entry point address = 0x200000 > INFO: SPSR = 0x3c9 > "Synchronous Abort" handler, esr 0x96000021, far 0x2957e1 > elr: 000000000020233c lr : 000000000026a388 > x0 : 00000000002fbf38 x1 : 00000000002957e1 > x2 : 0000000000000008 x3 : 0000000000000065 > x4 : 00000000002957e9 x5 : 00000000002fbf40 > x6 : 0000000000000065 x7 : 0000000000000003 > x8 : 00000000002c7960 x9 : 000000000000ffd0 > x10: 00000000002fbc5c x11: 00000000000132e8 > x12: 00000000002fbce8 x13: 00000000002c7960 > x14: 00000000002c7960 x15: 0000000000000000 > x16: 0000000000000000 x17: 0000000000000000 > x18: 00000000002fbe30 x19: 0000000000000007 > x20: 00000000002957d8 x21: 0000000000000009 > x22: 000000000029d189 x23: 0000000000000020 > x24: 00000000002fbf38 x25: 00000000002957e7 > x26: 00000000002957b2 x27: 0000000000007fff > x28: 0000000000000000 x29: 00000000002fbcc0 > > Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026) > Resetting CPU ... > > resetting ... > > Thanks, > Gao Xiang > > Regards, > Jonathan
On 2024/5/28 21:28, Jonathan Liu wrote: > Hi Jianan, > > Here are the LZ4 decompression times I got running "time unlz4 ..." on > Rock 4 SE with RK3399 CPU. > > v2024.04: 1.329 seconds > v2024.04 with 26c7fdadcb7 ("lib/lz4: update LZ4 decompressor module") > reverted: 0.665 seconds > v2024.04 with your patch: 1.216 seconds > > I managed to get better performance by optimizing it myself. > v2024.04 with my patch: 0.785 seconds > > With my patch, it makes no difference if I use __builtin_memcpy or > memcpy in lz4.c and lz4_wrapper.c so I just left it using memcpy. > It is still slower than reverting the LZ4 update though. I'm not sure what's the old version come from, but from the copyright itself it seems some earlier 2015 version. Note that there is no absolute outperform between old version and new version, old version may be lack of some necessary boundary check or lead to some msan warning or something, like a new commit of year 2016: https://github.com/lz4/lz4/commit/f094f531441140f10fd461ba769f49d10f5cd581 /* costs ~1%; silence an msan warning when offset == 0 */ /* * note : when partialDecoding, there is no guarantee that * at least 4 bytes remain available in output buffer */ if (!partialDecoding) { assert(oend > op); assert(oend - op >= 4); LZ4_write32(op, (U32)offset); } For the example above, you could completely remove the line above if you don't care about such warning, as it claims ~1% performance loss. Also since no u-boot user uses in-place decompression and if you memmove() doesn't behave well, you could update the following line /* * supports overlapping memory regions; only matters * for in-place decompression scenarios */ memmove(op, ip, length); into memcpy() instead. The new lz4 codebase relies on memcpy() / memmove() code optimization more than old version, if memcpy() assembly doesn't generate well on your platform, it might have some issue. 
Thanks, Gao Xiang > > My patch: > --- a/lib/lz4.c > +++ b/lib/lz4.c > @@ -41,6 +41,16 @@ static FORCE_INLINE u16 LZ4_readLE16(const void *src) > return get_unaligned_le16(src); > } > > +static FORCE_INLINE void LZ4_copy2(void *dst, const void *src) > +{ > + put_unaligned(get_unaligned((const u16 *)src), (u16 *)dst); > +} > + > +static FORCE_INLINE void LZ4_copy4(void *dst, const void *src) > +{ > + put_unaligned(get_unaligned((const u32 *)src), (u32 *)dst); > +} > + > static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) > { > put_unaligned(get_unaligned((const u64 *)src), (u64 *)dst); > @@ -215,7 +225,10 @@ static FORCE_INLINE int LZ4_decompress_generic( > && likely((endOnInput ? ip < shortiend : 1) & > (op <= shortoend))) { > /* Copy the literals */ > - memcpy(op, ip, endOnInput ? 16 : 8); > + LZ4_copy8(op, ip); > + if (endOnInput) > + LZ4_copy8(op + 8, ip + 8); > + > op += length; ip += length; > > /* > @@ -234,9 +247,9 @@ static FORCE_INLINE int LZ4_decompress_generic( > (offset >= 8) && > (dict == withPrefix64k || match >= lowPrefix)) { > /* Copy the match. */ > - memcpy(op + 0, match + 0, 8); > - memcpy(op + 8, match + 8, 8); > - memcpy(op + 16, match + 16, 2); > + LZ4_copy8(op, match); > + LZ4_copy8(op + 8, match + 8); > + LZ4_copy2(op + 16, match + 16); > op += length + MINMATCH; > /* Both stages worked, load the next token. */ > continue; > @@ -466,7 +479,7 @@ _copy_match: > op[2] = match[2]; > op[3] = match[3]; > match += inc32table[offset]; > - memcpy(op + 4, match, 4); > + LZ4_copy4(op + 4, match); > match -= dec64table[offset]; > } else { > LZ4_copy8(op, match); > > Let me know if you have any further suggestions. > > Thanks. > > Regards, > Jonathan > > On Sun, 26 May 2024 at 22:18, Jianan Huang <jnhuang95@gmail.com> wrote: >> >> Hi Jonathan, >> >> Could you please try the following patch ? It replaces all memcpy() calls in lz4 with __builtin_memcpy(). 
>> >> diff --git a/lib/lz4.c b/lib/lz4.c >> index d365dc727c..2afe31c1c3 100644 >> --- a/lib/lz4.c >> +++ b/lib/lz4.c >> @@ -34,6 +34,8 @@ >> #include <asm/unaligned.h> >> #include <u-boot/lz4.h> >> >> +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) >> + >> #define FORCE_INLINE inline __attribute__((always_inline)) >> >> static FORCE_INLINE u16 LZ4_readLE16(const void *src) >> @@ -215,7 +217,7 @@ static FORCE_INLINE int LZ4_decompress_generic( >> && likely((endOnInput ? ip < shortiend : 1) & >> (op <= shortoend))) { >> /* Copy the literals */ >> - memcpy(op, ip, endOnInput ? 16 : 8); >> + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); >> op += length; ip += length; >> >> /* >> @@ -234,9 +236,9 @@ static FORCE_INLINE int LZ4_decompress_generic( >> (offset >= 8) && >> (dict == withPrefix64k || match >= lowPrefix)) { >> /* Copy the match. */ >> - memcpy(op + 0, match + 0, 8); >> - memcpy(op + 8, match + 8, 8); >> - memcpy(op + 16, match + 16, 2); >> + LZ4_memcpy(op + 0, match + 0, 8); >> + LZ4_memcpy(op + 8, match + 8, 8); >> + LZ4_memcpy(op + 16, match + 16, 2); >> op += length + MINMATCH; >> /* Both stages worked, load the next token. 
*/ >> continue; >> @@ -416,7 +418,7 @@ _copy_match: >> size_t const copySize = (size_t)(lowPrefix - match); >> size_t const restSize = length - copySize; >> >> - memcpy(op, dictEnd - copySize, copySize); >> + LZ4_memcpy(op, dictEnd - copySize, copySize); >> op += copySize; >> if (restSize > (size_t)(op - lowPrefix)) { >> /* overlap copy */ >> @@ -426,7 +428,7 @@ _copy_match: >> while (op < endOfMatch) >> *op++ = *copyFrom++; >> } else { >> - memcpy(op, lowPrefix, restSize); >> + LZ4_memcpy(op, lowPrefix, restSize); >> op += restSize; >> } >> } >> @@ -452,7 +454,7 @@ _copy_match: >> while (op < copyEnd) >> *op++ = *match++; >> } else { >> - memcpy(op, match, mlen); >> + LZ4_memcpy(op, match, mlen); >> } >> op = copyEnd; >> if (op == oend) >> @@ -466,7 +468,7 @@ _copy_match: >> op[2] = match[2]; >> op[3] = match[3]; >> match += inc32table[offset]; >> - memcpy(op + 4, match, 4); >> + LZ4_memcpy(op + 4, match, 4); >> match -= dec64table[offset]; >> } else { >> LZ4_copy8(op, match); >> diff --git a/lib/lz4_wrapper.c b/lib/lz4_wrapper.c >> index 4d48e7b0e8..e09c8d7057 100644 >> --- a/lib/lz4_wrapper.c >> +++ b/lib/lz4_wrapper.c >> @@ -80,7 +80,7 @@ int ulz4fn(const void *src, size_t srcn, void *dst, size_t *dstn) >> >> if (block_header & LZ4F_BLOCKUNCOMPRESSED_FLAG) { >> size_t size = min((ptrdiff_t)block_size, (ptrdiff_t)(end - out)); >> - memcpy(out, in, size); >> + LZ4_memcpy(out, in, size); >> out += size; >> if (size < block_size) { >> ret = -ENOBUFS; /* output overrun */ >> >> >> Thanks, >> >> Jianan >> >> On 2024/5/26 16:06, Jonathan Liu wrote: >> >> Hi Gao, >> >> On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiangkao@linux.alibaba.com> wrote: >> >> Hi, >> >> On 2024/5/24 22:26, Jonathan Liu wrote: >> >> Hi Jianan, >> >> On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuang95@gmail.com> wrote: >> >> Update the LZ4 compression module based on LZ4 v1.8.3 in order to >> use the newest LZ4_decompress_safe_partial() which can now decode >> exactly the nb of bytes requested. 
>> >> Signed-off-by: Huang Jianan <jnhuang95@gmail.com> >> >> I noticed after this commit LZ4 decompression is slower. >> ulz4fn function call takes 1.209670 seconds with this commit. >> After reverting this commit, the ulz4fn function call takes 0.587032 seconds. >> >> I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4 >> using -9 option for maximum compression) on RK3399. >> >> Any ideas why it is slower with this commit and how the performance >> regression can be fixed? >> >> Just the quick glance, I think the issue may be due to memcpy/memmove >> since it seems the main difference between these two codebases >> (I'm not sure which LZ4 version the old codebase was based on) and >> the new version mainly relies on memcpy/memmove instead of its own >> versions. >> >> Would you mind to check the assembly how memcpy/memset is generated >> on your platform? >> >> Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto): >> 000000000028220c <memset>: >> #if !CONFIG_IS_ENABLED(TINY_MEMSET) >> unsigned long cl = 0; >> int i; >> >> /* do it one word at a time (32 bits or 64 bits) while possible */ >> if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) { >> 28220c: f2400803 ands x3, x0, #0x7 >> 282210: 540002c1 b.ne 282268 <memset+0x5c> // b.any >> for (i = 0; i < sizeof(*sl); i++) { >> cl <<= 8; >> cl |= c & 0xff; >> 282214: 92401c26 and x6, x1, #0xff >> unsigned long cl = 0; >> 282218: d2800004 mov x4, #0x0 // #0 >> 28221c: 52800105 mov w5, #0x8 // #8 >> cl |= c & 0xff; >> 282220: aa0420c4 orr x4, x6, x4, lsl #8 >> for (i = 0; i < sizeof(*sl); i++) { >> 282224: 710004a5 subs w5, w5, #0x1 >> 282228: 54ffffc1 b.ne 282220 <memset+0x14> // b.any >> } >> while (count >= sizeof(*sl)) { >> 28222c: cb030045 sub x5, x2, x3 >> 282230: f1001cbf cmp x5, #0x7 >> 282234: 54000148 b.hi 28225c <memset+0x50> // b.pmore >> 282238: d343fc43 lsr x3, x2, #3 >> 28223c: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 >> 282240: 9b047c63 mul x3, x3, x4 >> 282244: 
8b030042 add x2, x2, x3 >> 282248: cb030003 sub x3, x0, x3 >> unsigned long *sl = (unsigned long *) s; >> 28224c: d2800004 mov x4, #0x0 // #0 >> count -= sizeof(*sl); >> } >> } >> #endif /* fill 8 bits at a time */ >> s8 = (char *)sl; >> while (count--) >> 282250: eb04005f cmp x2, x4 >> 282254: 540000e1 b.ne 282270 <memset+0x64> // b.any >> *s8++ = c; >> >> return s; >> } >> 282258: d65f03c0 ret >> *sl++ = cl; >> 28225c: f8236804 str x4, [x0, x3] >> count -= sizeof(*sl); >> 282260: 91002063 add x3, x3, #0x8 >> 282264: 17fffff2 b 28222c <memset+0x20> >> unsigned long *sl = (unsigned long *) s; >> 282268: aa0003e3 mov x3, x0 >> 28226c: 17fffff8 b 28224c <memset+0x40> >> *s8++ = c; >> 282270: 38246861 strb w1, [x3, x4] >> 282274: 91000484 add x4, x4, #0x1 >> 282278: 17fffff6 b 282250 <memset+0x44> >> >> 000000000028227c <memcpy>: >> __used void * memcpy(void *dest, const void *src, size_t count) >> { >> unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src; >> char *d8, *s8; >> >> if (src == dest) >> 28227c: eb01001f cmp x0, x1 >> 282280: 54000100 b.eq 2822a0 <memcpy+0x24> // b.none >> return dest; >> >> /* while all data is aligned (common case), copy a word at a time */ >> if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) { >> 282284: aa010003 orr x3, x0, x1 >> 282288: f2400863 ands x3, x3, #0x7 >> 28228c: 54000120 b.eq 2822b0 <memcpy+0x34> // b.none >> 282290: aa0003e4 mov x4, x0 >> 282294: d2800003 mov x3, #0x0 // #0 >> } >> } >> /* copy the reset one byte at a time */ >> d8 = (char *)dl; >> s8 = (char *)sl; >> while (count--) >> 282298: eb03005f cmp x2, x3 >> 28229c: 540001e1 b.ne 2822d8 <memcpy+0x5c> // b.any >> *d8++ = *s8++; >> >> return dest; >> } >> 2822a0: d65f03c0 ret >> *dl++ = *sl++; >> 2822a4: f8636824 ldr x4, [x1, x3] >> 2822a8: f8236804 str x4, [x0, x3] >> count -= sizeof(*dl); >> 2822ac: 91002063 add x3, x3, #0x8 >> while (count >= sizeof(*dl)) { >> 2822b0: cb030044 sub x4, x2, x3 >> 2822b4: f1001c9f cmp x4, #0x7 >> 2822b8: 
54ffff68 b.hi 2822a4 <memcpy+0x28> // b.pmore >> 2822bc: d343fc43 lsr x3, x2, #3 >> 2822c0: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 >> 2822c4: 9b047c63 mul x3, x3, x4 >> 2822c8: 8b030042 add x2, x2, x3 >> 2822cc: cb030004 sub x4, x0, x3 >> 2822d0: cb030021 sub x1, x1, x3 >> 2822d4: 17fffff0 b 282294 <memcpy+0x18> >> *d8++ = *s8++; >> 2822d8: 38636825 ldrb w5, [x1, x3] >> 2822dc: 38236885 strb w5, [x4, x3] >> 2822e0: 91000463 add x3, x3, #0x1 >> 2822e4: 17ffffed b 282298 <memcpy+0x1c> >> >> >> I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in >> the .config (but leaving it disabled in SPL/TPL) and it results in >> Synchronous Abort: >> U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000) >> Trying to boot from MMC1 >> NOTICE: BL31: v1.3(release):8f40012ab >> NOTICE: BL31: Built : 14:20:53, Feb 16 2023 >> NOTICE: BL31: Rockchip release version: v1.1 >> INFO: GICv3 with legacy support detected. ARM GICV3 driver initialized in EL3 >> INFO: Using opteed sec cpu_context! >> INFO: boot cpu mask: 0 >> INFO: plat_rockchip_pmu_init(1203): pd status 3e >> INFO: BL31: Initializing runtime services >> WARNING: No OPTEE provided by BL2 boot loader, Booting device without >> OPTEE initialization. 
SMC`s destined for OPTEE will return SMC_UNK >> ERROR: Error initializing runtime service opteed_fast >> INFO: BL31: Preparing for EL3 exit to normal world >> INFO: Entry point address = 0x200000 >> INFO: SPSR = 0x3c9 >> "Synchronous Abort" handler, esr 0x96000021, far 0x2957e1 >> elr: 000000000020233c lr : 000000000026a388 >> x0 : 00000000002fbf38 x1 : 00000000002957e1 >> x2 : 0000000000000008 x3 : 0000000000000065 >> x4 : 00000000002957e9 x5 : 00000000002fbf40 >> x6 : 0000000000000065 x7 : 0000000000000003 >> x8 : 00000000002c7960 x9 : 000000000000ffd0 >> x10: 00000000002fbc5c x11: 00000000000132e8 >> x12: 00000000002fbce8 x13: 00000000002c7960 >> x14: 00000000002c7960 x15: 0000000000000000 >> x16: 0000000000000000 x17: 0000000000000000 >> x18: 00000000002fbe30 x19: 0000000000000007 >> x20: 00000000002957d8 x21: 0000000000000009 >> x22: 000000000029d189 x23: 0000000000000020 >> x24: 00000000002fbf38 x25: 00000000002957e7 >> x26: 00000000002957b2 x27: 0000000000007fff >> x28: 0000000000000000 x29: 00000000002fbcc0 >> >> Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026) >> Resetting CPU ... >> >> resetting ... >> >> Thanks, >> Gao Xiang >> >> Regards, >> Jonathan
diff --git a/include/u-boot/lz4.h b/include/u-boot/lz4.h index e18b39a5dc..655adbfcd1 100644 --- a/include/u-boot/lz4.h +++ b/include/u-boot/lz4.h @@ -21,4 +21,53 @@ */ int ulz4fn(const void *src, size_t srcn, void *dst, size_t *dstn); +/** + * LZ4_decompress_safe() - Decompression protected against buffer overflow + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * + * Decompresses data from 'source' into 'dest'. + * If the source stream is detected malformed, the function will + * stop decoding and return a negative result. + * This function is protected against buffer overflow exploits, + * including malicious data packets. It never writes outside output buffer, + * nor reads outside input buffer. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe(const char *source, char *dest, + int compressedSize, int maxDecompressedSize); + +/** + * LZ4_decompress_safe_partial() - Decompress a block of size 'compressedSize' + * at position 'source' into buffer 'dest' + * @source: source address of the compressed data + * @dest: output buffer address of the decompressed data which must be + * already allocated + * @compressedSize: is the precise full size of the compressed block. + * @targetOutputSize: the decompression operation will try + * to stop as soon as 'targetOutputSize' has been reached + * @maxDecompressedSize: is the size of destination buffer + * + * This function decompresses a compressed block of size 'compressedSize' + * at position 'source' into destination buffer 'dest' + * of size 'maxDecompressedSize'. 
+ * The function tries to stop decompressing operation as soon as + * 'targetOutputSize' has been reached, reducing decompression time. + * This function never writes outside of output buffer, + * and never reads outside of input buffer. + * It is therefore protected against malicious data packets. + * + * Return: the number of bytes decoded in the destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + * + */ +int LZ4_decompress_safe_partial(const char *src, char *dst, + int compressedSize, int targetOutputSize, int dstCapacity); #endif diff --git a/lib/lz4.c b/lib/lz4.c index 046c34e390..5337842126 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -1,13 +1,63 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: GPL 2.0+ OR BSD-2-Clause /* - LZ4 - Fast LZ compression algorithm - Copyright (C) 2011-2015, Yann Collet. + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011 - 2016, Yann Collet. + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 + */ +#include <common.h> +#include <compiler.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <asm/unaligned.h> +#include <u-boot/lz4.h> + +#define FORCE_INLINE inline __attribute__((always_inline)) + +static FORCE_INLINE u16 LZ4_readLE16(const void *src) +{ + return get_unaligned_le16(src); +} - You can contact the author at : - - LZ4 source repository : https://github.com/Cyan4973/lz4 - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c -*/ +static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) +{ + put_unaligned(get_unaligned((const u64 *)src), (u64 *)dst); +} + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef uintptr_t uptrval; +static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) +{ + put_unaligned(value, (U32 *)memPtr); +} /************************************** * Reading and writing into memory @@ -28,14 +78,17 @@ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) **************************************/ #define MINMATCH 4 -#define COPYLENGTH 8 +#define WILDCOPYLENGTH 8 #define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH+MINMATCH) -static const int LZ4_minLength = (MFLIMIT+1); +#define MFLIMIT (WILDCOPYLENGTH + MINMATCH) -#define 
KB *(1 <<10) -#define MB *(1 <<20) -#define GB *(1U<<30) +/* + * ensure it's possible to write 2 x wildcopyLength + * without overflowing output buffer + */ +#define MATCH_SAFEGUARD_DISTANCE ((2 * WILDCOPYLENGTH) - MINMATCH) + +#define KB (1 <<10) #define MAXD_LOG 16 #define MAX_DISTANCE ((1 << MAXD_LOG) - 1) @@ -45,198 +98,438 @@ static const int LZ4_minLength = (MFLIMIT+1); #define RUN_BITS (8-ML_BITS) #define RUN_MASK ((1U<<RUN_BITS)-1) +#define LZ4_STATIC_ASSERT(c) BUILD_BUG_ON(!(c)) /************************************** * Local Structures and types **************************************/ typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; -typedef enum { full = 0, partial = 1 } earlyEnd_directive; +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; +#define DEBUGLOG(l, ...) {} /* disabled */ +#ifndef assert +#define assert(condition) ((void)0) +#endif -/******************************* -* Decompression functions -*******************************/ /* - * This generic decompression function cover all use cases. - * It shall be instantiated several times, using different sets of directives - * Note that it is essential this generic function is really inlined, + * LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, * in order to remove useless branches during compilation optimization. */ -FORCE_INLINE int LZ4_decompress_generic( - const char* const source, - char* const dest, - int inputSize, - int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ - - int endOnInput, /* endOnOutputSize, endOnInputSize */ - int partialDecoding, /* full, partial */ - int targetOutputSize, /* only used if partialDecoding==partial */ - int dict, /* noDict, withPrefix64k, usingExtDict */ - const BYTE* const lowPrefix, /* == dest if dict == noDict */ - const BYTE* const dictStart, /* only if dict==usingExtDict */ - const size_t dictSize /* note : = 0 if noDict */ - ) +static FORCE_INLINE int LZ4_decompress_generic( + const char * const src, + char * const dst, + int srcSize, + /* + * If endOnInput == endOnInputSize, + * this value is `dstCapacity` + */ + int outputSize, + /* endOnOutputSize, endOnInputSize */ + endCondition_directive endOnInput, + /* full, partial */ + earlyEnd_directive partialDecoding, + /* noDict, withPrefix64k, usingExtDict */ + dict_directive dict, + /* always <= dst, == dst when no prefix */ + const BYTE * const lowPrefix, + /* only if dict == usingExtDict */ + const BYTE * const dictStart, + /* note : = 0 if noDict */ + const size_t dictSize + ) { - /* Local Variables */ - const BYTE* ip = (const BYTE*) source; - const BYTE* const iend = ip + inputSize; - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + outputSize; - BYTE* cpy; - BYTE* oexit = op + targetOutputSize; - const BYTE* const lowLimit = lowPrefix - dictSize; - - const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; - const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; - const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; - - const int safeDecode = (endOnInput==endOnInputSize); - const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); - - - /* Special cases */ - if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ - if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; /* Empty output buffer */ - if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); - - - /* Main Loop */ - while (1) - { - unsigned token; - size_t length; - const BYTE* match; - - /* get literal length */ - token = *ip++; - if ((length=(token>>ML_BITS)) == RUN_MASK) - { - unsigned s; - do - { - s = *ip++; - length += s; - } - while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255)); - if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error; /* overflow detection */ - if ((safeDecode) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error; /* overflow detection */ - } - - /* copy literals */ - cpy = op+length; - if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) - || ((!endOnInput) && (cpy>oend-COPYLENGTH))) - { - if (partialDecoding) - { - if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ - if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ - } - else - { - if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ - if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ - } - memcpy(op, ip, length); - ip += length; - op += length; - break; /* Necessarily EOF, due to parsing restrictions */ - } - LZ4_wildCopy(op, ip, cpy); - ip += length; op = cpy; - - /* get offset */ - match = cpy - LZ4_readLE16(ip); ip+=2; - if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ - - /* get matchlength */ - length = token & ML_MASK; - if (length == ML_MASK) - { - unsigned s; - do - { - if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; - s = *ip++; - length += s; - } while (s==255); - if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* 
overflow detection */ - } - length += MINMATCH; - - /* check external dictionary */ - if ((dict==usingExtDict) && (match < lowPrefix)) - { - if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ - - if (length <= (size_t)(lowPrefix-match)) - { - /* match can be copied as a single segment from external dictionary */ - match = dictEnd - (lowPrefix-match); - memmove(op, match, length); op += length; - } - else - { - /* match encompass external dictionary and current segment */ - size_t copySize = (size_t)(lowPrefix-match); - memcpy(op, dictEnd - copySize, copySize); - op += copySize; - copySize = length - copySize; - if (copySize > (size_t)(op-lowPrefix)) /* overlap within current segment */ - { - BYTE* const endOfMatch = op + copySize; - const BYTE* copyFrom = lowPrefix; - while (op < endOfMatch) *op++ = *copyFrom++; - } - else - { - memcpy(op, lowPrefix, copySize); - op += copySize; - } - } - continue; - } - - /* copy repeated sequence */ - cpy = op + length; - if (unlikely((op-match)<8)) - { - const size_t dec64 = dec64table[op-match]; - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += dec32table[op-match]; - LZ4_copy4(op+4, match); - op += 8; match -= dec64; - } else { LZ4_copy8(op, match); op+=8; match+=8; } - - if (unlikely(cpy>oend-12)) - { - if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ - if (op < oend-8) - { - LZ4_wildCopy(op, match, oend-8); - match += (oend-8) - op; - op = oend-8; - } - while (op<cpy) *op++ = *match++; - } - else - LZ4_wildCopy(op, match, cpy); - op=cpy; /* correction */ - } - - /* end of decoding */ - if (endOnInput) - return (int) (((char*)op)-dest); /* Nb of output bytes decoded */ - else - return (int) (((const char*)ip)-source); /* Nb of input bytes read */ - - /* Overflow error detected */ + const BYTE *ip = (const BYTE *) src; + const BYTE * const iend = ip + srcSize; + + BYTE *op = 
(BYTE *) dst; + BYTE * const oend = op + outputSize; + BYTE *cpy; + + const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize; + static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; + static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + + const int safeDecode = (endOnInput == endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB))); + + /* Set up the "end" pointers for the shortcut. */ + const BYTE *const shortiend = iend - + (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE *const shortoend = oend - + (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + + DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__, + srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + assert(src != NULL); + + /* Empty output buffer */ + if ((endOnInput) && (unlikely(outputSize == 0))) + return ((srcSize == 1) && (*ip == 0)) ? 0 : -1; + + if ((!endOnInput) && (unlikely(outputSize == 0))) + return (*ip == 0 ? 1 : -1); + + if ((endOnInput) && unlikely(srcSize == 0)) + return -1; + + /* Main Loop : decode sequences */ + while (1) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned int const token = *ip++; + length = token>>ML_BITS; + + /* ip < iend before the increment */ + assert(!endOnInput || ip <= iend); + + /* + * A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough + * space, enter the shortcut and copy 16 bytes on behalf + * of the literals (in the fast mode, only 8 bytes can be + * safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes + * in a similar manner; but we ensure that there's enough + * space in the output for those 18 bytes earlier, upon + * entering the shortcut (in other words, there is a + * combined check for both stages). 
+ * + * The & in the likely() below is intentionally not && so that + * some compilers can produce better parallelized runtime code + */ + if ((endOnInput ? length != RUN_MASK : length <= 8) + /* + * strictly "less than" on input, to re-enter + * the loop with at least one byte + */ + && likely((endOnInput ? ip < shortiend : 1) & + (op <= shortoend))) { + /* Copy the literals */ + memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* + * The second stage: + * prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. + */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ((length != ML_MASK) && + (offset >= 8) && + (dict == withPrefix64k || match >= lowPrefix)) { + /* Copy the match. */ + memcpy(op + 0, match + 0, 8); + memcpy(op + 8, match + 8, 8); + memcpy(op + 16, match + 16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* + * The second stage didn't work out, but the info + * is ready. Propel it right to the point of match + * copying. + */ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + unsigned int s; + + if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) { + /* overflow detection */ + goto _output_error; + } + do { + s = *ip++; + length += s; + } while (likely(endOnInput + ? 
ip < iend - RUN_MASK + : 1) & (s == 255)); + + if ((safeDecode) + && unlikely((uptrval)(op) + + length < (uptrval)(op))) { + /* overflow detection */ + goto _output_error; + } + if ((safeDecode) + && unlikely((uptrval)(ip) + + length < (uptrval)(ip))) { + /* overflow detection */ + goto _output_error; + } + } + + /* copy literals */ + cpy = op + length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + + if (((endOnInput) && ((cpy > oend - MFLIMIT) + || (ip + length > iend - (2 + 1 + LASTLITERALS)))) + || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) { + if (partialDecoding) { + if (cpy > oend) { + /* + * Partial decoding : + * stop in the middle of literal segment + */ + cpy = oend; + length = oend - op; + } + if ((endOnInput) + && (ip + length > iend)) { + /* + * Error : + * read attempt beyond + * end of input buffer + */ + goto _output_error; + } + } else { + if ((!endOnInput) + && (cpy != oend)) { + /* + * Error : + * block decoding must + * stop exactly there + */ + goto _output_error; + } + if ((endOnInput) + && ((ip + length != iend) + || (cpy > oend))) { + /* + * Error : + * input must be consumed + */ + goto _output_error; + } + } + + /* + * supports overlapping memory regions; only matters + * for in-place decompression scenarios + */ + memmove(op, ip, length); + ip += length; + op += length; + + /* Necessarily EOF, due to parsing restrictions */ + if (!partialDecoding || (cpy == oend)) + break; + } else { + /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + LZ4_wildCopy(op, ip, cpy); + ip += length; + op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + +_copy_match: + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { + /* Error : offset outside buffers */ + goto _output_error; + } + + /* costs ~1%; silence an msan warning when offset == 0 */ + /* + * note : when partialDecoding, there is no guarantee that + * at least 4 bytes 
remain available in output buffer + */ + if (!partialDecoding) { + assert(oend > op); + assert(oend - op >= 4); + + LZ4_write32(op, (U32)offset); + } + + if (length == ML_MASK) { + unsigned int s; + + do { + s = *ip++; + + if ((endOnInput) && (ip > iend - LASTLITERALS)) + goto _output_error; + + length += s; + } while (s == 255); + + if ((safeDecode) + && unlikely( + (uptrval)(op) + length < (uptrval)op)) { + /* overflow detection */ + goto _output_error; + } + } + + length += MINMATCH; + + /* match starting within external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + if (unlikely(op + length > oend - LASTLITERALS)) { + /* doesn't respect parsing restriction */ + if (!partialDecoding) + goto _output_error; + length = min(length, (size_t)(oend - op)); + } + + if (length <= (size_t)(lowPrefix - match)) { + /* + * match fits entirely within external + * dictionary : just copy + */ + memmove(op, dictEnd - (lowPrefix - match), + length); + op += length; + } else { + /* + * match stretches into both external + * dictionary and current block + */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { + /* overlap copy */ + BYTE * const endOfMatch = op + restSize; + const BYTE *copyFrom = lowPrefix; + + while (op < endOfMatch) + *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } + } + continue; + } + + /* copy match within block */ + cpy = op + length; + + /* + * partialDecoding : + * may not respect endBlock parsing restrictions + */ + assert(op <= oend); + if (partialDecoding && + (cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = min(length, (size_t)(oend - op)); + const BYTE * const matchEnd = match + mlen; + BYTE * const copyEnd = op + mlen; + + if (matchEnd > op) { + /* overlap copy */ + while (op < copyEnd) + *op++ = *match++; + } else { 
+ memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) + break; + continue; + } + + if (unlikely(offset < 8)) { + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + memcpy(op + 4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_copy8(op, match); + match += 8; + } + + op += 8; + + if (unlikely(cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1); + + if (cpy > oend - LASTLITERALS) { + /* + * Error : last LASTLITERALS bytes + * must be literals (uncompressed) + */ + goto _output_error; + } + + if (op < oCopyLimit) { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) + *op++ = *match++; + } else { + LZ4_copy8(op, match); + if (length > 16) + LZ4_wildCopy(op + 8, match + 8, cpy); + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + if (endOnInput) { + /* Nb of output bytes decoded */ + return (int) (((char *)op) - dst); + } else { + /* Nb of input bytes read */ + return (int) (((const char *)ip) - src); + } + + /* Overflow error detected */ _output_error: - return (int) (-(((const char*)ip)-source))-1; + return (int) (-(((const char *)ip) - src)) - 1; +} + +int LZ4_decompress_safe(const char *source, char *dest, + int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, + noDict, (BYTE *)dest, NULL, 0); +} + +int LZ4_decompress_safe_partial(const char *src, char *dst, + int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = min(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + endOnInputSize, partial_decode, + noDict, (BYTE *)dst, NULL, 0); } diff --git a/lib/lz4_wrapper.c b/lib/lz4_wrapper.c index ebcb5c09a2..0d2a3655a8 100644 --- a/lib/lz4_wrapper.c +++ b/lib/lz4_wrapper.c @@ 
-11,27 +11,6 @@ #include <asm/unaligned.h> #include <u-boot/lz4.h> -static u16 LZ4_readLE16(const void *src) -{ - return get_unaligned_le16(src); -} -static void LZ4_copy4(void *dst, const void *src) -{ - put_unaligned(get_unaligned((const u32 *)src), (u32 *)dst); -} -static void LZ4_copy8(void *dst, const void *src) -{ - put_unaligned(get_unaligned((const u64 *)src), (u64 *)dst); -} - -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -typedef int32_t S32; -typedef uint64_t U64; - -#define FORCE_INLINE static inline __attribute__((always_inline)) - /* lz4.c is unaltered (except removing unrelated code) from github.com/Cyan4973/lz4. */ #include "lz4.c" /* #include for inlining, do not link! */ @@ -112,7 +91,7 @@ int ulz4fn(const void *src, size_t srcn, void *dst, size_t *dstn) /* constant folding essential, do not touch params! */ ret = LZ4_decompress_generic(in, out, block_size, end - out, endOnInputSize, - full, 0, noDict, out, NULL, 0); + decode_full_block, noDict, out, NULL, 0); if (ret < 0) { ret = -EPROTO; /* decompression error */ break;
Update the LZ4 compression module based on LZ4 v1.8.3 in order to use the newest LZ4_decompress_safe_partial() which can now decode exactly the number of bytes requested. Signed-off-by: Huang Jianan <jnhuang95@gmail.com> --- include/u-boot/lz4.h | 49 ++++ lib/lz4.c | 679 +++++++++++++++++++++++++++++++------------ lib/lz4_wrapper.c | 23 +- 3 files changed, 536 insertions(+), 215 deletions(-)