Message ID | 20240705031502.9041-1-liwang@redhat.com |
---|---|
State | Accepted |
Headers | show |
Series | perf_event_open: improve the memory leak detection | expand |
On Fri, Jul 5, 2024 at 11:15 AM Li Wang <liwang@redhat.com> wrote: > The goal is to add more robust memory leak detection by periodically > sampling memory availability during the test run and checking for > significant decreases in available memory. > > To avoid false postive: > perf_event_open03.c:95: TFAIL: Likely kernel memory leak detected > > Signed-off-by: Li Wang <liwang@redhat.com> > --- > .../perf_event_open/perf_event_open03.c | 32 +++++++++++++++---- > 1 file changed, 26 insertions(+), 6 deletions(-) > > diff --git a/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > b/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > index 7dd31d3d2..1aab43e82 100644 > --- a/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > +++ b/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > @@ -26,6 +26,7 @@ > const int iterations = 12000000; > static int fd = -1; > static int runtime; > +static int sample; > > static void setup(void) > { > @@ -77,22 +78,41 @@ static void check_progress(int i) > > static void run(void) > { > - long diff; > + long diff, diff_total, mem_avail, mem_avail_prev; > int i; > > - diff = SAFE_READ_MEMINFO("MemAvailable:"); > + sample = 0; > + diff_total = 0; > + > + mem_avail_prev = SAFE_READ_MEMINFO("MemAvailable:"); > tst_timer_start(CLOCK_MONOTONIC); > > /* leak about 100MB of RAM */ > for (i = 0; i < iterations; i++) { > ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "filter,0/0@abcd"); > check_progress(i); > - } > > - diff -= SAFE_READ_MEMINFO("MemAvailable:"); > + /* > + * Every 1200000 iterations, calculate the difference in > memory > + * availability. If the difference is greater than 10 * > 1024 (10MB), > + * increment the sample counter and log the event. > + */ > + if ((i % 1200000) == 0) { > + mem_avail = SAFE_READ_MEMINFO("MemAvailable:"); > + diff = mem_avail_prev - mem_avail; > + diff_total += diff; > + > + if (diff > 20 * 1024) { > + sample++; > + tst_res(TINFO, "MemAvailable decreased by > %ld kB at iteration %d", diff, i); > + } > + > + mem_avail_prev = mem_avail; > + } > + } > > - if (diff > 50 * 1024) > - tst_res(TFAIL, "Likely kernel memory leak detected"); > + if ((sample > 5) || (diff_total > 100 * 1024)) > Here maybe diff_toal is still the interference factor in reporting false positives, I'm hesitating to remove or keep it. Or we could use it first to see how the test performs in the large-scale CI testing. > + tst_res(TFAIL, "Likely kernel memory leak detected, total > decrease: %ld kB", diff_total); > else > tst_res(TPASS, "No memory leak found"); > } > -- > 2.45.2 > > > -- > Mailing list info: https://lists.linux.it/listinfo/ltp > >
Hi Cyril, All, Any comments on this method? On Fri, Jul 5, 2024 at 11:15 AM Li Wang <liwang@redhat.com> wrote: > The goal is to add more robust memory leak detection by periodically > sampling memory availability during the test run and checking for > significant decreases in available memory. > > To avoid false postive: > perf_event_open03.c:95: TFAIL: Likely kernel memory leak detected > > Signed-off-by: Li Wang <liwang@redhat.com> > --- > .../perf_event_open/perf_event_open03.c | 32 +++++++++++++++---- > 1 file changed, 26 insertions(+), 6 deletions(-) > > diff --git a/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > b/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > index 7dd31d3d2..1aab43e82 100644 > --- a/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > +++ b/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > @@ -26,6 +26,7 @@ > const int iterations = 12000000; > static int fd = -1; > static int runtime; > +static int sample; > > static void setup(void) > { > @@ -77,22 +78,41 @@ static void check_progress(int i) > > static void run(void) > { > - long diff; > + long diff, diff_total, mem_avail, mem_avail_prev; > int i; > > - diff = SAFE_READ_MEMINFO("MemAvailable:"); > + sample = 0; > + diff_total = 0; > + > + mem_avail_prev = SAFE_READ_MEMINFO("MemAvailable:"); > tst_timer_start(CLOCK_MONOTONIC); > > /* leak about 100MB of RAM */ > for (i = 0; i < iterations; i++) { > ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "filter,0/0@abcd"); > check_progress(i); > - } > > - diff -= SAFE_READ_MEMINFO("MemAvailable:"); > + /* > + * Every 1200000 iterations, calculate the difference in > memory > + * availability. If the difference is greater than 10 * > 1024 (10MB), > + * increment the sample counter and log the event. > + */ > + if ((i % 1200000) == 0) { > + mem_avail = SAFE_READ_MEMINFO("MemAvailable:"); > + diff = mem_avail_prev - mem_avail; > + diff_total += diff; > + > + if (diff > 20 * 1024) { > + sample++; > + tst_res(TINFO, "MemAvailable decreased by > %ld kB at iteration %d", diff, i); > + } > + > + mem_avail_prev = mem_avail; > + } > + } > > - if (diff > 50 * 1024) > - tst_res(TFAIL, "Likely kernel memory leak detected"); > + if ((sample > 5) || (diff_total > 100 * 1024)) > + tst_res(TFAIL, "Likely kernel memory leak detected, total > decrease: %ld kB", diff_total); > else > tst_res(TPASS, "No memory leak found"); > } > -- > 2.45.2 > > > -- > Mailing list info: https://lists.linux.it/listinfo/ltp > >
Hi! > + /* > + * Every 1200000 iterations, calculate the difference in memory > + * availability. If the difference is greater than 10 * 1024 (10MB), > + * increment the sample counter and log the event. > + */ > + if ((i % 1200000) == 0) { > + mem_avail = SAFE_READ_MEMINFO("MemAvailable:"); > + diff = mem_avail_prev - mem_avail; > + diff_total += diff; > + > + if (diff > 20 * 1024) { Shouldn't this be 10 * 1024 or possibly slightly less than 10 * 1024? > + sample++; > + tst_res(TINFO, "MemAvailable decreased by %ld kB at iteration %d", diff, i); > + } > + > + mem_avail_prev = mem_avail; > + } > + } > > - if (diff > 50 * 1024) > - tst_res(TFAIL, "Likely kernel memory leak detected"); > + if ((sample > 5) || (diff_total > 100 * 1024)) Maybe this can rather be if ((sample > 5) && (diff_total > 100 * 1024)) That means that the available memory has been eaten by something and that it happened more or less in a linear fashion when the program was running. > + tst_res(TFAIL, "Likely kernel memory leak detected, total decrease: %ld kB", diff_total); > else > tst_res(TPASS, "No memory leak found"); > } > -- > 2.45.2 >
Hi, I've tested the patch on an affected kernel and it still reproduces the leak. Small nit below, but otherwise: Reviewed-by: Martin Doucha <mdoucha@suse.cz> On 05. 07. 24 5:15, Li Wang wrote: > The goal is to add more robust memory leak detection by periodically > sampling memory availability during the test run and checking for > significant decreases in available memory. > > To avoid false postive: > perf_event_open03.c:95: TFAIL: Likely kernel memory leak detected > > Signed-off-by: Li Wang <liwang@redhat.com> > --- > .../perf_event_open/perf_event_open03.c | 32 +++++++++++++++---- > 1 file changed, 26 insertions(+), 6 deletions(-) > > diff --git a/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c b/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > index 7dd31d3d2..1aab43e82 100644 > --- a/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > +++ b/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c > @@ -26,6 +26,7 @@ > const int iterations = 12000000; > static int fd = -1; > static int runtime; > +static int sample; Does this variable need to be global? It's used only in run() and it gets reset to zero between test iterations. > > static void setup(void) > { > @@ -77,22 +78,41 @@ static void check_progress(int i) > > static void run(void) > { > - long diff; > + long diff, diff_total, mem_avail, mem_avail_prev; > int i; > > - diff = SAFE_READ_MEMINFO("MemAvailable:"); > + sample = 0; > + diff_total = 0; > + > + mem_avail_prev = SAFE_READ_MEMINFO("MemAvailable:"); > tst_timer_start(CLOCK_MONOTONIC); > > /* leak about 100MB of RAM */ > for (i = 0; i < iterations; i++) { > ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "filter,0/0@abcd"); > check_progress(i); > - } > > - diff -= SAFE_READ_MEMINFO("MemAvailable:"); > + /* > + * Every 1200000 iterations, calculate the difference in memory > + * availability. If the difference is greater than 10 * 1024 (10MB), > + * increment the sample counter and log the event. > + */ > + if ((i % 1200000) == 0) { > + mem_avail = SAFE_READ_MEMINFO("MemAvailable:"); > + diff = mem_avail_prev - mem_avail; > + diff_total += diff; > + > + if (diff > 20 * 1024) { > + sample++; > + tst_res(TINFO, "MemAvailable decreased by %ld kB at iteration %d", diff, i); > + } > + > + mem_avail_prev = mem_avail; > + } > + } > > - if (diff > 50 * 1024) > - tst_res(TFAIL, "Likely kernel memory leak detected"); > + if ((sample > 5) || (diff_total > 100 * 1024)) > + tst_res(TFAIL, "Likely kernel memory leak detected, total decrease: %ld kB", diff_total); > else > tst_res(TPASS, "No memory leak found"); > }
On 17. 07. 24 10:31, Cyril Hrubis wrote: > Hi! >> + /* >> + * Every 1200000 iterations, calculate the difference in memory >> + * availability. If the difference is greater than 10 * 1024 (10MB), >> + * increment the sample counter and log the event. >> + */ >> + if ((i % 1200000) == 0) { >> + mem_avail = SAFE_READ_MEMINFO("MemAvailable:"); >> + diff = mem_avail_prev - mem_avail; >> + diff_total += diff; >> + >> + if (diff > 20 * 1024) { > > Shouldn't this be 10 * 1024 or possibly slightly less than 10 * 1024? This should be fine. The test leaks around 38MB between sample checks and over 350MB in a whole test run. > >> + sample++; >> + tst_res(TINFO, "MemAvailable decreased by %ld kB at iteration %d", diff, i); >> + } >> + >> + mem_avail_prev = mem_avail; >> + } >> + } >> >> - if (diff > 50 * 1024) >> - tst_res(TFAIL, "Likely kernel memory leak detected"); >> + if ((sample > 5) || (diff_total > 100 * 1024)) > > Maybe this can rather be if ((sample > 5) && (diff_total > 100 * 1024)) > > That means that the available memory has been eaten by something and > that it happened more or less in a linear fashion when the program was > running. Imagine that some other process releases 300MB of memory while the test is running. If you change the || to &&, you'll get a false negative in that case. The sampling approach will protect against such interference. That being said, if the available memory on your test system fluctuates by 100+MB during a test run that takes <10 seconds, I'd recommend investigating what's causing such fluctuation. On the test machine I used to verify this patch, I can see <10MB of difference before and after running the test on a fixed kernel. > >> + tst_res(TFAIL, "Likely kernel memory leak detected, total decrease: %ld kB", diff_total); >> else >> tst_res(TPASS, "No memory leak found"); >> } >> -- >> 2.45.2 >> >
Hi! > >> + /* > >> + * Every 1200000 iterations, calculate the difference in memory > >> + * availability. If the difference is greater than 10 * 1024 (10MB), > >> + * increment the sample counter and log the event. > >> + */ > >> + if ((i % 1200000) == 0) { > >> + mem_avail = SAFE_READ_MEMINFO("MemAvailable:"); > >> + diff = mem_avail_prev - mem_avail; > >> + diff_total += diff; > >> + > >> + if (diff > 20 * 1024) { > > > > Shouldn't this be 10 * 1024 or possibly slightly less than 10 * 1024? > > This should be fine. The test leaks around 38MB between sample checks > and over 350MB in a whole test run. Then we should fix the comment. > > > >> + sample++; > >> + tst_res(TINFO, "MemAvailable decreased by %ld kB at iteration %d", diff, i); > >> + } > >> + > >> + mem_avail_prev = mem_avail; > >> + } > >> + } > >> > >> - if (diff > 50 * 1024) > >> - tst_res(TFAIL, "Likely kernel memory leak detected"); > >> + if ((sample > 5) || (diff_total > 100 * 1024)) > > > > Maybe this can rather be if ((sample > 5) && (diff_total > 100 * 1024)) > > > > That means that the available memory has been eaten by something and > > that it happened more or less in a linear fashion when the program was > > running. > > Imagine that some other process releases 300MB of memory while the test > is running. If you change the || to &&, you'll get a false negative in > that case. The sampling approach will protect against such interference. > > That being said, if the available memory on your test system fluctuates > by 100+MB during a test run that takes <10 seconds, I'd recommend > investigating what's causing such fluctuation. On the test machine I > used to verify this patch, I can see <10MB of difference before and > after running the test on a fixed kernel. So shall we remove the diff_total completely?
On 18. 07. 24 17:20, Cyril Hrubis wrote: >>> Maybe this can rather be if ((sample > 5) && (diff_total > 100 * 1024)) >>> >>> That means that the available memory has been eaten by something and >>> that it happened more or less in a linear fashion when the program was >>> running. >> >> Imagine that some other process releases 300MB of memory while the test >> is running. If you change the || to &&, you'll get a false negative in >> that case. The sampling approach will protect against such interference. >> >> That being said, if the available memory on your test system fluctuates >> by 100+MB during a test run that takes <10 seconds, I'd recommend >> investigating what's causing such fluctuation. On the test machine I >> used to verify this patch, I can see <10MB of difference before and >> after running the test on a fixed kernel. > > So shall we remove the diff_total completely? No, diff_total is still useful because it checks for smaller leak than the sum of samples.
Hi Martin, Cyril, On Thu, Jul 18, 2024 at 11:25 PM Martin Doucha <mdoucha@suse.cz> wrote: > On 18. 07. 24 17:20, Cyril Hrubis wrote: > >>> Maybe this can rather be if ((sample > 5) && (diff_total > 100 * 1024)) > >>> > >>> That means that the available memory has been eaten by something and > >>> that it happened more or less in a linear fashion when the program was > >>> running. > >> > >> Imagine that some other process releases 300MB of memory while the test > >> is running. If you change the || to &&, you'll get a false negative in > >> that case. The sampling approach will protect against such interference. > >> > >> That being said, if the available memory on your test system fluctuates > >> by 100+MB during a test run that takes <10 seconds, I'd recommend > >> investigating what's causing such fluctuation. On the test machine I > >> used to verify this patch, I can see <10MB of difference before and > >> after running the test on a fixed kernel. > > > > So shall we remove the diff_total completely? > > No, diff_total is still useful because it checks for smaller leak than > the sum of samples. > Thanks so much for your confirmation and test. I pushed the patch with improvements as you suggested. Let's see how it goes in future tests.
diff --git a/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c b/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c index 7dd31d3d2..1aab43e82 100644 --- a/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c +++ b/testcases/kernel/syscalls/perf_event_open/perf_event_open03.c @@ -26,6 +26,7 @@ const int iterations = 12000000; static int fd = -1; static int runtime; +static int sample; static void setup(void) { @@ -77,22 +78,41 @@ static void check_progress(int i) static void run(void) { - long diff; + long diff, diff_total, mem_avail, mem_avail_prev; int i; - diff = SAFE_READ_MEMINFO("MemAvailable:"); + sample = 0; + diff_total = 0; + + mem_avail_prev = SAFE_READ_MEMINFO("MemAvailable:"); tst_timer_start(CLOCK_MONOTONIC); /* leak about 100MB of RAM */ for (i = 0; i < iterations; i++) { ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "filter,0/0@abcd"); check_progress(i); - } - diff -= SAFE_READ_MEMINFO("MemAvailable:"); + /* + * Every 1200000 iterations, calculate the difference in memory + * availability. If the difference is greater than 10 * 1024 (10MB), + * increment the sample counter and log the event. + */ + if ((i % 1200000) == 0) { + mem_avail = SAFE_READ_MEMINFO("MemAvailable:"); + diff = mem_avail_prev - mem_avail; + diff_total += diff; + + if (diff > 20 * 1024) { + sample++; + tst_res(TINFO, "MemAvailable decreased by %ld kB at iteration %d", diff, i); + } + + mem_avail_prev = mem_avail; + } + } - if (diff > 50 * 1024) - tst_res(TFAIL, "Likely kernel memory leak detected"); + if ((sample > 5) || (diff_total > 100 * 1024)) + tst_res(TFAIL, "Likely kernel memory leak detected, total decrease: %ld kB", diff_total); else tst_res(TPASS, "No memory leak found"); }
The goal is to add more robust memory leak detection by periodically sampling memory availability during the test run and checking for significant decreases in available memory. To avoid false postive: perf_event_open03.c:95: TFAIL: Likely kernel memory leak detected Signed-off-by: Li Wang <liwang@redhat.com> --- .../perf_event_open/perf_event_open03.c | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-)