Message ID | 20230906135951.795581-2-william.roche@oracle.com |
---|---|
State | New |
Headers | show |
Series | Qemu crashes on VM migration after an handled memory error | expand |
On 06/09/2023 14:59, “William Roche wrote: > From: William Roche <william.roche@oracle.com> > > A memory page poisoned from the hypervisor level is no longer readable. > Thus, it is now treated as a zero-page for the ram saving migration phase. > > The migration of a VM will crash Qemu when it tries to read the > memory address space and stumbles on the poisoned page with a similar > stack trace: > > Program terminated with signal SIGBUS, Bus error. > #0 _mm256_loadu_si256 > #1 buffer_zero_avx2 > #2 select_accel_fn > #3 buffer_is_zero > #4 save_zero_page_to_file > #5 save_zero_page > #6 ram_save_target_page_legacy > #7 ram_save_host_page > #8 ram_find_and_save_block > #9 ram_save_iterate > #10 qemu_savevm_state_iterate > #11 migration_iteration_run > #12 migration_thread > #13 qemu_thread_start > > Fix it by considering poisoned pages as if they were zero-pages for > the migration copy. This fix also works with underlying large pages, > taking into account the RAMBlock segment "page-size". > > Signed-off-by: William Roche <william.roche@oracle.com> You forgot to CC the maintainers; Adding them now ./scripts/get_maintainer.pl is your friend for the next version :) > --- > accel/kvm/kvm-all.c | 14 ++++++++++++++ > accel/stubs/kvm-stub.c | 5 +++++ > include/sysemu/kvm.h | 10 ++++++++++ > migration/ram.c | 3 ++- > 4 files changed, 31 insertions(+), 1 deletion(-) > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > index 2ba7521695..24a7709495 100644 > --- a/accel/kvm/kvm-all.c > +++ b/accel/kvm/kvm-all.c > @@ -1152,6 +1152,20 @@ static void kvm_unpoison_all(void *param) > } > } > > +bool kvm_hwpoisoned_page(RAMBlock *block, void *offset) > +{ > + HWPoisonPage *pg; > + ram_addr_t ram_addr = (ram_addr_t) offset; > + > + QLIST_FOREACH(pg, &hwpoison_page_list, list) { > + if ((ram_addr >= pg->ram_addr) && > + (ram_addr - pg->ram_addr < block->page_size)) { > + return true; > + } > + } > + return false; > +} > + > void kvm_hwpoison_page_add(ram_addr_t ram_addr) > { > HWPoisonPage *page; > diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c > index 235dc661bc..c0a31611df 100644 > --- a/accel/stubs/kvm-stub.c > +++ b/accel/stubs/kvm-stub.c > @@ -133,3 +133,8 @@ uint32_t kvm_dirty_ring_size(void) > { > return 0; > } > + > +bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr) > +{ > + return false; > +} > diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h > index ebdca41052..a2196e9e6b 100644 > --- a/include/sysemu/kvm.h > +++ b/include/sysemu/kvm.h > @@ -580,4 +580,14 @@ bool kvm_arch_cpu_check_are_resettable(void); > bool kvm_dirty_ring_enabled(void); > > uint32_t kvm_dirty_ring_size(void); > + > +/** > + * kvm_hwpoisoned_page - indicate if the given page is poisoned > + * @block: memory block of the given page > + * @ram_addr: offset of the page > + * > + * Returns: true: page is poisoned > + * false: page not yet poisoned > + */ > +bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr); > #endif > diff --git a/migration/ram.c b/migration/ram.c > index 9040d66e61..48d875b12d 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -1145,7 +1145,8 @@ static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, > uint8_t *p = block->host + offset; > int len = 0; > > - if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { > + if ((kvm_enabled() && kvm_hwpoisoned_page(block, (void *)offset)) || > + buffer_is_zero(p, TARGET_PAGE_SIZE)) { > len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); > qemu_put_byte(file, 0); > len += 1;
On Wed, Sep 06, 2023 at 03:19:32PM +0100, Joao Martins wrote: > On 06/09/2023 14:59, “William Roche wrote: > > From: William Roche <william.roche@oracle.com> > > > > A memory page poisoned from the hypervisor level is no longer readable. > > Thus, it is now treated as a zero-page for the ram saving migration phase. > > > > The migration of a VM will crash Qemu when it tries to read the > > memory address space and stumbles on the poisoned page with a similar > > stack trace: > > > > Program terminated with signal SIGBUS, Bus error. > > #0 _mm256_loadu_si256 > > #1 buffer_zero_avx2 > > #2 select_accel_fn > > #3 buffer_is_zero > > #4 save_zero_page_to_file > > #5 save_zero_page > > #6 ram_save_target_page_legacy > > #7 ram_save_host_page > > #8 ram_find_and_save_block > > #9 ram_save_iterate > > #10 qemu_savevm_state_iterate > > #11 migration_iteration_run > > #12 migration_thread > > #13 qemu_thread_start > > > > Fix it by considering poisoned pages as if they were zero-pages for > > the migration copy. This fix also works with underlying large pages, > > taking into account the RAMBlock segment "page-size". > > > > Signed-off-by: William Roche <william.roche@oracle.com> > > You forgot to CC the maintainers; Adding them now > > ./scripts/get_maintainer.pl is your friend for the next version :) > > > --- > > accel/kvm/kvm-all.c | 14 ++++++++++++++ > > accel/stubs/kvm-stub.c | 5 +++++ > > include/sysemu/kvm.h | 10 ++++++++++ > > migration/ram.c | 3 ++- > > 4 files changed, 31 insertions(+), 1 deletion(-) > > > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > > index 2ba7521695..24a7709495 100644 > > --- a/accel/kvm/kvm-all.c > > +++ b/accel/kvm/kvm-all.c > > @@ -1152,6 +1152,20 @@ static void kvm_unpoison_all(void *param) > > } > > } > > > > +bool kvm_hwpoisoned_page(RAMBlock *block, void *offset) > > +{ > > + HWPoisonPage *pg; > > + ram_addr_t ram_addr = (ram_addr_t) offset; > > + > > + QLIST_FOREACH(pg, &hwpoison_page_list, list) { > > + if ((ram_addr >= pg->ram_addr) && > > + (ram_addr - pg->ram_addr < block->page_size)) { Just a note.. Probably fine for now to reuse block page size, but IIUC the right thing to do is to fetch it from the signal info (in QEMU's sigbus_handler()) of kernel_siginfo.si_addr_lsb. At least for x86 I think that stores the "shift" of covered poisoned page (one needs to track the Linux handling of VM_FAULT_HWPOISON_LARGE for a huge page, though.. not aware of any man page for that). It'll then work naturally when Linux huge pages will start to support sub-huge-page-size poisoning someday. We can definitely leave that for later. > > + return true; > > + } > > + } > > + return false; > > +} > > + > > void kvm_hwpoison_page_add(ram_addr_t ram_addr) > > { > > HWPoisonPage *page; > > diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c > > index 235dc661bc..c0a31611df 100644 > > --- a/accel/stubs/kvm-stub.c > > +++ b/accel/stubs/kvm-stub.c > > @@ -133,3 +133,8 @@ uint32_t kvm_dirty_ring_size(void) > > { > > return 0; > > } > > + > > +bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr) > > +{ > > + return false; > > +} > > diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h > > index ebdca41052..a2196e9e6b 100644 > > --- a/include/sysemu/kvm.h > > +++ b/include/sysemu/kvm.h > > @@ -580,4 +580,14 @@ bool kvm_arch_cpu_check_are_resettable(void); > > bool kvm_dirty_ring_enabled(void); > > > > uint32_t kvm_dirty_ring_size(void); > > + > > +/** > > + * kvm_hwpoisoned_page - indicate if the given page is poisoned > > + * @block: memory block of the given page > > + * @ram_addr: offset of the page > > + * > > + * Returns: true: page is poisoned > > + * false: page not yet poisoned > > + */ > > +bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr); > > #endif > > diff --git a/migration/ram.c b/migration/ram.c > > index 9040d66e61..48d875b12d 100644 > > --- a/migration/ram.c > > +++ b/migration/ram.c > > @@ -1145,7 +1145,8 @@ static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, > > uint8_t *p = block->host + offset; > > int len = 0; > > > > - if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { > > + if ((kvm_enabled() && kvm_hwpoisoned_page(block, (void *)offset)) || Can we move this out of zero page handling? Zero detection is not guaranteed to always be the 1st thing to do when processing a guest page. Currently it'll already skip either rdma or when compression enabled, so it'll keep crashing there. Perhaps at the entry of ram_save_target_page_legacy()? > > + buffer_is_zero(p, TARGET_PAGE_SIZE)) { > > len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); > > qemu_put_byte(file, 0); > > len += 1; >
On 9/6/23 17:16, Peter Xu wrote: > > Just a note.. > > Probably fine for now to reuse block page size, but IIUC the right thing to > do is to fetch it from the signal info (in QEMU's sigbus_handler()) of > kernel_siginfo.si_addr_lsb. > > At least for x86 I think that stores the "shift" of covered poisoned page > (one needs to track the Linux handling of VM_FAULT_HWPOISON_LARGE for a > huge page, though.. not aware of any man page for that). It'll then work > naturally when Linux huge pages will start to support sub-huge-page-size > poisoning someday. We can definitely leave that for later. > I totally agree with that ! >>> --- a/migration/ram.c >>> +++ b/migration/ram.c >>> @@ -1145,7 +1145,8 @@ static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, >>> uint8_t *p = block->host + offset; >>> int len = 0; >>> >>> - if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { >>> + if ((kvm_enabled() && kvm_hwpoisoned_page(block, (void *)offset)) || > > Can we move this out of zero page handling? Zero detection is not > guaranteed to always be the 1st thing to do when processing a guest page. > Currently it'll already skip either rdma or when compression enabled, so > it'll keep crashing there. > > Perhaps at the entry of ram_save_target_page_legacy()? Right, as expected, using migration compression with poisoned pages crashes even with this fix... The difficulty I see to place the poisoned page verification on the entry of ram_save_target_page_legacy() is what to do to skip the found poison page(s) if any ? Should I continue to treat them as zero pages written with save_zero_page_to_file ? Or should I consider the case of an ongoing compression use and create a new code compressing an empty page with save_compress_page() ? And what about an RDMA memory region impacted by a memory error ? This is an important aspect. Does anyone know how this situation is dealt with ? And how it should be handled in Qemu ? -- Thanks, William.
On 06/09/2023 22:29, William Roche wrote: > On 9/6/23 17:16, Peter Xu wrote: >> >> Just a note.. >> >> Probably fine for now to reuse block page size, but IIUC the right thing to >> do is to fetch it from the signal info (in QEMU's sigbus_handler()) of >> kernel_siginfo.si_addr_lsb. >> >> At least for x86 I think that stores the "shift" of covered poisoned page >> (one needs to track the Linux handling of VM_FAULT_HWPOISON_LARGE for a >> huge page, though.. not aware of any man page for that). It'll then work >> naturally when Linux huge pages will start to support sub-huge-page-size >> poisoning someday. We can definitely leave that for later. >> > > I totally agree with that ! > Provided this bug affects all qemu versions thus far, perhaps should be a follow up series, to make the changer easier to bring into stable tree. > >>>> --- a/migration/ram.c >>>> +++ b/migration/ram.c >>>> @@ -1145,7 +1145,8 @@ static int save_zero_page_to_file(PageSearchStatus >>>> *pss, QEMUFile *file, >>>> uint8_t *p = block->host + offset; >>>> int len = 0; >>>> - if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { >>>> + if ((kvm_enabled() && kvm_hwpoisoned_page(block, (void *)offset)) || >> >> Can we move this out of zero page handling? Zero detection is not >> guaranteed to always be the 1st thing to do when processing a guest page. >> Currently it'll already skip either rdma or when compression enabled, so >> it'll keep crashing there. >> >> Perhaps at the entry of ram_save_target_page_legacy()? > > Right, as expected, using migration compression with poisoned pages crashes even > with this fix... > > The difficulty I see to place the poisoned page verification on the > entry of ram_save_target_page_legacy() is what to do to skip the found poison > page(s) if any ? > > Should I continue to treat them as zero pages written with > save_zero_page_to_file ? MCE had already been forward to the guest, so guest is supposed to not be using the page (nor rely on its contents). Hence destination ought to just see a zero page. So what you said seems like the best course of action. > Or should I consider the case of an ongoing compression > use and create a new code compressing an empty page with save_compress_page() ? > The compress code looks to be a tentative compression (not guaranteed IIUC), so I am not sure it needs any more logic that just adding at the top of ram_save_target_page_legacy() as Peter suggested? > And what about an RDMA memory region impacted by a memory error ? > This is an important aspect. > Does anyone know how this situation is dealt with ? And how it should be handled > in Qemu ? > If you refer to guest RDMA MRs that is just guest RAM, not sure we are even aware of those from qemu. But if you refer to the RDMA transport that sits below the Qemu file (or rather acts as an implementation of QemuFile), so handling in ram_save_target_page_legacy() already seems to cover it. > -- > Thanks, > William.
On Sat, Sep 09, 2023 at 03:57:44PM +0100, Joao Martins wrote: > > Should I continue to treat them as zero pages written with > > save_zero_page_to_file ? > > MCE had already been forward to the guest, so guest is supposed to not be using > the page (nor rely on its contents). Hence destination ought to just see a zero > page. So what you said seems like the best course of action. > > > Or should I consider the case of an ongoing compression > > use and create a new code compressing an empty page with save_compress_page() ? > > > The compress code looks to be a tentative compression (not guaranteed IIUC), so > I am not sure it needs any more logic that just adding at the top of > ram_save_target_page_legacy() as Peter suggested? > > > And what about an RDMA memory region impacted by a memory error ? > > This is an important aspect. > > Does anyone know how this situation is dealt with ? And how it should be handled > > in Qemu ? > > > > If you refer to guest RDMA MRs that is just guest RAM, not sure we are even > aware of those from qemu. But if you refer to the RDMA transport that sits below > the Qemu file (or rather acts as an implementation of QemuFile), so handling in > ram_save_target_page_legacy() already seems to cover it. I'm also not familiar enough with RDMA, but it looks tricky indeed. AFAIU it's leveraging RDMA_CONTROL_COMPRESS for zero pages for now (with RDMACompress.value==0), so it doesn't seem to be using generic migration protocols. If we want to fix all places well, one way to consider is to introduce migration_buffer_is_zero(), which can be a wrapper for buffer_is_zero() by default, but also returns true for poisoned pages before reading the buffer. Then we use it in all three places: - For compression, in do_compress_ram_page() - For RDMA, in qemu_rdma_write_one() - For generic migration, in save_zero_page_to_file() (your current patch) I suppose then all cases will be fixed. We need to make sure we'll always use migration_buffer_is_zero() as the 1st thing to call when QEMU wants to migrate a target page. Maybe it'll worth a comment above that function. Thanks,
On Mon, Sep 11, 2023 at 03:48:38PM -0400, Peter Xu wrote: > On Sat, Sep 09, 2023 at 03:57:44PM +0100, Joao Martins wrote: > > > Should I continue to treat them as zero pages written with > > > save_zero_page_to_file ? > > > > MCE had already been forward to the guest, so guest is supposed to not be using > > the page (nor rely on its contents). Hence destination ought to just see a zero > > page. So what you said seems like the best course of action. > > > > > Or should I consider the case of an ongoing compression > > > use and create a new code compressing an empty page with save_compress_page() ? > > > > > The compress code looks to be a tentative compression (not guaranteed IIUC), so > > I am not sure it needs any more logic that just adding at the top of > > ram_save_target_page_legacy() as Peter suggested? > > > > > And what about an RDMA memory region impacted by a memory error ? > > > This is an important aspect. > > > Does anyone know how this situation is dealt with ? And how it should be handled > > > in Qemu ? > > > > > > > If you refer to guest RDMA MRs that is just guest RAM, not sure we are even > > aware of those from qemu. But if you refer to the RDMA transport that sits below > > the Qemu file (or rather acts as an implementation of QemuFile), so handling in > > ram_save_target_page_legacy() already seems to cover it. > > I'm also not familiar enough with RDMA, but it looks tricky indeed. AFAIU > it's leveraging RDMA_CONTROL_COMPRESS for zero pages for now (with > RDMACompress.value==0), so it doesn't seem to be using generic migration > protocols. > > If we want to fix all places well, one way to consider is to introduce > migration_buffer_is_zero(), which can be a wrapper for buffer_is_zero() by > default, but also returns true for poisoned pages before reading the > buffer. Then we use it in all three places: > > - For compression, in do_compress_ram_page() > - For RDMA, in qemu_rdma_write_one() Ah, this may not be enough.. sorry. It seems this is only one path that RDMA will use to save a target page, for (!rdma->pin_all || !block->is_ram_block) && !block->remote_keys[chunk]. RDMA seems to also possible to merge buffers if virtually continuous (qemu_rdma_buffer_mergable()), so IIUC it may not trigger an immediate access to the guest page until later if it finds continuous pages and skip even more logic. I suspect that's also problematic for poisoned pages so we should not allow any merged buffer to contain a poisoned page. Not sure how complicated will it be to fix rdma specifically, copy again two rdma developers. One option is we state the issue in rdma and fix non-rdma first. Looks like rdma needs its own fix anyway. > - For generic migration, in save_zero_page_to_file() (your current patch) > > I suppose then all cases will be fixed. We need to make sure we'll always > use migration_buffer_is_zero() as the 1st thing to call when QEMU wants to > migrate a target page. Maybe it'll worth a comment above that function. > > Thanks, > > -- > Peter Xu
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 2ba7521695..24a7709495 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1152,6 +1152,20 @@ static void kvm_unpoison_all(void *param) } } +bool kvm_hwpoisoned_page(RAMBlock *block, void *offset) +{ + HWPoisonPage *pg; + ram_addr_t ram_addr = (ram_addr_t) offset; + + QLIST_FOREACH(pg, &hwpoison_page_list, list) { + if ((ram_addr >= pg->ram_addr) && + (ram_addr - pg->ram_addr < block->page_size)) { + return true; + } + } + return false; +} + void kvm_hwpoison_page_add(ram_addr_t ram_addr) { HWPoisonPage *page; diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 235dc661bc..c0a31611df 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -133,3 +133,8 @@ uint32_t kvm_dirty_ring_size(void) { return 0; } + +bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr) +{ + return false; +} diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index ebdca41052..a2196e9e6b 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -580,4 +580,14 @@ bool kvm_arch_cpu_check_are_resettable(void); bool kvm_dirty_ring_enabled(void); uint32_t kvm_dirty_ring_size(void); + +/** + * kvm_hwpoisoned_page - indicate if the given page is poisoned + * @block: memory block of the given page + * @ram_addr: offset of the page + * + * Returns: true: page is poisoned + * false: page not yet poisoned + */ +bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr); #endif diff --git a/migration/ram.c b/migration/ram.c index 9040d66e61..48d875b12d 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1145,7 +1145,8 @@ static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, uint8_t *p = block->host + offset; int len = 0; - if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { + if ((kvm_enabled() && kvm_hwpoisoned_page(block, (void *)offset)) || + buffer_is_zero(p, TARGET_PAGE_SIZE)) { len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); qemu_put_byte(file, 0); len += 1;