Message ID | 20200325142906.221248-5-npiggin@gmail.com |
---|---|
State | New |
Headers | show |
Series | FWNMI follow up patches | expand |
On Thu, 26 Mar 2020 00:29:06 +1000 Nicholas Piggin <npiggin@gmail.com> wrote: > Try to be tolerant of FWNMI delivery errors if the machine check had been > recovered by the host. > > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > --- > hw/ppc/spapr_events.c | 27 ++++++++++++++++++++++----- > 1 file changed, 22 insertions(+), 5 deletions(-) > > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c > index c8964eb25d..b90ecb8afe 100644 > --- a/hw/ppc/spapr_events.c > +++ b/hw/ppc/spapr_events.c > @@ -833,13 +833,25 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered) > /* get rtas addr from fdt */ > rtas_addr = spapr_get_rtas_addr(); > if (!rtas_addr) { > - error_report( > + if (!recovered) { > + error_report( > "FWNMI: Unable to deliver machine check to guest: rtas_addr not found."); > - qemu_system_guest_panicked(NULL); > + qemu_system_guest_panicked(NULL); > + } else { > + warn_report( > +"FWNMI: Unable to deliver machine check to guest: rtas_addr not found. " > +"Machine check recovered."); > + } > g_free(ext_elog); > return; > } > > + /* > + * Must not set interlock if the MCE does not get delivered to the guest > + * in the error case above. > + */ It is a bit confusing to read "must not set interlock" and to see the interlock being set the line below IM-non-native-speaker-HO... also a small clarification of the outcome of taking the interlock without delivering the MCE could help people who aren't familiar with FWNMI to avoid doing bad things. What about something like the following ? /* * By taking the interlock, we assume that the MCE will be * delivered to the guest. CAUTION: don't add anything that * could prevent the MCE to be delivered after this line, * otherwise the guest won't be able to release the interlock * and ultimately hang/crash? */ > + spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; > + For improved paranoia, this could even be done just before calling ppc_cpu_do_fwnmi_machine_check(). Anyway, the change is good enough so: Reviewed-by: Greg Kurz <groug@kaod.org> > stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET, > env->gpr[3]); > cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET + > @@ -876,9 +888,15 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > * that CPU called "ibm,nmi-interlock") > */ > if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) { > - error_report( > + if (!recovered) { > + error_report( > "FWNMI: Unable to deliver machine check to guest: nested machine check."); > - qemu_system_guest_panicked(NULL); > + qemu_system_guest_panicked(NULL); > + } else { > + warn_report( > +"FWNMI: Unable to deliver machine check to guest: nested machine check. " > +"Machine check recovered."); > + } > return; > } > qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond); > @@ -906,7 +924,6 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > warn_report("Received a fwnmi while migration was in progress"); > } > > - spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; > spapr_mce_dispatch_elog(cpu, recovered); > } >
On Wed, Mar 25, 2020 at 07:13:32PM +0100, Greg Kurz wrote: > On Thu, 26 Mar 2020 00:29:06 +1000 > Nicholas Piggin <npiggin@gmail.com> wrote: > > > Try to be tolerant of FWNMI delivery errors if the machine check had been > > recovered by the host. > > > > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > > --- > > hw/ppc/spapr_events.c | 27 ++++++++++++++++++++++----- > > 1 file changed, 22 insertions(+), 5 deletions(-) > > > > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c > > index c8964eb25d..b90ecb8afe 100644 > > --- a/hw/ppc/spapr_events.c > > +++ b/hw/ppc/spapr_events.c > > @@ -833,13 +833,25 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered) > > /* get rtas addr from fdt */ > > rtas_addr = spapr_get_rtas_addr(); > > if (!rtas_addr) { > > - error_report( > > + if (!recovered) { > > + error_report( > > "FWNMI: Unable to deliver machine check to guest: rtas_addr not found."); > > - qemu_system_guest_panicked(NULL); > > + qemu_system_guest_panicked(NULL); > > + } else { > > + warn_report( > > +"FWNMI: Unable to deliver machine check to guest: rtas_addr not found. " > > +"Machine check recovered."); > > + } > > g_free(ext_elog); > > return; > > } > > > > + /* > > + * Must not set interlock if the MCE does not get delivered to the guest > > + * in the error case above. > > + */ > > It is a bit confusing to read "must not set interlock" and to see the > interlock being set the line below IM-non-native-speaker-HO... also > a small clarification of the outcome of taking the interlock without > delivering the MCE could help people who aren't familiar with FWNMI > to avoid doing bad things. That's a good point. That comment made sense to me (and presumably to Nick) because it directly answers a question I had on an earlier draft (which moved this without comment). But someone without that context (say, future me) could well find that confusing. > > What about something like the following ? > > /* > * By taking the interlock, we assume that the MCE will be > * delivered to the guest. CAUTION: don't add anything that > * could prevent the MCE to be delivered after this line, > * otherwise the guest won't be able to release the interlock > * and ultimately hang/crash? > */ I've substituted your comment inline. > > > + spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; > > + > > For improved paranoia, this could even be done just before calling > ppc_cpu_do_fwnmi_machine_check(). > > Anyway, the change is good enough so: > > Reviewed-by: Greg Kurz <groug@kaod.org> > > > stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET, > > env->gpr[3]); > > cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET + > > @@ -876,9 +888,15 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > > * that CPU called "ibm,nmi-interlock") > > */ > > if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) { > > - error_report( > > + if (!recovered) { > > + error_report( > > "FWNMI: Unable to deliver machine check to guest: nested machine check."); > > - qemu_system_guest_panicked(NULL); > > + qemu_system_guest_panicked(NULL); > > + } else { > > + warn_report( > > +"FWNMI: Unable to deliver machine check to guest: nested machine check. " > > +"Machine check recovered."); > > + } > > return; > > } > > qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond); > > @@ -906,7 +924,6 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > > warn_report("Received a fwnmi while migration was in progress"); > > } > > > > - spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; > > spapr_mce_dispatch_elog(cpu, recovered); > > } > > >
On Thu, Mar 26, 2020 at 12:29:06AM +1000, Nicholas Piggin wrote: > Try to be tolerant of FWNMI delivery errors if the machine check had been > recovered by the host. > > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Applied to ppc-for-5.0. > --- > hw/ppc/spapr_events.c | 27 ++++++++++++++++++++++----- > 1 file changed, 22 insertions(+), 5 deletions(-) > > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c > index c8964eb25d..b90ecb8afe 100644 > --- a/hw/ppc/spapr_events.c > +++ b/hw/ppc/spapr_events.c > @@ -833,13 +833,25 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered) > /* get rtas addr from fdt */ > rtas_addr = spapr_get_rtas_addr(); > if (!rtas_addr) { > - error_report( > + if (!recovered) { > + error_report( > "FWNMI: Unable to deliver machine check to guest: rtas_addr not found."); > - qemu_system_guest_panicked(NULL); > + qemu_system_guest_panicked(NULL); > + } else { > + warn_report( > +"FWNMI: Unable to deliver machine check to guest: rtas_addr not found. " > +"Machine check recovered."); > + } > g_free(ext_elog); > return; > } > > + /* > + * Must not set interlock if the MCE does not get delivered to the guest > + * in the error case above. > + */ > + spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; > + > stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET, > env->gpr[3]); > cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET + > @@ -876,9 +888,15 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > * that CPU called "ibm,nmi-interlock") > */ > if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) { > - error_report( > + if (!recovered) { > + error_report( > "FWNMI: Unable to deliver machine check to guest: nested machine check."); > - qemu_system_guest_panicked(NULL); > + qemu_system_guest_panicked(NULL); > + } else { > + warn_report( > +"FWNMI: Unable to deliver machine check to guest: nested machine check. " > +"Machine check recovered."); > + } > return; > } > qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond); > @@ -906,7 +924,6 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > warn_report("Received a fwnmi while migration was in progress"); > } > > - spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; > spapr_mce_dispatch_elog(cpu, recovered); > } >
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c index c8964eb25d..b90ecb8afe 100644 --- a/hw/ppc/spapr_events.c +++ b/hw/ppc/spapr_events.c @@ -833,13 +833,25 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered) /* get rtas addr from fdt */ rtas_addr = spapr_get_rtas_addr(); if (!rtas_addr) { - error_report( + if (!recovered) { + error_report( "FWNMI: Unable to deliver machine check to guest: rtas_addr not found."); - qemu_system_guest_panicked(NULL); + qemu_system_guest_panicked(NULL); + } else { + warn_report( +"FWNMI: Unable to deliver machine check to guest: rtas_addr not found. " +"Machine check recovered."); + } g_free(ext_elog); return; } + /* + * Must not set interlock if the MCE does not get delivered to the guest + * in the error case above. + */ + spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; + stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET, env->gpr[3]); cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET + @@ -876,9 +888,15 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) * that CPU called "ibm,nmi-interlock") */ if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) { - error_report( + if (!recovered) { + error_report( "FWNMI: Unable to deliver machine check to guest: nested machine check."); - qemu_system_guest_panicked(NULL); + qemu_system_guest_panicked(NULL); + } else { + warn_report( +"FWNMI: Unable to deliver machine check to guest: nested machine check. " +"Machine check recovered."); + } return; } qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond); @@ -906,7 +924,6 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) warn_report("Received a fwnmi while migration was in progress"); } - spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; spapr_mce_dispatch_elog(cpu, recovered); }
Try to be tolerant of FWNMI delivery errors if the machine check had been recovered by the host. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> --- hw/ppc/spapr_events.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-)