Message ID | 20211029083908.87931-2-npiggin@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | [1/2] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/github-powerpc_ppctests | success | Successfully ran 8 jobs. |
snowpatch_ozlabs/github-powerpc_selftests | success | Successfully ran 8 jobs. |
snowpatch_ozlabs/github-powerpc_sparse | success | Successfully ran 4 jobs. |
snowpatch_ozlabs/github-powerpc_clang | success | Successfully ran 7 jobs. |
snowpatch_ozlabs/github-powerpc_kernel_qemu | success | Successfully ran 24 jobs. |
Le 29/10/2021 à 10:39, Nicholas Piggin a écrit : > There is a deadlock with the console_owner lock and the wd_smp_lock: > > CPU x takes the console_owner lock > CPU y takes a watchdog timer interrupt and takes __wd_smp_lock > CPU x takes a watchdog timer interrupt and spins on __wd_smp_lock > CPU y detects a deadlock, tries to print something and spins on console_owner > -> deadlock > > Change the watchdog locking scheme so wd_smp_lock protects the watchdog > internal data, but "reporting" (printing, issuing NMI IPIs, taking any > action outside of watchdog) uses a non-waiting exclusion. If a CPU detects > a problem but can not take the reporting lock, it just returns because > something else is already reporting. It will try again at some point. > > Typically hard lockup watchdog report usefulness is not impacted due to > failure to spewing a large enough amount of data in as short a time as > possible, but by messages getting garbled. > > Laurent debugged this and found the deadlock, and this patch is based on > his general approach to avoid expensive operations while holding the lock. > With the addition of the reporting exclusion. > > Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> > [np: rework to add reporting exclusion update changelog] > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > --- > arch/powerpc/kernel/watchdog.c | 76 ++++++++++++++++++++++++++-------- > 1 file changed, 59 insertions(+), 17 deletions(-) > > diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c > index 4bb7c8e371a2..69a475aa0f44 100644 > --- a/arch/powerpc/kernel/watchdog.c > +++ b/arch/powerpc/kernel/watchdog.c > @@ -85,10 +85,32 @@ static DEFINE_PER_CPU(u64, wd_timer_tb); > > /* SMP checker bits */ > static unsigned long __wd_smp_lock; > +static unsigned long __wd_reporting; > static cpumask_t wd_smp_cpus_pending; > static cpumask_t wd_smp_cpus_stuck; > static u64 wd_smp_last_reset_tb; > > +/* > + * Try to take the exclusive watchdog action / NMI IPI / printing lock. > + * wd_smp_lock must be held. If this fails, we should return and wait > + * for the watchdog to kick in again (or another CPU to trigger it). > + */ > +static bool wd_try_report(void) > +{ > + if (__wd_reporting) > + return false; > + __wd_reporting = 1; > + return true; > +} > + > +/* End printing after successful wd_try_report. wd_smp_lock not required. */ > +static void wd_end_reporting(void) > +{ > + smp_mb(); /* End printing "critical section" */ > + WARN_ON_ONCE(__wd_reporting == 0); > + WRITE_ONCE(__wd_reporting, 0); > +} > + > static inline void wd_smp_lock(unsigned long *flags) > { > /* > @@ -131,10 +153,10 @@ static void wd_lockup_ipi(struct pt_regs *regs) > /* Do not panic from here because that can recurse into NMI IPI layer */ > } > > -static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) > +static void set_cpu_stuck(int cpu, u64 tb) > { > - cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask); > - cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask); > + cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); > + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); > if (cpumask_empty(&wd_smp_cpus_pending)) { > wd_smp_last_reset_tb = tb; > cpumask_andnot(&wd_smp_cpus_pending, > @@ -142,10 +164,6 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) > &wd_smp_cpus_stuck); > } > } > -static void set_cpu_stuck(int cpu, u64 tb) > -{ > - set_cpumask_stuck(cpumask_of(cpu), tb); > -} > > static void watchdog_smp_panic(int cpu, u64 tb) > { > @@ -160,6 +178,9 @@ static void watchdog_smp_panic(int cpu, u64 tb) > goto out; > if (cpumask_weight(&wd_smp_cpus_pending) == 0) > goto out; > + if (!wd_try_report()) > + goto out; > + wd_smp_unlock(&flags); > > pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n", > cpu, cpumask_pr_args(&wd_smp_cpus_pending)); > @@ -172,24 +193,32 @@ static void watchdog_smp_panic(int cpu, u64 tb) > * Try to trigger the stuck CPUs, unless we are going to > * get a backtrace on all of them anyway. > */ > - for_each_cpu(c, &wd_smp_cpus_pending) { > + for_each_online_cpu(c) { > if (c == cpu) > continue; > + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) ^ c cpu is the reporting CPU, c is the target here. > + continue; > + wd_smp_lock(&flags); > + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { ^ again c > + wd_smp_unlock(&flags); > + continue; > + } > + /* Take the stuck CPU out of the watch group */ > + set_cpu_stuck(cpu, tb); ^ c > + wd_smp_unlock(&flags); > + > smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); > } > } > > - /* Take the stuck CPUs out of the watch group */ > - set_cpumask_stuck(&wd_smp_cpus_pending, tb); > - > - wd_smp_unlock(&flags); > - > if (sysctl_hardlockup_all_cpu_backtrace) > trigger_allbutself_cpu_backtrace(); > > if (hardlockup_panic) > nmi_panic(NULL, "Hard LOCKUP"); > > + wd_end_reporting(); > + > return; > > out: > @@ -203,8 +232,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) > struct pt_regs *regs = get_irq_regs(); > unsigned long flags; > > - wd_smp_lock(&flags); > - > pr_emerg("CPU %d became unstuck TB:%lld\n", > cpu, tb); > print_irqtrace_events(current); > @@ -213,6 +240,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) > else > dump_stack(); > > + wd_smp_lock(&flags); > cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); > wd_smp_unlock(&flags); > } else { > @@ -291,8 +319,17 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) > wd_smp_unlock(&flags); > return 0; > } > + if (!wd_try_report()) { > + wd_smp_unlock(&flags); > + /* Couldn't report, try again in 100ms */ > + mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000); > + return 0; > + } > + > set_cpu_stuck(cpu, tb); > > + wd_smp_unlock(&flags); > + > pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n", > cpu, (void *)regs->nip); > pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n", > @@ -302,14 +339,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) > print_irqtrace_events(current); > show_regs(regs); > > - wd_smp_unlock(&flags); > - > if (sysctl_hardlockup_all_cpu_backtrace) > trigger_allbutself_cpu_backtrace(); > > if (hardlockup_panic) > nmi_panic(regs, "Hard LOCKUP"); > + > + wd_end_reporting(); > } > + /* > + * We are okay to change DEC in soft_nmi_interrupt because the masked > + * handler has marked a DEC as pending, so the timer interrupt will be > + * replayed as soon as local irqs are enabled again. > + */ > if (wd_panic_timeout_tb < 0x7fffffff) > mtspr(SPRN_DEC, wd_panic_timeout_tb); > >
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 4bb7c8e371a2..69a475aa0f44 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -85,10 +85,32 @@ static DEFINE_PER_CPU(u64, wd_timer_tb); /* SMP checker bits */ static unsigned long __wd_smp_lock; +static unsigned long __wd_reporting; static cpumask_t wd_smp_cpus_pending; static cpumask_t wd_smp_cpus_stuck; static u64 wd_smp_last_reset_tb; +/* + * Try to take the exclusive watchdog action / NMI IPI / printing lock. + * wd_smp_lock must be held. If this fails, we should return and wait + * for the watchdog to kick in again (or another CPU to trigger it). + */ +static bool wd_try_report(void) +{ + if (__wd_reporting) + return false; + __wd_reporting = 1; + return true; +} + +/* End printing after successful wd_try_report. wd_smp_lock not required. */ +static void wd_end_reporting(void) +{ + smp_mb(); /* End printing "critical section" */ + WARN_ON_ONCE(__wd_reporting == 0); + WRITE_ONCE(__wd_reporting, 0); +} + static inline void wd_smp_lock(unsigned long *flags) { /* @@ -131,10 +153,10 @@ static void wd_lockup_ipi(struct pt_regs *regs) /* Do not panic from here because that can recurse into NMI IPI layer */ } -static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) +static void set_cpu_stuck(int cpu, u64 tb) { - cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask); - cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask); + cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); if (cpumask_empty(&wd_smp_cpus_pending)) { wd_smp_last_reset_tb = tb; cpumask_andnot(&wd_smp_cpus_pending, @@ -142,10 +164,6 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) &wd_smp_cpus_stuck); } } -static void set_cpu_stuck(int cpu, u64 tb) -{ - set_cpumask_stuck(cpumask_of(cpu), tb); -} static void watchdog_smp_panic(int cpu, u64 tb) { @@ -160,6 +178,9 @@ static void watchdog_smp_panic(int cpu, u64 tb) goto out; if (cpumask_weight(&wd_smp_cpus_pending) == 0) goto out; + if (!wd_try_report()) + goto out; + wd_smp_unlock(&flags); pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n", cpu, cpumask_pr_args(&wd_smp_cpus_pending)); @@ -172,24 +193,32 @@ static void watchdog_smp_panic(int cpu, u64 tb) * Try to trigger the stuck CPUs, unless we are going to * get a backtrace on all of them anyway. */ - for_each_cpu(c, &wd_smp_cpus_pending) { + for_each_online_cpu(c) { if (c == cpu) continue; + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) + continue; + wd_smp_lock(&flags); + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { + wd_smp_unlock(&flags); + continue; + } + /* Take the stuck CPU out of the watch group */ + set_cpu_stuck(cpu, tb); + wd_smp_unlock(&flags); + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); } } - /* Take the stuck CPUs out of the watch group */ - set_cpumask_stuck(&wd_smp_cpus_pending, tb); - - wd_smp_unlock(&flags); - if (sysctl_hardlockup_all_cpu_backtrace) trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); + wd_end_reporting(); + return; out: @@ -203,8 +232,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) struct pt_regs *regs = get_irq_regs(); unsigned long flags; - wd_smp_lock(&flags); - pr_emerg("CPU %d became unstuck TB:%lld\n", cpu, tb); print_irqtrace_events(current); @@ -213,6 +240,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) else dump_stack(); + wd_smp_lock(&flags); cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); wd_smp_unlock(&flags); } else { @@ -291,8 +319,17 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) wd_smp_unlock(&flags); return 0; } + if (!wd_try_report()) { + wd_smp_unlock(&flags); + /* Couldn't report, try again in 100ms */ + mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000); + return 0; + } + set_cpu_stuck(cpu, tb); + wd_smp_unlock(&flags); + pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n", cpu, (void *)regs->nip); pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n", @@ -302,14 +339,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) print_irqtrace_events(current); show_regs(regs); - wd_smp_unlock(&flags); - if (sysctl_hardlockup_all_cpu_backtrace) trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); + + wd_end_reporting(); } + /* + * We are okay to change DEC in soft_nmi_interrupt because the masked + * handler has marked a DEC as pending, so the timer interrupt will be + * replayed as soon as local irqs are enabled again. + */ if (wd_panic_timeout_tb < 0x7fffffff) mtspr(SPRN_DEC, wd_panic_timeout_tb);