| Message ID | 20170425102009.9978-2-kai.heng.feng@canonical.com |
|---|---|
| State | New |
On 25/04/17 11:20, Kai-Heng Feng wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>
> BugLink: https://bugs.launchpad.net/bugs/1686061
>
> On Intel hardware, native_play_dead() uses mwait_play_dead() by
> default and only falls back to the other methods if that fails.
> That also happens during resume from hibernation, when the restore
> (boot) kernel runs disable_nonboot_cpus() to take all of the CPUs
> except for the boot one offline.
>
> However, that is problematic, because the address passed to
> __monitor() in mwait_play_dead() is likely to be written to in the
> last phase of hibernate image restoration and that causes the "dead"
> CPU to start executing instructions again. Unfortunately, the page
> containing the address in that CPU's instruction pointer may not be
> valid any more at that point.
>
> First, that page may have been overwritten with image kernel memory
> contents already, so the instructions the CPU attempts to execute may
> simply be invalid. Second, the page tables previously used by that
> CPU may have been overwritten by image kernel memory contents, so the
> address in its instruction pointer is impossible to resolve then.
>
> A report from Varun Koyyalagunta and investigation carried out by
> Chen Yu show that the latter sometimes happens in practice.
>
> To prevent it from happening, temporarily change the smp_ops.play_dead
> pointer during resume from hibernation so that it points to a special
> "play dead" routine which uses hlt_play_dead() and avoids the
> inadvertent "revivals" of "dead" CPUs this way.
>
> A slightly unpleasant consequence of this change is that if the
> system is hibernated with one or more CPUs offline, it will generally
> draw more power after resume than it did before hibernation, because
> the physical state entered by CPUs via hlt_play_dead() is higher-power
> than the mwait_play_dead() one in the majority of cases. It is
> possible to work around this, but it is unclear how much of a problem
> that's going to be in practice, so the workaround will be implemented
> later if it turns out to be necessary.
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=106371
> Reported-by: Varun Koyyalagunta <cpudebug@centtech.com>
> Original-by: Chen Yu <yu.c.chen@intel.com>
> Tested-by: Chen Yu <yu.c.chen@intel.com>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Acked-by: Ingo Molnar <mingo@kernel.org>
> (cherry picked from commit 406f992e4a372dafbe3c2cff7efbb2002a5c8ebd)
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
>  arch/x86/include/asm/smp.h |  1 +
>  arch/x86/kernel/smpboot.c  |  2 +-
>  arch/x86/power/cpu.c       | 30 ++++++++++++++++++++++++++++++
>  kernel/power/hibernate.c   |  7 ++++++-
>  kernel/power/power.h       |  2 ++
>  5 files changed, 40 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index f12ceef70ce2..55560e637dfd 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -147,6 +147,7 @@ int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
>  int native_cpu_disable(void);
>  int common_cpu_die(unsigned int cpu);
>  void native_cpu_die(unsigned int cpu);
> +void hlt_play_dead(void);
>  void native_play_dead(void);
>  void play_dead_common(void);
>  void wbinvd_on_cpu(int cpu);
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index afa110370045..54ad7c1d87ad 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1636,7 +1636,7 @@ static inline void mwait_play_dead(void)
>  	}
>  }
>
> -static inline void hlt_play_dead(void)
> +void hlt_play_dead(void)
>  {
>  	if (__this_cpu_read(cpu_info.x86) >= 4)
>  		wbinvd();
> diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
> index 9ab52791fed5..6f86dae52011 100644
> --- a/arch/x86/power/cpu.c
> +++ b/arch/x86/power/cpu.c
> @@ -12,6 +12,7 @@
>  #include <linux/export.h>
>  #include <linux/smp.h>
>  #include <linux/perf_event.h>
> +#include <linux/tboot.h>
>
>  #include <asm/pgtable.h>
>  #include <asm/proto.h>
> @@ -240,6 +241,35 @@ void notrace restore_processor_state(void)
>  EXPORT_SYMBOL(restore_processor_state);
>  #endif
>
> +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU)
> +static void resume_play_dead(void)
> +{
> +	play_dead_common();
> +	tboot_shutdown(TB_SHUTDOWN_WFS);
> +	hlt_play_dead();
> +}
> +
> +int hibernate_resume_nonboot_cpu_disable(void)
> +{
> +	void (*play_dead)(void) = smp_ops.play_dead;
> +	int ret;
> +
> +	/*
> +	 * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop
> +	 * during hibernate image restoration, because it is likely that the
> +	 * monitored address will be actually written to at that time and then
> +	 * the "dead" CPU will attempt to execute instructions again, but the
> +	 * address in its instruction pointer may not be possible to resolve
> +	 * any more at that point (the page tables used by it previously may
> +	 * have been overwritten by hibernate image data).
> +	 */
> +	smp_ops.play_dead = resume_play_dead;
> +	ret = disable_nonboot_cpus();
> +	smp_ops.play_dead = play_dead;
> +	return ret;
> +}
> +#endif
> +
>  /*
>   * When bsp_check() is called in hibernate and suspend, cpu hotplug
>   * is disabled already. So it's unnessary to handle race condition between
> diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
> index 922617f4c854..7ee9011eee78 100644
> --- a/kernel/power/hibernate.c
> +++ b/kernel/power/hibernate.c
> @@ -410,6 +410,11 @@ int hibernation_snapshot(int platform_mode)
>  	goto Close;
>  }
>
> +int __weak hibernate_resume_nonboot_cpu_disable(void)
> +{
> +	return disable_nonboot_cpus();
> +}
> +
>  /**
>   * resume_target_kernel - Restore system state from a hibernation image.
>   * @platform_mode: Whether or not to use the platform driver.
> @@ -434,7 +439,7 @@ static int resume_target_kernel(bool platform_mode)
>  	if (error)
>  		goto Cleanup;
>
> -	error = disable_nonboot_cpus();
> +	error = hibernate_resume_nonboot_cpu_disable();
>  	if (error)
>  		goto Enable_cpus;
>
> diff --git a/kernel/power/power.h b/kernel/power/power.h
> index caadb566e82b..8025a2baaac6 100644
> --- a/kernel/power/power.h
> +++ b/kernel/power/power.h
> @@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
>  }
>  #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
>
> +extern int hibernate_resume_nonboot_cpu_disable(void);
> +
>  /*
>   * Keep some memory free so that I/O operations can succeed without paging
>   * [Might this be more than 4 MB?]

This is a clean cherry pick of an upstream fix that has existed for a while and that we are already carrying in Yakkety+; it addresses the issue and was also tested before it landed upstream.

Acked-by: Colin Ian King <colin.king@canonical.com>
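For readers not familiar with the x86 offlining code, the failure mode the commit message describes comes from how mwait_play_dead() parks an offlined CPU: it arms a MONITOR on one cache line (in kernels of this era, a location inside the dead CPU's thread_info) and then sleeps in MWAIT, so any write to that line resumes execution. The sketch below is a simplified illustration of that loop, not the verbatim smpboot.c code; the function name mwait_play_dead_sketch() and its parameters are made up for illustration, while __monitor(), __mwait() and mb() are the real kernel helpers.

/*
 * Simplified sketch of the MWAIT-based "play dead" loop (illustrative,
 * not the verbatim mwait_play_dead() from arch/x86/kernel/smpboot.c).
 */
#include <asm/barrier.h>
#include <asm/mwait.h>

static void mwait_play_dead_sketch(void *monitored_line, unsigned long mwait_hint)
{
	for (;;) {
		__monitor(monitored_line, 0, 0);	/* arm the address monitor */
		mb();					/* order the arm vs. MWAIT */
		__mwait(mwait_hint, 0);			/* sleep until the line is written */
		/*
		 * Reaching this point means something wrote to the monitored
		 * cache line.  During hibernation restore that "something" is
		 * the image being copied back, which is exactly the unwanted
		 * wake-up the patch avoids by switching to HLT.
		 */
	}
}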
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f12ceef70ce2..55560e637dfd 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -147,6 +147,7 @@ int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
 int common_cpu_die(unsigned int cpu);
 void native_cpu_die(unsigned int cpu);
+void hlt_play_dead(void);
 void native_play_dead(void);
 void play_dead_common(void);
 void wbinvd_on_cpu(int cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index afa110370045..54ad7c1d87ad 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1636,7 +1636,7 @@ static inline void mwait_play_dead(void)
 	}
 }
 
-static inline void hlt_play_dead(void)
+void hlt_play_dead(void)
 {
 	if (__this_cpu_read(cpu_info.x86) >= 4)
 		wbinvd();
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 9ab52791fed5..6f86dae52011 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -12,6 +12,7 @@
 #include <linux/export.h>
 #include <linux/smp.h>
 #include <linux/perf_event.h>
+#include <linux/tboot.h>
 
 #include <asm/pgtable.h>
 #include <asm/proto.h>
@@ -240,6 +241,35 @@ void notrace restore_processor_state(void)
 EXPORT_SYMBOL(restore_processor_state);
 #endif
 
+#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU)
+static void resume_play_dead(void)
+{
+	play_dead_common();
+	tboot_shutdown(TB_SHUTDOWN_WFS);
+	hlt_play_dead();
+}
+
+int hibernate_resume_nonboot_cpu_disable(void)
+{
+	void (*play_dead)(void) = smp_ops.play_dead;
+	int ret;
+
+	/*
+	 * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop
+	 * during hibernate image restoration, because it is likely that the
+	 * monitored address will be actually written to at that time and then
+	 * the "dead" CPU will attempt to execute instructions again, but the
+	 * address in its instruction pointer may not be possible to resolve
+	 * any more at that point (the page tables used by it previously may
+	 * have been overwritten by hibernate image data).
+	 */
+	smp_ops.play_dead = resume_play_dead;
+	ret = disable_nonboot_cpus();
+	smp_ops.play_dead = play_dead;
+	return ret;
+}
+#endif
+
 /*
  * When bsp_check() is called in hibernate and suspend, cpu hotplug
  * is disabled already. So it's unnessary to handle race condition between
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 922617f4c854..7ee9011eee78 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -410,6 +410,11 @@ int hibernation_snapshot(int platform_mode)
 	goto Close;
 }
 
+int __weak hibernate_resume_nonboot_cpu_disable(void)
+{
+	return disable_nonboot_cpus();
+}
+
 /**
  * resume_target_kernel - Restore system state from a hibernation image.
  * @platform_mode: Whether or not to use the platform driver.
@@ -434,7 +439,7 @@ static int resume_target_kernel(bool platform_mode)
 	if (error)
 		goto Cleanup;
 
-	error = disable_nonboot_cpus();
+	error = hibernate_resume_nonboot_cpu_disable();
 	if (error)
 		goto Enable_cpus;
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index caadb566e82b..8025a2baaac6 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
 }
 #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
 
+extern int hibernate_resume_nonboot_cpu_disable(void);
+
 /*
  * Keep some memory free so that I/O operations can succeed without paging
  * [Might this be more than 4 MB?]
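Note that the diff only changes the linkage of hlt_play_dead() (dropping static inline and declaring it in smp.h) so the x86 power code can call it; its body is untouched and therefore not shown above. Roughly, per the smpboot.c of this era, it is a plain HLT loop, as sketched below (simplified; details such as the CPU0 wake-up handling are omitted). Because HLT arms no memory monitor, overwriting pages while the hibernation image is restored cannot revive the "dead" CPU.

/*
 * Rough sketch of hlt_play_dead() around this kernel version; see
 * arch/x86/kernel/smpboot.c for the real implementation.
 */
void hlt_play_dead(void)
{
	if (__this_cpu_read(cpu_info.x86) >= 4)
		wbinvd();		/* WBINVD exists from family 4 (486) onwards */

	for (;;)
		native_halt();		/* HLT with interrupts left disabled */
}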