diff mbox series

[v3,1/2] powerpc/mce: Avoid using irq_work_queue() in realmode

Message ID 20211124095500.98656-1-ganeshgr@linux.ibm.com (mailing list archive)
State Superseded
Headers show
Series [v3,1/2] powerpc/mce: Avoid using irq_work_queue() in realmode | expand

Commit Message

Ganesh Goudar Nov. 24, 2021, 9:54 a.m. UTC
In realmode mce handler we use irq_work_queue() to defer
the processing of mce events, irq_work_queue() can only
be called when translation is enabled because it touches
memory outside RMA, hence we enable translation before
calling irq_work_queue and disable on return, though it
is not safe to do in realmode.

To avoid this, program the decrementer and call the event
processing functions from timer handler.

Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
V2:
* Use arch_irq_work_raise to raise decrementer interrupt.
* Avoid having atomic variable.

V3:
* Fix build error.
  Reported by kernel test bot.
---
 arch/powerpc/include/asm/machdep.h       |  2 +
 arch/powerpc/include/asm/mce.h           |  2 +
 arch/powerpc/include/asm/paca.h          |  1 +
 arch/powerpc/kernel/mce.c                | 51 +++++++++++-------------
 arch/powerpc/kernel/time.c               |  3 ++
 arch/powerpc/platforms/pseries/pseries.h |  1 +
 arch/powerpc/platforms/pseries/ras.c     | 31 +-------------
 arch/powerpc/platforms/pseries/setup.c   |  1 +
 8 files changed, 34 insertions(+), 58 deletions(-)

Comments

Nicholas Piggin Nov. 24, 2021, 1:03 p.m. UTC | #1
Excerpts from Ganesh Goudar's message of November 24, 2021 7:54 pm:
> In realmode mce handler we use irq_work_queue() to defer
> the processing of mce events, irq_work_queue() can only
> be called when translation is enabled because it touches
> memory outside RMA, hence we enable translation before
> calling irq_work_queue and disable on return, though it
> is not safe to do in realmode.
> 
> To avoid this, program the decrementer and call the event
> processing functions from timer handler.
> 
> Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
> ---
> V2:
> * Use arch_irq_work_raise to raise decrementer interrupt.
> * Avoid having atomic variable.
> 
> V3:
> * Fix build error.
>   Reported by kernel test bot.
> ---
>  arch/powerpc/include/asm/machdep.h       |  2 +
>  arch/powerpc/include/asm/mce.h           |  2 +
>  arch/powerpc/include/asm/paca.h          |  1 +
>  arch/powerpc/kernel/mce.c                | 51 +++++++++++-------------
>  arch/powerpc/kernel/time.c               |  3 ++
>  arch/powerpc/platforms/pseries/pseries.h |  1 +
>  arch/powerpc/platforms/pseries/ras.c     | 31 +-------------
>  arch/powerpc/platforms/pseries/setup.c   |  1 +
>  8 files changed, 34 insertions(+), 58 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
> index 9c3c9f04129f..d22b222ba471 100644
> --- a/arch/powerpc/include/asm/machdep.h
> +++ b/arch/powerpc/include/asm/machdep.h
> @@ -99,6 +99,8 @@ struct machdep_calls {
>  	/* Called during machine check exception to retrive fixup address. */
>  	bool		(*mce_check_early_recovery)(struct pt_regs *regs);
>  
> +	void            (*machine_check_log_err)(void);
> +
>  	/* Motherboard/chipset features. This is a kind of general purpose
>  	 * hook used to control some machine specific features (like reset
>  	 * lines, chip power control, etc...).
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index 331d944280b8..6e306aaf58aa 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
>  unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>  extern void mce_common_process_ue(struct pt_regs *regs,
>  				  struct mce_error_info *mce_err);
> +void machine_check_raise_dec_intr(void);
>  int mce_register_notifier(struct notifier_block *nb);
>  int mce_unregister_notifier(struct notifier_block *nb);
> +void mce_run_late_handlers(void);
>  #ifdef CONFIG_PPC_BOOK3S_64
>  void flush_and_reload_slb(void);
>  void flush_erat(void);
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index dc05a862e72a..d463c796f7fa 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -280,6 +280,7 @@ struct paca_struct {
>  #endif
>  #ifdef CONFIG_PPC_BOOK3S_64
>  	struct mce_info *mce_info;
> +	u32 mces_to_process;
>  #endif /* CONFIG_PPC_BOOK3S_64 */
>  } ____cacheline_aligned;
>  
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index fd829f7f25a4..8e17f29472a0 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -28,19 +28,9 @@
>  
>  #include "setup.h"
>  
> -static void machine_check_process_queued_event(struct irq_work *work);
> -static void machine_check_ue_irq_work(struct irq_work *work);
>  static void machine_check_ue_event(struct machine_check_event *evt);
>  static void machine_process_ue_event(struct work_struct *work);
>  
> -static struct irq_work mce_event_process_work = {
> -        .func = machine_check_process_queued_event,
> -};
> -
> -static struct irq_work mce_ue_event_irq_work = {
> -	.func = machine_check_ue_irq_work,
> -};
> -
>  static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
>  
>  static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
> @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce,
>  	}
>  }
>  
> +/* Raise decrementer interrupt */
> +void machine_check_raise_dec_intr(void)
> +{
> +	arch_irq_work_raise();
> +}

It would be better if the name specifically related to irq work, which 
is more than just dec interrupt. It might be good to set mces_to_process
here as well.

I would name it something like mce_irq_work_queue, and the paca variable
to mce_pending_irq_work...


> +void mce_run_late_handlers(void)
> +{
> +	if (unlikely(local_paca->mces_to_process)) {
> +		if (ppc_md.machine_check_log_err)
> +			ppc_md.machine_check_log_err();
> +		machine_check_process_queued_event();
> +		machine_check_ue_work();
> +		local_paca->mces_to_process--;
> +	}
> +}

The problem with a counter is that you're clearing the irq work pending
in the timer interrupt, so you'll never call in here again to clear that
(until something else sets irq work).

But as far as I can see it does not need to be a counter, just a flag.
The machine check calls will process multiple events, right? (and the
current irq_work queue does not queue the same work multiple times).

Oh. That's actually bad, isn't it? Our irq work should be per-CPU
because the callbacks are mainly only operating on the local paca
queued events, so we have a longstanding bug there AFAIKS. Your patch
will solve it if everything is converted over.

> +
>  void machine_check_print_event_info(struct machine_check_event *evt,
>  				    bool user_mode, bool in_guest)
>  {
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index cae8f03a44fe..94c591b6f9d2 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -594,6 +594,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
>  
>  	if (test_irq_work_pending()) {
>  		clear_irq_work_pending();
> +#ifdef CONFIG_PPC_BOOK3S_64
> +		mce_run_late_handlers();
> +#endif

Maybe create a no-op inline function for others and call unconditionally 
here. I wonder if the name could be better, we have lots of handlers, of
varying earliness. real-mode, then virt mode NMI context, then IRQ 
context, then workqueue context.

mce_run_irq_context_handlers() might not be much better though.

>  		irq_work_run();
>  	}
>  
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 3544778e06d0..9cf0d33dfbf5 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -21,6 +21,7 @@ struct pt_regs;
>  extern int pSeries_system_reset_exception(struct pt_regs *regs);
>  extern int pSeries_machine_check_exception(struct pt_regs *regs);
>  extern long pseries_machine_check_realmode(struct pt_regs *regs);
> +void pSeries_machine_check_log_err(void);
>  
>  #ifdef CONFIG_SMP
>  extern void smp_init_pseries(void);
> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
> index 56092dccfdb8..8613f9cc5798 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -23,11 +23,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock);
>  
>  static int ras_check_exception_token;
>  
> -static void mce_process_errlog_event(struct irq_work *work);
> -static struct irq_work mce_errlog_process_work = {
> -	.func = mce_process_errlog_event,
> -};
> -
>  #define EPOW_SENSOR_TOKEN	9
>  #define EPOW_SENSOR_INDEX	0
>  
> @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>  	error_type = mce_log->error_type;
>  
>  	disposition = mce_handle_err_realmode(disposition, error_type);
> -
> -	/*
> -	 * Enable translation as we will be accessing per-cpu variables
> -	 * in save_mce_event() which may fall outside RMO region, also
> -	 * leave it enabled because subsequently we will be queuing work
> -	 * to workqueues where again per-cpu variables accessed, besides
> -	 * fwnmi_release_errinfo() crashes when called in realmode on
> -	 * pseries.
> -	 * Note: All the realmode handling like flushing SLB entries for
> -	 *       SLB multihit is done by now.
> -	 */
>  out:
> -	msr = mfmsr();
> -	mtmsr(msr | MSR_IR | MSR_DR);
> -
>  	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>  					      disposition);
> -
> -	/*
> -	 * Queue irq work to log this rtas event later.
> -	 * irq_work_queue uses per-cpu variables, so do this in virt
> -	 * mode as well.
> -	 */
> -	irq_work_queue(&mce_errlog_process_work);
> -
> -	mtmsr(msr);
> -
>  	return disposition;
>  }
>  
>  /*
>   * Process MCE rtas errlog event.
>   */
> -static void mce_process_errlog_event(struct irq_work *work)
> +void pSeries_machine_check_log_err(void)
>  {
>  	struct rtas_error_log *err;
>  
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 8a62af5b9c24..9bdc487b8e35 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -1084,6 +1084,7 @@ define_machine(pseries) {
>  	.system_reset_exception = pSeries_system_reset_exception,
>  	.machine_check_early	= pseries_machine_check_realmode,
>  	.machine_check_exception = pSeries_machine_check_exception,
> +	.machine_check_log_err	= pSeries_machine_check_log_err,
>  #ifdef CONFIG_KEXEC_CORE
>  	.machine_kexec          = pSeries_machine_kexec,
>  	.kexec_cpu_down         = pseries_kexec_cpu_down,
> -- 
> 2.31.1
> 
>
Ganesh Goudar Jan. 17, 2022, 8:09 a.m. UTC | #2
On 11/24/21 18:33, Nicholas Piggin wrote:

> Excerpts from Ganesh Goudar's message of November 24, 2021 7:54 pm:
>> In realmode mce handler we use irq_work_queue() to defer
>> the processing of mce events, irq_work_queue() can only
>> be called when translation is enabled because it touches
>> memory outside RMA, hence we enable translation before
>> calling irq_work_queue and disable on return, though it
>> is not safe to do in realmode.
>>
>> To avoid this, program the decrementer and call the event
>> processing functions from timer handler.
>>
>> Signed-off-by: Ganesh Goudar<ganeshgr@linux.ibm.com>
>> ---
>> V2:
>> * Use arch_irq_work_raise to raise decrementer interrupt.
>> * Avoid having atomic variable.
>>
>> V3:
>> * Fix build error.
>>    Reported by kernel test bot.
>> ---
>>   arch/powerpc/include/asm/machdep.h       |  2 +
>>   arch/powerpc/include/asm/mce.h           |  2 +
>>   arch/powerpc/include/asm/paca.h          |  1 +
>>   arch/powerpc/kernel/mce.c                | 51 +++++++++++-------------
>>   arch/powerpc/kernel/time.c               |  3 ++
>>   arch/powerpc/platforms/pseries/pseries.h |  1 +
>>   arch/powerpc/platforms/pseries/ras.c     | 31 +-------------
>>   arch/powerpc/platforms/pseries/setup.c   |  1 +
>>   8 files changed, 34 insertions(+), 58 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
>> index 9c3c9f04129f..d22b222ba471 100644
>> --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -99,6 +99,8 @@ struct machdep_calls {
>>   	/* Called during machine check exception to retrive fixup address. */
>>   	bool		(*mce_check_early_recovery)(struct pt_regs *regs);
>>   
>> +	void            (*machine_check_log_err)(void);
>> +
>>   	/* Motherboard/chipset features. This is a kind of general purpose
>>   	 * hook used to control some machine specific features (like reset
>>   	 * lines, chip power control, etc...).
>> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
>> index 331d944280b8..6e306aaf58aa 100644
>> --- a/arch/powerpc/include/asm/mce.h
>> +++ b/arch/powerpc/include/asm/mce.h
>> @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
>>   unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>>   extern void mce_common_process_ue(struct pt_regs *regs,
>>   				  struct mce_error_info *mce_err);
>> +void machine_check_raise_dec_intr(void);
>>   int mce_register_notifier(struct notifier_block *nb);
>>   int mce_unregister_notifier(struct notifier_block *nb);
>> +void mce_run_late_handlers(void);
>>   #ifdef CONFIG_PPC_BOOK3S_64
>>   void flush_and_reload_slb(void);
>>   void flush_erat(void);
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index dc05a862e72a..d463c796f7fa 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -280,6 +280,7 @@ struct paca_struct {
>>   #endif
>>   #ifdef CONFIG_PPC_BOOK3S_64
>>   	struct mce_info *mce_info;
>> +	u32 mces_to_process;
>>   #endif /* CONFIG_PPC_BOOK3S_64 */
>>   } ____cacheline_aligned;
>>   
>> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
>> index fd829f7f25a4..8e17f29472a0 100644
>> --- a/arch/powerpc/kernel/mce.c
>> +++ b/arch/powerpc/kernel/mce.c
>> @@ -28,19 +28,9 @@
>>   
>>   #include "setup.h"
>>   
>> -static void machine_check_process_queued_event(struct irq_work *work);
>> -static void machine_check_ue_irq_work(struct irq_work *work);
>>   static void machine_check_ue_event(struct machine_check_event *evt);
>>   static void machine_process_ue_event(struct work_struct *work);
>>   
>> -static struct irq_work mce_event_process_work = {
>> -        .func = machine_check_process_queued_event,
>> -};
>> -
>> -static struct irq_work mce_ue_event_irq_work = {
>> -	.func = machine_check_ue_irq_work,
>> -};
>> -
>>   static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
>>   
>>   static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
>> @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce,
>>   	}
>>   }
>>   
>> +/* Raise decrementer interrupt */
>> +void machine_check_raise_dec_intr(void)
>> +{
>> +	arch_irq_work_raise();
>> +}
> It would be better if the name specifically related to irq work, which
> is more than just dec interrupt. It might be good to set mces_to_process
> here as well.

Sure

>
> I would name it something like mce_irq_work_queue, and the paca variable
> to mce_pending_irq_work...

Ok

>
>
>> +void mce_run_late_handlers(void)
>> +{
>> +	if (unlikely(local_paca->mces_to_process)) {
>> +		if (ppc_md.machine_check_log_err)
>> +			ppc_md.machine_check_log_err();
>> +		machine_check_process_queued_event();
>> +		machine_check_ue_work();
>> +		local_paca->mces_to_process--;
>> +	}
>> +}
> The problem with a counter is that you're clearing the irq work pending
> in the timer interrupt, so you'll never call in here again to clear that
> (until something else sets irq work).
>
> But as far as I can see it does not need to be a counter, just a flag.
> The machine check calls will process multiple events, right? (and the
> current irq_work queue does not queue the same work multiple times).

You are right, It can just be a flag.

>
> Oh. That's actually bad, isn't it? Our irq work should be per-CPU
> because the callbacks are mainly only operating on the local paca
> queued events, so we have a longstanding bug there AFAIKS. Your patch
> will solve it if everything is converted over.
>
>> +
>>   void machine_check_print_event_info(struct machine_check_event *evt,
>>   				    bool user_mode, bool in_guest)
>>   {
>> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
>> index cae8f03a44fe..94c591b6f9d2 100644
>> --- a/arch/powerpc/kernel/time.c
>> +++ b/arch/powerpc/kernel/time.c
>> @@ -594,6 +594,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
>>   
>>   	if (test_irq_work_pending()) {
>>   		clear_irq_work_pending();
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +		mce_run_late_handlers();
>> +#endif
> Maybe create a no-op inline function for others and call unconditionally
> here. I wonder if the name could be better, we have lots of handlers, of
> varying earliness. real-mode, then virt mode NMI context, then IRQ
> context, then workqueue context.
>
> mce_run_irq_context_handlers() might not be much better though.
>
>>   		irq_work_run();
>>   	}
>>   
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index 3544778e06d0..9cf0d33dfbf5 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -21,6 +21,7 @@ struct pt_regs;
>>   extern int pSeries_system_reset_exception(struct pt_regs *regs);
>>   extern int pSeries_machine_check_exception(struct pt_regs *regs);
>>   extern long pseries_machine_check_realmode(struct pt_regs *regs);
>> +void pSeries_machine_check_log_err(void);
>>   
>>   #ifdef CONFIG_SMP
>>   extern void smp_init_pseries(void);
>> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
>> index 56092dccfdb8..8613f9cc5798 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -23,11 +23,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock);
>>   
>>   static int ras_check_exception_token;
>>   
>> -static void mce_process_errlog_event(struct irq_work *work);
>> -static struct irq_work mce_errlog_process_work = {
>> -	.func = mce_process_errlog_event,
>> -};
>> -
>>   #define EPOW_SENSOR_TOKEN	9
>>   #define EPOW_SENSOR_INDEX	0
>>   
>> @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>>   	error_type = mce_log->error_type;
>>   
>>   	disposition = mce_handle_err_realmode(disposition, error_type);
>> -
>> -	/*
>> -	 * Enable translation as we will be accessing per-cpu variables
>> -	 * in save_mce_event() which may fall outside RMO region, also
>> -	 * leave it enabled because subsequently we will be queuing work
>> -	 * to workqueues where again per-cpu variables accessed, besides
>> -	 * fwnmi_release_errinfo() crashes when called in realmode on
>> -	 * pseries.
>> -	 * Note: All the realmode handling like flushing SLB entries for
>> -	 *       SLB multihit is done by now.
>> -	 */
>>   out:
>> -	msr = mfmsr();
>> -	mtmsr(msr | MSR_IR | MSR_DR);
>> -
>>   	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>>   					      disposition);
>> -
>> -	/*
>> -	 * Queue irq work to log this rtas event later.
>> -	 * irq_work_queue uses per-cpu variables, so do this in virt
>> -	 * mode as well.
>> -	 */
>> -	irq_work_queue(&mce_errlog_process_work);
>> -
>> -	mtmsr(msr);
>> -
>>   	return disposition;
>>   }
>>   
>>   /*
>>    * Process MCE rtas errlog event.
>>    */
>> -static void mce_process_errlog_event(struct irq_work *work)
>> +void pSeries_machine_check_log_err(void)
>>   {
>>   	struct rtas_error_log *err;
>>   
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index 8a62af5b9c24..9bdc487b8e35 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -1084,6 +1084,7 @@ define_machine(pseries) {
>>   	.system_reset_exception = pSeries_system_reset_exception,
>>   	.machine_check_early	= pseries_machine_check_realmode,
>>   	.machine_check_exception = pSeries_machine_check_exception,
>> +	.machine_check_log_err	= pSeries_machine_check_log_err,
>>   #ifdef CONFIG_KEXEC_CORE
>>   	.machine_kexec          = pSeries_machine_kexec,
>>   	.kexec_cpu_down         = pseries_kexec_cpu_down,
>> -- 
>> 2.31.1
>>
>>
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 9c3c9f04129f..d22b222ba471 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -99,6 +99,8 @@  struct machdep_calls {
 	/* Called during machine check exception to retrive fixup address. */
 	bool		(*mce_check_early_recovery)(struct pt_regs *regs);
 
+	void            (*machine_check_log_err)(void);
+
 	/* Motherboard/chipset features. This is a kind of general purpose
 	 * hook used to control some machine specific features (like reset
 	 * lines, chip power control, etc...).
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 331d944280b8..6e306aaf58aa 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -235,8 +235,10 @@  extern void machine_check_print_event_info(struct machine_check_event *evt,
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 extern void mce_common_process_ue(struct pt_regs *regs,
 				  struct mce_error_info *mce_err);
+void machine_check_raise_dec_intr(void);
 int mce_register_notifier(struct notifier_block *nb);
 int mce_unregister_notifier(struct notifier_block *nb);
+void mce_run_late_handlers(void);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 void flush_erat(void);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index dc05a862e72a..d463c796f7fa 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -280,6 +280,7 @@  struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct mce_info *mce_info;
+	u32 mces_to_process;
 #endif /* CONFIG_PPC_BOOK3S_64 */
 } ____cacheline_aligned;
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd829f7f25a4..8e17f29472a0 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -28,19 +28,9 @@ 
 
 #include "setup.h"
 
-static void machine_check_process_queued_event(struct irq_work *work);
-static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
-static struct irq_work mce_event_process_work = {
-        .func = machine_check_process_queued_event,
-};
-
-static struct irq_work mce_ue_event_irq_work = {
-	.func = machine_check_ue_irq_work,
-};
-
 static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
@@ -89,6 +79,12 @@  static void mce_set_error_info(struct machine_check_event *mce,
 	}
 }
 
+/* Raise decrementer interrupt */
+void machine_check_raise_dec_intr(void)
+{
+	arch_irq_work_raise();
+}
+
 /*
  * Decode and save high level MCE information into per cpu buffer which
  * is an array of machine_check_event structure.
@@ -135,6 +131,8 @@  void save_mce_event(struct pt_regs *regs, long handled,
 	if (mce->error_type == MCE_ERROR_TYPE_UE)
 		mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+	local_paca->mces_to_process++;
+
 	if (!addr)
 		return;
 
@@ -217,7 +215,7 @@  void release_mce_event(void)
 	get_mce_event(NULL, true);
 }
 
-static void machine_check_ue_irq_work(struct irq_work *work)
+static void machine_check_ue_work(void)
 {
 	schedule_work(&mce_ue_event_work);
 }
@@ -239,7 +237,7 @@  static void machine_check_ue_event(struct machine_check_event *evt)
 	       evt, sizeof(*evt));
 
 	/* Queue work to process this event later. */
-	irq_work_queue(&mce_ue_event_irq_work);
+	machine_check_raise_dec_intr();
 }
 
 /*
@@ -249,7 +247,6 @@  void machine_check_queue_event(void)
 {
 	int index;
 	struct machine_check_event evt;
-	unsigned long msr;
 
 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
 		return;
@@ -263,20 +260,7 @@  void machine_check_queue_event(void)
 	memcpy(&local_paca->mce_info->mce_event_queue[index],
 	       &evt, sizeof(evt));
 
-	/*
-	 * Queue irq work to process this event later. Before
-	 * queuing the work enable translation for non radix LPAR,
-	 * as irq_work_queue may try to access memory outside RMO
-	 * region.
-	 */
-	if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) {
-		msr = mfmsr();
-		mtmsr(msr | MSR_IR | MSR_DR);
-		irq_work_queue(&mce_event_process_work);
-		mtmsr(msr);
-	} else {
-		irq_work_queue(&mce_event_process_work);
-	}
+	machine_check_raise_dec_intr();
 }
 
 void mce_common_process_ue(struct pt_regs *regs,
@@ -338,7 +322,7 @@  static void machine_process_ue_event(struct work_struct *work)
  * process pending MCE event from the mce event queue. This function will be
  * called during syscall exit.
  */
-static void machine_check_process_queued_event(struct irq_work *work)
+static void machine_check_process_queued_event(void)
 {
 	int index;
 	struct machine_check_event *evt;
@@ -363,6 +347,17 @@  static void machine_check_process_queued_event(struct irq_work *work)
 	}
 }
 
+void mce_run_late_handlers(void)
+{
+	if (unlikely(local_paca->mces_to_process)) {
+		if (ppc_md.machine_check_log_err)
+			ppc_md.machine_check_log_err();
+		machine_check_process_queued_event();
+		machine_check_ue_work();
+		local_paca->mces_to_process--;
+	}
+}
+
 void machine_check_print_event_info(struct machine_check_event *evt,
 				    bool user_mode, bool in_guest)
 {
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index cae8f03a44fe..94c591b6f9d2 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -594,6 +594,9 @@  DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
 
 	if (test_irq_work_pending()) {
 		clear_irq_work_pending();
+#ifdef CONFIG_PPC_BOOK3S_64
+		mce_run_late_handlers();
+#endif
 		irq_work_run();
 	}
 
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 3544778e06d0..9cf0d33dfbf5 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -21,6 +21,7 @@  struct pt_regs;
 extern int pSeries_system_reset_exception(struct pt_regs *regs);
 extern int pSeries_machine_check_exception(struct pt_regs *regs);
 extern long pseries_machine_check_realmode(struct pt_regs *regs);
+void pSeries_machine_check_log_err(void);
 
 #ifdef CONFIG_SMP
 extern void smp_init_pseries(void);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 56092dccfdb8..8613f9cc5798 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -23,11 +23,6 @@  static DEFINE_SPINLOCK(ras_log_buf_lock);
 
 static int ras_check_exception_token;
 
-static void mce_process_errlog_event(struct irq_work *work);
-static struct irq_work mce_errlog_process_work = {
-	.func = mce_process_errlog_event,
-};
-
 #define EPOW_SENSOR_TOKEN	9
 #define EPOW_SENSOR_INDEX	0
 
@@ -729,40 +724,16 @@  static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 	error_type = mce_log->error_type;
 
 	disposition = mce_handle_err_realmode(disposition, error_type);
-
-	/*
-	 * Enable translation as we will be accessing per-cpu variables
-	 * in save_mce_event() which may fall outside RMO region, also
-	 * leave it enabled because subsequently we will be queuing work
-	 * to workqueues where again per-cpu variables accessed, besides
-	 * fwnmi_release_errinfo() crashes when called in realmode on
-	 * pseries.
-	 * Note: All the realmode handling like flushing SLB entries for
-	 *       SLB multihit is done by now.
-	 */
 out:
-	msr = mfmsr();
-	mtmsr(msr | MSR_IR | MSR_DR);
-
 	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
 					      disposition);
-
-	/*
-	 * Queue irq work to log this rtas event later.
-	 * irq_work_queue uses per-cpu variables, so do this in virt
-	 * mode as well.
-	 */
-	irq_work_queue(&mce_errlog_process_work);
-
-	mtmsr(msr);
-
 	return disposition;
 }
 
 /*
  * Process MCE rtas errlog event.
  */
-static void mce_process_errlog_event(struct irq_work *work)
+void pSeries_machine_check_log_err(void)
 {
 	struct rtas_error_log *err;
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 8a62af5b9c24..9bdc487b8e35 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -1084,6 +1084,7 @@  define_machine(pseries) {
 	.system_reset_exception = pSeries_system_reset_exception,
 	.machine_check_early	= pseries_machine_check_realmode,
 	.machine_check_exception = pSeries_machine_check_exception,
+	.machine_check_log_err	= pSeries_machine_check_log_err,
 #ifdef CONFIG_KEXEC_CORE
 	.machine_kexec          = pSeries_machine_kexec,
 	.kexec_cpu_down         = pseries_kexec_cpu_down,