diff mbox series

[5/5] powerpc/64s: Fix unrecoverable MCE calling async handler from NMI

Message ID 20211004145642.1331214-6-npiggin@gmail.com (mailing list archive)
State Accepted
Headers show
Series powerpc: various interrupt handling fixes | expand

Checks

Context Check Description
snowpatch_ozlabs/github-powerpc_ppctests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_selftests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_kernel_qemu success Successfully ran 24 jobs.
snowpatch_ozlabs/github-powerpc_sparse success Successfully ran 4 jobs.
snowpatch_ozlabs/github-powerpc_clang success Successfully ran 7 jobs.

Commit Message

Nicholas Piggin Oct. 4, 2021, 2:56 p.m. UTC
The machine check handler is not considered NMI on 64s. The early
handler is the true NMI handler, and then it schedules the
machine_check_exception handler to run when interrupts are enabled.

This works fine except the case of an unrecoverable MCE, where the true
NMI is taken when MSR[RI] is clear, it can not recover, so it calls
machine_check_exception directly so something might be done about it.

Calling an async handler from NMI context can result in irq state and
other things getting corrupted. This can also trigger the BUG at
  arch/powerpc/include/asm/interrupt.h:168
  BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));

Fix this by making an _async version of the handler which is called
in the normal case, and a NMI version that is called for unrecoverable
interrupts.

Fixes: 2b43dd7653cc ("powerpc/64: enable MSR[EE] in irq replay pt_regs")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h |  5 ++---
 arch/powerpc/kernel/exceptions-64s.S |  8 +++++--
 arch/powerpc/kernel/traps.c          | 31 ++++++++++++++++------------
 3 files changed, 26 insertions(+), 18 deletions(-)

Comments

Cédric Le Goater Oct. 4, 2021, 3:28 p.m. UTC | #1
On 10/4/21 16:56, Nicholas Piggin wrote:
> The machine check handler is not considered NMI on 64s. The early
> handler is the true NMI handler, and then it schedules the
> machine_check_exception handler to run when interrupts are enabled.
> 
> This works fine except the case of an unrecoverable MCE, where the true
> NMI is taken when MSR[RI] is clear, it can not recover, so it calls
> machine_check_exception directly so something might be done about it.
> 
> Calling an async handler from NMI context can result in irq state and
> other things getting corrupted. This can also trigger the BUG at
>    arch/powerpc/include/asm/interrupt.h:168
>    BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));

I was hitting this problem when I rebooted a P8 tuleta system and
this series fixes it.

Tested-by: Cédric Le Goater <clg@kaod.org>

Thanks,

C.
  
> Fix this by making an _async version of the handler which is called
> in the normal case, and a NMI version that is called for unrecoverable
> interrupts.
> 
> Fixes: 2b43dd7653cc ("powerpc/64: enable MSR[EE] in irq replay pt_regs")
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>> ---
>   arch/powerpc/include/asm/interrupt.h |  5 ++---
>   arch/powerpc/kernel/exceptions-64s.S |  8 +++++--
>   arch/powerpc/kernel/traps.c          | 31 ++++++++++++++++------------
>   3 files changed, 26 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
> index b894b7169706..a1d238255f07 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -528,10 +528,9 @@ static __always_inline long ____##func(struct pt_regs *regs)
>   /* kernel/traps.c */
>   DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
>   #ifdef CONFIG_PPC_BOOK3S_64
> -DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception);
> -#else
> -DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
> +DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async);
>   #endif
> +DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
>   DECLARE_INTERRUPT_HANDLER(SMIException);
>   DECLARE_INTERRUPT_HANDLER(handle_hmi_exception);
>   DECLARE_INTERRUPT_HANDLER(unknown_exception);
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 024d9231f88c..eaf1f72131a1 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -1243,7 +1243,7 @@ EXC_COMMON_BEGIN(machine_check_common)
>   	li	r10,MSR_RI
>   	mtmsrd 	r10,1
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
> -	bl	machine_check_exception
> +	bl	machine_check_exception_async
>   	b	interrupt_return_srr
>   
>   
> @@ -1303,7 +1303,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>   	subi	r12,r12,1
>   	sth	r12,PACA_IN_MCE(r13)
>   
> -	/* Invoke machine_check_exception to print MCE event and panic. */
> +	/*
> +	 * Invoke machine_check_exception to print MCE event and panic.
> +	 * This is the NMI version of the handler because we are called from
> +	 * the early handler which is a true NMI.
> +	 */
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	machine_check_exception
>   
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index e453b666613b..11741703d26e 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -796,24 +796,22 @@ void die_mce(const char *str, struct pt_regs *regs, long err)
>   	 * do_exit() checks for in_interrupt() and panics in that case, so
>   	 * exit the irq/nmi before calling die.
>   	 */
> -	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
> -		irq_exit();
> -	else
> +	if (in_nmi())
>   		nmi_exit();
> +	else
> +		irq_exit();
>   	die(str, regs, err);
>   }
>   
>   /*
> - * BOOK3S_64 does not call this handler as a non-maskable interrupt
> + * BOOK3S_64 does not usually call this handler as a non-maskable interrupt
>    * (it uses its own early real-mode handler to handle the MCE proper
>    * and then raises irq_work to call this handler when interrupts are
> - * enabled).
> + * enabled). The only time when this is not true is if the early handler
> + * is unrecoverable, then it does call this directly to try to get a
> + * message out.
>    */
> -#ifdef CONFIG_PPC_BOOK3S_64
> -DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
> -#else
> -DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> -#endif
> +static void __machine_check_exception(struct pt_regs *regs)
>   {
>   	int recover = 0;
>   
> @@ -847,12 +845,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
>   	/* Must die if the interrupt is not recoverable */
>   	if (regs_is_unrecoverable(regs))
>   		die_mce("Unrecoverable Machine check", regs, SIGBUS);
> +}
>   
>   #ifdef CONFIG_PPC_BOOK3S_64
> -	return;
> -#else
> -	return 0;
> +DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async)
> +{
> +	__machine_check_exception(regs);
> +}
>   #endif
> +DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> +{
> +	__machine_check_exception(regs);
> +
> +	return 0;
>   }
>   
>   DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */
>
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index b894b7169706..a1d238255f07 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -528,10 +528,9 @@  static __always_inline long ____##func(struct pt_regs *regs)
 /* kernel/traps.c */
 DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
 #ifdef CONFIG_PPC_BOOK3S_64
-DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception);
-#else
-DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
+DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async);
 #endif
+DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
 DECLARE_INTERRUPT_HANDLER(SMIException);
 DECLARE_INTERRUPT_HANDLER(handle_hmi_exception);
 DECLARE_INTERRUPT_HANDLER(unknown_exception);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 024d9231f88c..eaf1f72131a1 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1243,7 +1243,7 @@  EXC_COMMON_BEGIN(machine_check_common)
 	li	r10,MSR_RI
 	mtmsrd 	r10,1
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	machine_check_exception
+	bl	machine_check_exception_async
 	b	interrupt_return_srr
 
 
@@ -1303,7 +1303,11 @@  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	subi	r12,r12,1
 	sth	r12,PACA_IN_MCE(r13)
 
-	/* Invoke machine_check_exception to print MCE event and panic. */
+	/*
+	 * Invoke machine_check_exception to print MCE event and panic.
+	 * This is the NMI version of the handler because we are called from
+	 * the early handler which is a true NMI.
+	 */
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_exception
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index e453b666613b..11741703d26e 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -796,24 +796,22 @@  void die_mce(const char *str, struct pt_regs *regs, long err)
 	 * do_exit() checks for in_interrupt() and panics in that case, so
 	 * exit the irq/nmi before calling die.
 	 */
-	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
-		irq_exit();
-	else
+	if (in_nmi())
 		nmi_exit();
+	else
+		irq_exit();
 	die(str, regs, err);
 }
 
 /*
- * BOOK3S_64 does not call this handler as a non-maskable interrupt
+ * BOOK3S_64 does not usually call this handler as a non-maskable interrupt
  * (it uses its own early real-mode handler to handle the MCE proper
  * and then raises irq_work to call this handler when interrupts are
- * enabled).
+ * enabled). The only time when this is not true is if the early handler
+ * is unrecoverable, then it does call this directly to try to get a
+ * message out.
  */
-#ifdef CONFIG_PPC_BOOK3S_64
-DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
-#else
-DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
-#endif
+static void __machine_check_exception(struct pt_regs *regs)
 {
 	int recover = 0;
 
@@ -847,12 +845,19 @@  DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
 	/* Must die if the interrupt is not recoverable */
 	if (regs_is_unrecoverable(regs))
 		die_mce("Unrecoverable Machine check", regs, SIGBUS);
+}
 
 #ifdef CONFIG_PPC_BOOK3S_64
-	return;
-#else
-	return 0;
+DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async)
+{
+	__machine_check_exception(regs);
+}
 #endif
+DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
+{
+	__machine_check_exception(regs);
+
+	return 0;
 }
 
 DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */