diff mbox series

powerpc: Send SIGBUS from machine_check

Message ID 20201001170557.10915-1-joakim.tjernlund@infinera.com (mailing list archive)
State Changes Requested
Headers show
Series powerpc: Send SIGBUS from machine_check | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch warning Failed to apply on branch powerpc/merge (548ccca2a8864b7498ad8cc420fa01aecd4d4114)
snowpatch_ozlabs/apply_patch warning Failed to apply on branch powerpc/next (ebbfeef0d8093a06ff39c60105b6650be3344cbe)
snowpatch_ozlabs/apply_patch warning Failed to apply on branch linus/master (60e720931556fc1034d0981460164dcf02697679)
snowpatch_ozlabs/apply_patch warning Failed to apply on branch powerpc/fixes (0460534b532e5518c657c7d6492b9337d975eaa3)
snowpatch_ozlabs/apply_patch warning Failed to apply on branch linux-next (d39294091fee6b89d9c4a683bb19441b25098330)
snowpatch_ozlabs/apply_patch fail Failed to apply to any branch

Commit Message

Joakim Tjernlund Oct. 1, 2020, 5:05 p.m. UTC
Embedded PPC CPU should send SIGBUS to user space when applicable.

Signed-off-by: Joakim Tjernlund <joakim.tjernlund@infinera.com>
---
 arch/powerpc/kernel/traps.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

Comments

Joakim Tjernlund Oct. 22, 2020, 11:19 a.m. UTC | #1
ping

Also Cc: stable@vger.kernel.org

On Thu, 2020-10-01 at 19:05 +0200, Joakim Tjernlund wrote:
> Embedded PPC CPU should send SIGBUS to user space when applicable.
> 
> Signed-off-by: Joakim Tjernlund <joakim.tjernlund@infinera.com>
> ---
>  arch/powerpc/kernel/traps.c | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 0381242920d9..12715d24141c 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -621,6 +621,11 @@ int machine_check_e500mc(struct pt_regs *regs)
>  		       reason & MCSR_MEA ? "Effective" : "Physical", addr);
>  	}
>  
> 
> +	if ((user_mode(regs))) {
> +		_exception(SIGBUS, regs, reason, regs->nip);
> +		recoverable = 1;
> +	}
> +
>  silent_out:
>  	mtspr(SPRN_MCSR, mcsr);
>  	return mfspr(SPRN_MCSR) == 0 && recoverable;
> @@ -665,6 +670,10 @@ int machine_check_e500(struct pt_regs *regs)
>  	if (reason & MCSR_BUS_RPERR)
>  		printk("Bus - Read Parity Error\n");
>  
> 
> +	if ((user_mode(regs))) {
> +		_exception(SIGBUS, regs, reason, regs->nip);
> +		return 1;
> +	}
>  	return 0;
>  }
>  
> 
> @@ -695,6 +704,10 @@ int machine_check_e200(struct pt_regs *regs)
>  	if (reason & MCSR_BUS_WRERR)
>  		printk("Bus - Write Bus Error on buffered store or cache line push\n");
>  
> 
> +	if ((user_mode(regs))) {
> +		_exception(SIGBUS, regs, reason, regs->nip);
> +		return 1;
> +	}
>  	return 0;
>  }
>  #elif defined(CONFIG_PPC32)
> @@ -731,6 +744,10 @@ int machine_check_generic(struct pt_regs *regs)
>  	default:
>  		printk("Unknown values in msr\n");
>  	}
> +	if ((user_mode(regs))) {
> +		_exception(SIGBUS, regs, reason, regs->nip);
> +		return 1;
> +	}
>  	return 0;
>  }
>  #endif /* everything else */
Michael Ellerman Oct. 23, 2020, 12:57 a.m. UTC | #2
Joakim Tjernlund <joakim.tjernlund@infinera.com> writes:
> Embedded PPC CPU should send SIGBUS to user space when applicable.

Yeah, but it's not clear that it's applicable in all cases.

At least I need some reasoning for why it's safe in all cases below to
just send a SIGBUS and take no other action.

Is there a particular CPU you're working on? Can we start with that and
look at all the machine check causes and which can be safely handled.

Some comments below ...


> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 0381242920d9..12715d24141c 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -621,6 +621,11 @@ int machine_check_e500mc(struct pt_regs *regs)

At the beginning of the function we have:

	printk("Machine check in kernel mode.\n");

Which should be updated.

>  		       reason & MCSR_MEA ? "Effective" : "Physical", addr);
>  	}
>  
> +	if ((user_mode(regs))) {
> +		_exception(SIGBUS, regs, reason, regs->nip);
> +		recoverable = 1;
> +	}

For most of the error causes we take no action and set recoverable = 0.

Then you just declare that it is recoverable because it hit in
userspace. Depending on the cause that might be OK, but it's not
obviously correct in all cases.


> +
>  silent_out:
>  	mtspr(SPRN_MCSR, mcsr);
>  	return mfspr(SPRN_MCSR) == 0 && recoverable;
> @@ -665,6 +670,10 @@ int machine_check_e500(struct pt_regs *regs)

Same comment about the printk().

>  	if (reason & MCSR_BUS_RPERR)
>  		printk("Bus - Read Parity Error\n");
>  
> +	if ((user_mode(regs))) {
> +		_exception(SIGBUS, regs, reason, regs->nip);
> +		return 1;
> +	}

And same comment more or less.

Other than the MCSR_BUS_RBERR cases that are explicitly checked, the
function does nothing to clear the cause of the machine check.

>  	return 0;
>  }
>  
> @@ -695,6 +704,10 @@ int machine_check_e200(struct pt_regs *regs)
>  	if (reason & MCSR_BUS_WRERR)
>  		printk("Bus - Write Bus Error on buffered store or cache line push\n");
>  
> +	if ((user_mode(regs))) {
> +		_exception(SIGBUS, regs, reason, regs->nip);
> +		return 1;
> +	}

Same.

>  	return 0;
>  }
>  #elif defined(CONFIG_PPC32)
> @@ -731,6 +744,10 @@ int machine_check_generic(struct pt_regs *regs)
>  	default:
>  		printk("Unknown values in msr\n");
>  	}
> +	if ((user_mode(regs))) {
> +		_exception(SIGBUS, regs, reason, regs->nip);
> +		return 1;
> +	}

Same.

>  	return 0;
>  }
>  #endif /* everything else */
> -- 
> 2.26.2


cheers
Joakim Tjernlund Oct. 23, 2020, 9:23 a.m. UTC | #3
On Fri, 2020-10-23 at 11:57 +1100, Michael Ellerman wrote:
> 
> 
> Joakim Tjernlund <joakim.tjernlund@infinera.com> writes:
> > Embedded PPC CPU should send SIGBUS to user space when applicable.
> 
> Yeah, but it's not clear that it's applicable in all cases.
> 
> At least I need some reasoning for why it's safe in all cases below to
> just send a SIGBUS and take no other action.

For me this came from an User SDK accessing a PCI device(also using PCI IRQs) and this
SDK did some strange stuff during shutdown which disabled the device before SW was done.
This caused PCI accesses, both from User Space and kernel PCI IRQs access) to the device
which caused an Machine Check(PCI transfer failed). Without this patch, the kernel
would just OOPS and hang/do strange things even for an access made by User space.
Now the User app just gets a SIGBUS and the kernel still works as it should.

Perhaps a SIGBUS and recover isn't right in all cases but without it there will be a
system break down.


> Is there a particular CPU you're working on? Can we start with that and
> look at all the machine check causes and which can be safely handled.

This was a T1042(e5500) but we have e500 and mpc832x as well.

> 
> Some comments below ...
> 
> 
> > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> > index 0381242920d9..12715d24141c 100644
> > --- a/arch/powerpc/kernel/traps.c
> > +++ b/arch/powerpc/kernel/traps.c
> > @@ -621,6 +621,11 @@ int machine_check_e500mc(struct pt_regs *regs)
> 
> At the beginning of the function we have:
> 
>         printk("Machine check in kernel mode.\n");
> 
> Which should be updated.

Sure, just remove the "in kernel mode" perhaps?

> 
> >                      reason & MCSR_MEA ? "Effective" : "Physical", addr);
> >       }
> > 
> > +     if ((user_mode(regs))) {
> > +             _exception(SIGBUS, regs, reason, regs->nip);
> > +             recoverable = 1;
> > +     }
> 
> For most of the error causes we take no action and set recoverable = 0.
> 
> Then you just declare that it is recoverable because it hit in
> userspace. Depending on the cause that might be OK, but it's not
> obviously correct in all cases.

Not so familiar with PPC that I can make out what is OK or not.
I do think you stand a better chance now that before though.  

> 
> 
> > +
> >  silent_out:
> >       mtspr(SPRN_MCSR, mcsr);
> >       return mfspr(SPRN_MCSR) == 0 && recoverable;
> > @@ -665,6 +670,10 @@ int machine_check_e500(struct pt_regs *regs)
> 
> Same comment about the printk().
> 
> >       if (reason & MCSR_BUS_RPERR)
> >               printk("Bus - Read Parity Error\n");
> > 
> > +     if ((user_mode(regs))) {
> > +             _exception(SIGBUS, regs, reason, regs->nip);
> > +             return 1;
> > +     }
> 
> And same comment more or less.
> 
> Other than the MCSR_BUS_RBERR cases that are explicitly checked, the
> function does nothing to clear the cause of the machine check.
> 
> >       return 0;
> >  }
> > 
> > @@ -695,6 +704,10 @@ int machine_check_e200(struct pt_regs *regs)
> >       if (reason & MCSR_BUS_WRERR)
> >               printk("Bus - Write Bus Error on buffered store or cache line push\n");
> > 
> > +     if ((user_mode(regs))) {
> > +             _exception(SIGBUS, regs, reason, regs->nip);
> > +             return 1;
> > +     }
> 
> Same.
> 
> >       return 0;
> >  }
> >  #elif defined(CONFIG_PPC32)
> > @@ -731,6 +744,10 @@ int machine_check_generic(struct pt_regs *regs)
> >       default:
> >               printk("Unknown values in msr\n");
> >       }
> > +     if ((user_mode(regs))) {
> > +             _exception(SIGBUS, regs, reason, regs->nip);
> > +             return 1;
> > +     }
> 
> Same.
> 
> >       return 0;
> >  }
> >  #endif /* everything else */
> > --
> > 2.26.2
> 
> 
> cheers
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 0381242920d9..12715d24141c 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -621,6 +621,11 @@  int machine_check_e500mc(struct pt_regs *regs)
 		       reason & MCSR_MEA ? "Effective" : "Physical", addr);
 	}
 
+	if ((user_mode(regs))) {
+		_exception(SIGBUS, regs, reason, regs->nip);
+		recoverable = 1;
+	}
+
 silent_out:
 	mtspr(SPRN_MCSR, mcsr);
 	return mfspr(SPRN_MCSR) == 0 && recoverable;
@@ -665,6 +670,10 @@  int machine_check_e500(struct pt_regs *regs)
 	if (reason & MCSR_BUS_RPERR)
 		printk("Bus - Read Parity Error\n");
 
+	if ((user_mode(regs))) {
+		_exception(SIGBUS, regs, reason, regs->nip);
+		return 1;
+	}
 	return 0;
 }
 
@@ -695,6 +704,10 @@  int machine_check_e200(struct pt_regs *regs)
 	if (reason & MCSR_BUS_WRERR)
 		printk("Bus - Write Bus Error on buffered store or cache line push\n");
 
+	if ((user_mode(regs))) {
+		_exception(SIGBUS, regs, reason, regs->nip);
+		return 1;
+	}
 	return 0;
 }
 #elif defined(CONFIG_PPC32)
@@ -731,6 +744,10 @@  int machine_check_generic(struct pt_regs *regs)
 	default:
 		printk("Unknown values in msr\n");
 	}
+	if ((user_mode(regs))) {
+		_exception(SIGBUS, regs, reason, regs->nip);
+		return 1;
+	}
 	return 0;
 }
 #endif /* everything else */