diff mbox

[5/7] hmi: Don't cause an unrecoverable HMI if a CPU is asleep

Message ID 1458027237-8926-6-git-send-email-ruscur@russell.cc
State Superseded
Headers show

Commit Message

Russell Currey March 15, 2016, 7:33 a.m. UTC
decode_core_fir looks at the FIR for every CPU.  If the XSCOM read
fails, which it will if the CPU is asleep, the code path results in
raising an unrecoverable HMI.  This is incorrect as if the CPU is
asleep, it's likely not the cause of any problems.

Resolve this by skipping the CPU if it's asleep.

If the sleeping CPU was actually the cause of the HMI and no other
components were found to have errors (i.e other CPUs, NX, CAPP), an
unknown, unrecoverable HMI is raised anyway.  This patch just prevents
unrecoverable errors being thrown when a recoverable component has a HMI
and a CPU happens to be sleeping.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 core/hmi.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Comments

Alistair Popple March 17, 2016, 5:22 a.m. UTC | #1
On Tue, 15 Mar 2016 18:33:55 Russell Currey wrote:
> decode_core_fir looks at the FIR for every CPU.  If the XSCOM read
> fails, which it will if the CPU is asleep, the code path results in
> raising an unrecoverable HMI.  This is incorrect as if the CPU is
> asleep, it's likely not the cause of any problems.
> 
> Resolve this by skipping the CPU if it's asleep.
> 
> If the sleeping CPU was actually the cause of the HMI and no other
> components were found to have errors (i.e other CPUs, NX, CAPP), an
> unknown, unrecoverable HMI is raised anyway.  This patch just prevents
> unrecoverable errors being thrown when a recoverable component has a HMI
> and a CPU happens to be sleeping.
> 
> Signed-off-by: Russell Currey <ruscur@russell.cc>
> ---
>  core/hmi.c | 13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index 127686f..09bf610 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -299,6 +299,7 @@ static bool decode_core_fir(struct cpu_thread *cpu,
>  	uint32_t core_id;
>  	int i;
>  	bool found = false;
> +	int64_t ret;
>  
>  	/* Sanity check */
>  	if (!cpu || !hmi_evt)
> @@ -307,10 +308,18 @@ static bool decode_core_fir(struct cpu_thread *cpu,
>  	core_id = pir_to_core_id(cpu->pir);
>  
>  	/* Get CORE FIR register value. */
> -	if (xscom_read(cpu->chip_id, XSCOM_ADDR_P8_EX(core_id, CORE_FIR),
> -							&core_fir) != 0) {
> +	ret = xscom_read(cpu->chip_id, XSCOM_ADDR_P8_EX(core_id, CORE_FIR),
> +			 &core_fir);
> +
> +	if (ret == OPAL_HARDWARE) {
>  		prerror("HMI: XSCOM error reading CORE FIR\n");
>  		return false;
> +	} else if (ret == OPAL_WRONG_STATE) {
> +		/* CPU is asleep, so it probably didn't cause the checkstop */

It might be worth expanding this comment with an explanation of what happens 
if this core is the cause of the checkstop.

> +		prlog(PR_DEBUG,
> +		      "HMI: FIR read failed, chip %d core %d asleep\n",
> +		      cpu->chip_id, core_id);
> +		return true;

Doesn't returning true here cause a HMI event to be raised by 
queue_hmi_event() in find_core_checkstop_reason()? It looks like 
decode_core_fir() returns true when a core FIR is set so I don't see how 
callers could differentiate each case?

>  	}
>  
>  	prlog(PR_INFO, "HMI: CHIP ID: %x, CORE ID: %x, FIR: %016llx\n",
>
Russell Currey March 17, 2016, 5:27 a.m. UTC | #2
On Thu, 2016-03-17 at 16:22 +1100, Alistair Popple wrote:
> On Tue, 15 Mar 2016 18:33:55 Russell Currey wrote:
> > 
> > decode_core_fir looks at the FIR for every CPU.  If the XSCOM read
> > fails, which it will if the CPU is asleep, the code path results in
> > raising an unrecoverable HMI.  This is incorrect as if the CPU is
> > asleep, it's likely not the cause of any problems.
> > 
> > Resolve this by skipping the CPU if it's asleep.
> > 
> > If the sleeping CPU was actually the cause of the HMI and no other
> > components were found to have errors (i.e other CPUs, NX, CAPP), an
> > unknown, unrecoverable HMI is raised anyway.  This patch just prevents
> > unrecoverable errors being thrown when a recoverable component has a
> > HMI
> > and a CPU happens to be sleeping.
> > 
> > Signed-off-by: Russell Currey <ruscur@russell.cc>
> > ---
> >  core/hmi.c | 13 +++++++++++--
> >  1 file changed, 11 insertions(+), 2 deletions(-)
> > 
> > diff --git a/core/hmi.c b/core/hmi.c
> > index 127686f..09bf610 100644
> > --- a/core/hmi.c
> > +++ b/core/hmi.c
> > @@ -299,6 +299,7 @@ static bool decode_core_fir(struct cpu_thread *cpu,
> >  	uint32_t core_id;
> >  	int i;
> >  	bool found = false;
> > +	int64_t ret;
> >  
> >  	/* Sanity check */
> >  	if (!cpu || !hmi_evt)
> > @@ -307,10 +308,18 @@ static bool decode_core_fir(struct cpu_thread
> > *cpu,
> >  	core_id = pir_to_core_id(cpu->pir);
> >  
> >  	/* Get CORE FIR register value. */
> > -	if (xscom_read(cpu->chip_id, XSCOM_ADDR_P8_EX(core_id,
> > CORE_FIR),
> > -							&core_fir) !=
> > 0) {
> > +	ret = xscom_read(cpu->chip_id, XSCOM_ADDR_P8_EX(core_id,
> > CORE_FIR),
> > +			 &core_fir);
> > +
> > +	if (ret == OPAL_HARDWARE) {
> >  		prerror("HMI: XSCOM error reading CORE FIR\n");
> >  		return false;
> > +	} else if (ret == OPAL_WRONG_STATE) {
> > +		/* CPU is asleep, so it probably didn't cause the
> > checkstop */
> It might be worth expanding this comment with an explanation of what
> happens 
> if this core is the cause of the checkstop.
> 
> > 
> > +		prlog(PR_DEBUG,
> > +		      "HMI: FIR read failed, chip %d core %d
> > asleep\n",
> > +		      cpu->chip_id, core_id);
> > +		return true;
> Doesn't returning true here cause a HMI event to be raised by 
> queue_hmi_event() in find_core_checkstop_reason()? It looks like 
> decode_core_fir() returns true when a core FIR is set so I don't see how 
> callers could differentiate each case?
You're right, it should return false, I got confused.  This should return
false and the XSCOM read error case should return true and raise a HMI of
some kind, there's no way that should *just* be an error message.
> 
> > 
> >  	}
> >  
> >  	prlog(PR_INFO, "HMI: CHIP ID: %x, CORE ID: %x, FIR:
> > %016llx\n",
> >
diff mbox

Patch

diff --git a/core/hmi.c b/core/hmi.c
index 127686f..09bf610 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -299,6 +299,7 @@  static bool decode_core_fir(struct cpu_thread *cpu,
 	uint32_t core_id;
 	int i;
 	bool found = false;
+	int64_t ret;
 
 	/* Sanity check */
 	if (!cpu || !hmi_evt)
@@ -307,10 +308,18 @@  static bool decode_core_fir(struct cpu_thread *cpu,
 	core_id = pir_to_core_id(cpu->pir);
 
 	/* Get CORE FIR register value. */
-	if (xscom_read(cpu->chip_id, XSCOM_ADDR_P8_EX(core_id, CORE_FIR),
-							&core_fir) != 0) {
+	ret = xscom_read(cpu->chip_id, XSCOM_ADDR_P8_EX(core_id, CORE_FIR),
+			 &core_fir);
+
+	if (ret == OPAL_HARDWARE) {
 		prerror("HMI: XSCOM error reading CORE FIR\n");
 		return false;
+	} else if (ret == OPAL_WRONG_STATE) {
+		/* CPU is asleep, so it probably didn't cause the checkstop */
+		prlog(PR_DEBUG,
+		      "HMI: FIR read failed, chip %d core %d asleep\n",
+		      cpu->chip_id, core_id);
+		return true;
 	}
 
 	prlog(PR_INFO, "HMI: CHIP ID: %x, CORE ID: %x, FIR: %016llx\n",