diff mbox series

dts: spl_wakeup: Remove all workarounds in the spl wakeup logic

Message ID 1520572652-4185-1-git-send-email-shilpa.bhat@linux.vnet.ibm.com
State Accepted
Headers show
Series dts: spl_wakeup: Remove all workarounds in the spl wakeup logic | expand

Commit Message

Shilpasri G Bhat March 9, 2018, 5:17 a.m. UTC
We coded few workarounds in special wakeup logic to handle the
buggy firmware. Now that is fixed remove them as they break the
special wakeup protocol. As per the spec we should not de-assert
beofre assert is complete. So follow this protocol.

Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
---
 core/direct-controls.c | 59 +++++++++++++++++++++++++-------------------------
 hw/dts.c               | 30 +------------------------
 2 files changed, 30 insertions(+), 59 deletions(-)

Comments

Vaidyanathan Srinivasan March 9, 2018, 6:56 a.m. UTC | #1
* Shilpa Bhat <shilpa.bhat@linux.vnet.ibm.com> [2018-03-09 10:47:32]:

> We coded few workarounds in special wakeup logic to handle the
> buggy firmware. Now that is fixed remove them as they break the
> special wakeup protocol. As per the spec we should not de-assert
> beofre assert is complete. So follow this protocol.
> 
> Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Tested-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>



> ---
>  core/direct-controls.c | 59 +++++++++++++++++++++++++-------------------------
>  hw/dts.c               | 30 +------------------------
>  2 files changed, 30 insertions(+), 59 deletions(-)
> 
> diff --git a/core/direct-controls.c b/core/direct-controls.c
> index 45a4008..c7b9c9b 100644
> --- a/core/direct-controls.c
> +++ b/core/direct-controls.c
> @@ -311,7 +311,7 @@ static int p9_core_set_special_wakeup(struct cpu_thread *cpu)
>  		prlog(PR_ERR, "Could not set special wakeup on %u:%u:"
>  				" Unable to write PPM_SPECIAL_WKUP_HYP.\n",
>  				chip_id, core_id);
> -		return OPAL_HARDWARE;
> +		goto out_fail;
>  	}
>  
>  	for (i = 0; i < P9_SPWKUP_TIMEOUT / P9_SPWKUP_POLL_INTERVAL; i++) {
> @@ -321,9 +321,22 @@ static int p9_core_set_special_wakeup(struct cpu_thread *cpu)
>  					chip_id, core_id);
>  			goto out_fail;
>  		}
> -		if (val & P9_SPECIAL_WKUP_DONE)
> -			return 0;
> -
> +		if (val & P9_SPECIAL_WKUP_DONE) {
> +			/*
> +			 * CORE_GATED will be unset on a successful special
> +			 * wakeup of the core which indicates that the core is
> +			 * out of stop state. If CORE_GATED is still set then
> +			 * raise error.
> +			 */
> +			if (dctl_core_is_gated(cpu)) {
> +				prlog(PR_ERR, "Failed special wakeup on %u:%u"
> +						" as CORE_GATED is set\n",
> +						chip_id, core_id);
> +				goto out_fail;
> +			} else {
> +				return 0;
> +			}
> +		}
>  		time_wait_us(P9_SPWKUP_POLL_INTERVAL);
>  	}
>  
> @@ -332,10 +345,11 @@ static int p9_core_set_special_wakeup(struct cpu_thread *cpu)
>  			chip_id, core_id);
>  
>  out_fail:
> -	/* De-assert special wakeup after a small delay. */
> -	time_wait_us(1);
> -	xscom_write(chip_id, swake_addr, 0);
> -
> +	/*
> +	 * As per the special wakeup protocol we should not de-assert
> +	 * the special wakeup on the core until WAKEUP_DONE is set.
> +	 * So even on error do not de-assert.
> +	 */
>  	return OPAL_HARDWARE;


Recent learning around this issue suggest that software should not
clear special wakeup until it is set.  Otherwise the special wakeup
state machine gets into more errors and blocks the original failing
condition.

In ideal cases this should never happen.  There are fixes in microcode
to harden this area.  Hence as requested by hardware and microcode
team, leaving the special wakeup state machine as is on timeout.

Next attempt may succeed or fail, but will continue to leave the state
machine at the first error/failing state for debug by hardware team.

--Vaidy
Stewart Smith March 15, 2018, 9:13 a.m. UTC | #2
Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
> We coded few workarounds in special wakeup logic to handle the
> buggy firmware. Now that is fixed remove them as they break the
> special wakeup protocol. As per the spec we should not de-assert
> beofre assert is complete. So follow this protocol.
>
> Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
> ---
>  core/direct-controls.c | 59 +++++++++++++++++++++++++-------------------------
>  hw/dts.c               | 30 +------------------------
>  2 files changed, 30 insertions(+), 59 deletions(-)

great! Merged to master as of b5c9d09d067743be6ab12eef49f80562f0d41f02

Is it worth trying re-enabling stop4 and stop5 now? Was this one of the
issues causing failures there?
Vaidyanathan Srinivasan March 15, 2018, 10:14 a.m. UTC | #3
* Stewart Smith <stewart@linux.vnet.ibm.com> [2018-03-15 20:13:45]:

> Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
> > We coded few workarounds in special wakeup logic to handle the
> > buggy firmware. Now that is fixed remove them as they break the
> > special wakeup protocol. As per the spec we should not de-assert
> > beofre assert is complete. So follow this protocol.
> >
> > Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
> > ---
> >  core/direct-controls.c | 59 +++++++++++++++++++++++++-------------------------
> >  hw/dts.c               | 30 +------------------------
> >  2 files changed, 30 insertions(+), 59 deletions(-)
> 
> great! Merged to master as of b5c9d09d067743be6ab12eef49f80562f0d41f02
> 
> Is it worth trying re-enabling stop4 and stop5 now? Was this one of the
> issues causing failures there?

This patch helped capture CME/microcode issues at the first fail, then
root-caused and fixed. The HCODE has to roll out into a release before
we can re-enable stop4/5.  Special wakeup with stop4/5 has been stress
tested with the new HCODE that is yet to release.

Further we are removing DTS reads in favour of OCC sensors in the
other patch to avoid poking cores.

--Vaidy
Stewart Smith March 15, 2018, 8:59 p.m. UTC | #4
Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> writes:
> * Stewart Smith <stewart@linux.vnet.ibm.com> [2018-03-15 20:13:45]:
>
>> Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
>> > We coded few workarounds in special wakeup logic to handle the
>> > buggy firmware. Now that is fixed remove them as they break the
>> > special wakeup protocol. As per the spec we should not de-assert
>> > beofre assert is complete. So follow this protocol.
>> >
>> > Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
>> > ---
>> >  core/direct-controls.c | 59 +++++++++++++++++++++++++-------------------------
>> >  hw/dts.c               | 30 +------------------------
>> >  2 files changed, 30 insertions(+), 59 deletions(-)
>> 
>> great! Merged to master as of b5c9d09d067743be6ab12eef49f80562f0d41f02
>> 
>> Is it worth trying re-enabling stop4 and stop5 now? Was this one of the
>> issues causing failures there?
>
> This patch helped capture CME/microcode issues at the first fail, then
> root-caused and fixed. The HCODE has to roll out into a release before
> we can re-enable stop4/5.  Special wakeup with stop4/5 has been stress
> tested with the new HCODE that is yet to release.

Ok.

Corey - do you know which hcode this is and how far away it is from
hitting op-build?

> Further we are removing DTS reads in favour of OCC sensors in the
> other patch to avoid poking cores.

Yeah, I think that ends up making sense, I just have a desire to see the
op-test sensors test pass with poking the OCC sensors and stop 4/5 as
that's been really good at triggering these bugs.
diff mbox series

Patch

diff --git a/core/direct-controls.c b/core/direct-controls.c
index 45a4008..c7b9c9b 100644
--- a/core/direct-controls.c
+++ b/core/direct-controls.c
@@ -311,7 +311,7 @@  static int p9_core_set_special_wakeup(struct cpu_thread *cpu)
 		prlog(PR_ERR, "Could not set special wakeup on %u:%u:"
 				" Unable to write PPM_SPECIAL_WKUP_HYP.\n",
 				chip_id, core_id);
-		return OPAL_HARDWARE;
+		goto out_fail;
 	}
 
 	for (i = 0; i < P9_SPWKUP_TIMEOUT / P9_SPWKUP_POLL_INTERVAL; i++) {
@@ -321,9 +321,22 @@  static int p9_core_set_special_wakeup(struct cpu_thread *cpu)
 					chip_id, core_id);
 			goto out_fail;
 		}
-		if (val & P9_SPECIAL_WKUP_DONE)
-			return 0;
-
+		if (val & P9_SPECIAL_WKUP_DONE) {
+			/*
+			 * CORE_GATED will be unset on a successful special
+			 * wakeup of the core which indicates that the core is
+			 * out of stop state. If CORE_GATED is still set then
+			 * raise error.
+			 */
+			if (dctl_core_is_gated(cpu)) {
+				prlog(PR_ERR, "Failed special wakeup on %u:%u"
+						" as CORE_GATED is set\n",
+						chip_id, core_id);
+				goto out_fail;
+			} else {
+				return 0;
+			}
+		}
 		time_wait_us(P9_SPWKUP_POLL_INTERVAL);
 	}
 
@@ -332,10 +345,11 @@  static int p9_core_set_special_wakeup(struct cpu_thread *cpu)
 			chip_id, core_id);
 
 out_fail:
-	/* De-assert special wakeup after a small delay. */
-	time_wait_us(1);
-	xscom_write(chip_id, swake_addr, 0);
-
+	/*
+	 * As per the special wakeup protocol we should not de-assert
+	 * the special wakeup on the core until WAKEUP_DONE is set.
+	 * So even on error do not de-assert.
+	 */
 	return OPAL_HARDWARE;
 }
 
@@ -344,12 +358,8 @@  static int p9_core_clear_special_wakeup(struct cpu_thread *cpu)
 	uint32_t chip_id = pir_to_chip_id(cpu->pir);
 	uint32_t core_id = pir_to_core_id(cpu->pir);
 	uint32_t swake_addr;
-	uint32_t sshhyp_addr;
-	uint64_t val;
-	int i;
 
 	swake_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, EC_PPM_SPECIAL_WKUP_HYP);
-	sshhyp_addr = XSCOM_ADDR_P9_EC_SLAVE(core_id, P9_EC_PPM_SSHHYP);
 
 	/*
 	 * De-assert special wakeup after a small delay.
@@ -363,25 +373,14 @@  static int p9_core_clear_special_wakeup(struct cpu_thread *cpu)
 				chip_id, core_id);
 		return OPAL_HARDWARE;
 	}
-	time_wait_us(1);
 
-	for (i = 0; i < P9_SPWKUP_TIMEOUT / P9_SPWKUP_POLL_INTERVAL; i++) {
-		if (xscom_read(chip_id, sshhyp_addr, &val)) {
-			prlog(PR_ERR, "Could not clear special wakeup on %u:%u:"
-					" Unable to read PPM_SSHHYP.\n",
-					chip_id, core_id);
-			return OPAL_HARDWARE;
-		}
-		if (!(val & P9_SPECIAL_WKUP_DONE))
-			return 0;
-
-		time_wait_us(P9_SPWKUP_POLL_INTERVAL);
-	}
-
-	prlog(PR_ERR, "Could not clear special wakeup on %u:%u:"
-			" timeout waiting for clear of SPECIAL_WKUP_DONE.\n",
-			chip_id, core_id);
-	return OPAL_HARDWARE;
+	/*
+	 * Don't wait for de-assert to complete as other components
+	 * could have requested for special wkeup. Wait for 10ms to
+	 * avoid back-to-back asserts
+	 */
+	time_wait_us(10000);
+	return 0;
 }
 
 static int p9_thread_quiesced(struct cpu_thread *cpu)
diff --git a/hw/dts.c b/hw/dts.c
index ecfe847..949791b 100644
--- a/hw/dts.c
+++ b/hw/dts.c
@@ -236,34 +236,6 @@  static int dts_read_core_temp_p9(uint32_t pir, struct dts *dts)
 	return 0;
 }
 
-static int dts_wakeup_core(struct cpu_thread *cpu)
-{
-	int retries = 10;
-	int rc;
-
-	while (retries-- > 0) {
-		rc = dctl_set_special_wakeup(cpu);
-		if (rc) {
-			prerror("Failed to set special wakeup on %d (%d)\n",
-				cpu->pir, rc);
-			return rc;
-		}
-
-		if (!dctl_core_is_gated(cpu))
-			break;
-
-		prlog(PR_NOTICE, "Retrying special wakeup on %d\n", cpu->pir);
-		rc = dctl_clear_special_wakeup(cpu);
-		if (rc) {
-			prerror("Failed to clear special wakeup on %d (%d)\n",
-				cpu->pir, rc);
-			return rc;
-		}
-	}
-
-	return rc;
-}
-
 static void dts_async_read_temp(struct timer *t __unused, void *data,
 				u64 now __unused)
 {
@@ -271,7 +243,7 @@  static void dts_async_read_temp(struct timer *t __unused, void *data,
 	int rc, swkup_rc;
 	struct cpu_thread *cpu = data;
 
-	swkup_rc = dts_wakeup_core(cpu);
+	swkup_rc = dctl_set_special_wakeup(cpu);
 
 	rc = dts_read_core_temp_p9(cpu->pir, &dts);
 	if (!rc) {