diff mbox

[RESEND,v4] hw/xscom: Reset XSCOM engine after finite number of retries when busy

Message ID 201606060926.u569QgVd018651@mx0a-001b2d01.pphosted.com
State Accepted
Headers show

Commit Message

Vipin K Parashar June 6, 2016, 9:26 a.m. UTC
OPAL retries XSCOM read/write operations forever till it succeeds.
This can cause XSCOM ops to hang forever when XSCOM engine remains
busy for some reason. Changed it to retry XSCOM operations only
XSCOM_BUSY_MAX_RETRIES number of times instead of retrying forever.
Also added logic to reset XSCOM engine after XSCOM_BUSY_RESET_THRESHOLD
number of retries to unblock it when it remains busy.

Cc: stable # 9c2d82394fd2 ("xscom: Return OPAL_WRONG_STATE on XSCOM ops..")
Signed-off-by: Vipin K Parashar <vipin@linux.vnet.ibm.com>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
Changes in v4:
 - Changed xscom operation retries loop counter variable to 'retries'.
 - Added a FIXME comment to replace 10 ms delay with exact HW delay
   or better method to confirm XSCOM reset completion.

Changes in v3:
 - Added delay of 10ms after XSCOM engine reset when XSCOM is found busy.
 - Modified 'if' condition to check return value of xscom_handle_error().

Changes in v2:
 - Changed newly added macro names to slightly more intuitive names.
   Used XSCOM_BUSY_MAX_RETRIES to signify total retries allowed if
   XSCOM remains busy and XSCOM_BUSY_RESET_THRESHOLD to hold threshold
   count for resetting XSCOM before retrying XSCOM operation again.

 hw/xscom.c         | 74 +++++++++++++++++++++++++++++++++++++++++-------------
 include/errorlog.h |  1 +
 include/xscom.h    |  6 +++++
 3 files changed, 64 insertions(+), 17 deletions(-)

Comments

Stewart Smith July 5, 2016, 7:16 a.m. UTC | #1
Vipin K Parashar <vipin@linux.vnet.ibm.com> writes:
> OPAL retries XSCOM read/write operations forever till it succeeds.
> This can cause XSCOM ops to hang forever when XSCOM engine remains
> busy for some reason. Changed it to retry XSCOM operations only
> XSCOM_BUSY_MAX_RETRIES number of times instead of retrying forever.
> Also added logic to reset XSCOM engine after XSCOM_BUSY_RESET_THRESHOLD
> number of retries to unblock it when it remains busy.
>
> Cc: stable # 9c2d82394fd2 ("xscom: Return OPAL_WRONG_STATE on XSCOM ops..")
> Signed-off-by: Vipin K Parashar <vipin@linux.vnet.ibm.com>
> Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
> ---
> Changes in v4:
>  - Changed xscom operation retries loop counter variable to 'retries'.
>  - Added a FIXME comment to replace 10 ms delay with exact HW delay
>    or better method to confirm XSCOM reset completion.
>
> Changes in v3:
>  - Added delay of 10ms after XSCOM engine reset when XSCOM is found busy.
>  - Modified 'if' condition to check return value of xscom_handle_error().
>
> Changes in v2:
>  - Changed newly added macro names to slightly more intuitive names.
>    Used XSCOM_BUSY_MAX_RETRIES to signify total retries allowed if
>    XSCOM remains busy and XSCOM_BUSY_RESET_THRESHOLD to hold threshold
>    count for resetting XSCOM before retrying XSCOM operation again.
>
>  hw/xscom.c         | 74 +++++++++++++++++++++++++++++++++++++++++-------------
>  include/errorlog.h |  1 +
>  include/xscom.h    |  6 +++++
>  3 files changed, 64 insertions(+), 17 deletions(-)
>
> diff --git a/hw/xscom.c b/hw/xscom.c
> index 4d88ce5..2a91a7b 100644
> --- a/hw/xscom.c
> +++ b/hw/xscom.c
> @@ -23,6 +23,7 @@
>  #include <centaur.h>
>  #include <errorlog.h>
>  #include <opal-api.h>
> +#include <timebase.h>
>  
>  /* Mask of bits to clear in HMER before an access */
>  #define HMER_CLR_MASK	(~(SPR_HMER_XSCOM_FAIL | \
> @@ -41,6 +42,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
>  		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
>  		OPAL_NA);
>  
> +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
> +		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
> +		OPAL_NA);
> +
>  /* xscom details to trigger xstop */
>  static struct {
>  	uint64_t addr;
> @@ -118,18 +123,49 @@ static void xscom_reset(uint32_t gcid)
>  	 */
>  }
>  
> -static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
> -			       bool is_write)
> +static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
> +			      bool is_write, int64_t retries)
>  {
> +	struct timespec ts;
>  	unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
>  
>  	/* XXX Figure out error codes from doc and error
>  	 * recovery procedures
>  	 */
>  	switch(stat) {
> -	/* XSCOM blocked, just retry */
> +	/*
> +	 * XSCOM engine is blocked, need to retry. Reset XSCOM engine
> +	 * after crossing retry threshold before retrying again.
> +	 */
>  	case 1:
> +		if (retries && !(retries  % XSCOM_BUSY_RESET_THRESHOLD)) {
> +			prlog(PR_NOTICE, "XSCOM: Busy even after %d retries, "
> +				"resetting XSCOM now. Total retries  = %lld\n",
> +				XSCOM_BUSY_RESET_THRESHOLD, retries);
> +			xscom_reset(gcid);
> +
> +			/*
> +			 * Its observed that sometimes immediate retry of
> +			 * XSCOM operation returns wrong data. Adding a
> +			 * delay for XSCOM reset to be effective. Delay of
> +			 * 10 ms is found to be working fine experimentally.
> +			 * FIXME: Replace 10ms delay by exact delay needed
> +			 * or other alternate method to confirm XSCOM reset
> +			 * completion, after checking from HW folks.
> +			 */
> +			ts.tv_sec = 0;
> +			ts.tv_nsec = 10 * 1000;
> +			nanosleep_nopoll(&ts, NULL);

As much as I dislike sleeping, I think this is about the best way to do
it without getting a bit nuts, and the previous behaviour was just to
sit in a tight loop and wait forever... and 10ms is significantly less
time than forever.

Merged to master as of e761222593a1ae932cddbc81239b6a7cd98ddb70
and cherry picked back to 5.2.x as of 86673d26ca2a2f49e3b4fa216140cfc8eae1500a
diff mbox

Patch

diff --git a/hw/xscom.c b/hw/xscom.c
index 4d88ce5..2a91a7b 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -23,6 +23,7 @@ 
 #include <centaur.h>
 #include <errorlog.h>
 #include <opal-api.h>
+#include <timebase.h>
 
 /* Mask of bits to clear in HMER before an access */
 #define HMER_CLR_MASK	(~(SPR_HMER_XSCOM_FAIL | \
@@ -41,6 +42,10 @@  DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
 		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
 		OPAL_NA);
 
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+		OPAL_NA);
+
 /* xscom details to trigger xstop */
 static struct {
 	uint64_t addr;
@@ -118,18 +123,49 @@  static void xscom_reset(uint32_t gcid)
 	 */
 }
 
-static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
-			       bool is_write)
+static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
+			      bool is_write, int64_t retries)
 {
+	struct timespec ts;
 	unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
 
 	/* XXX Figure out error codes from doc and error
 	 * recovery procedures
 	 */
 	switch(stat) {
-	/* XSCOM blocked, just retry */
+	/*
+	 * XSCOM engine is blocked, need to retry. Reset XSCOM engine
+	 * after crossing retry threshold before retrying again.
+	 */
 	case 1:
+		if (retries && !(retries  % XSCOM_BUSY_RESET_THRESHOLD)) {
+			prlog(PR_NOTICE, "XSCOM: Busy even after %d retries, "
+				"resetting XSCOM now. Total retries  = %lld\n",
+				XSCOM_BUSY_RESET_THRESHOLD, retries);
+			xscom_reset(gcid);
+
+			/*
+			 * Its observed that sometimes immediate retry of
+			 * XSCOM operation returns wrong data. Adding a
+			 * delay for XSCOM reset to be effective. Delay of
+			 * 10 ms is found to be working fine experimentally.
+			 * FIXME: Replace 10ms delay by exact delay needed
+			 * or other alternate method to confirm XSCOM reset
+			 * completion, after checking from HW folks.
+			 */
+			ts.tv_sec = 0;
+			ts.tv_nsec = 10 * 1000;
+			nanosleep_nopoll(&ts, NULL);
+		}
+
+		/* Log error if we have retried enough and its still busy */
+		if (retries == XSCOM_BUSY_MAX_RETRIES)
+			log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
+				"XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
+				"stat=0x%x\n", is_write ? "write" : "read",
+				gcid, pcb_addr, stat);
 		return OPAL_BUSY;
+
 	/* CPU is asleep, don't retry */
 	case 2:
 		return OPAL_WRONG_STATE;
@@ -178,14 +214,14 @@  static bool xscom_gcid_ok(uint32_t gcid)
 static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
 {
 	uint64_t hmer;
-	int64_t ret;
+	int64_t ret, retries;
 
 	if (!xscom_gcid_ok(gcid)) {
 		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
 		return OPAL_PARAMETER;
 	}
 
-	for (;;) {
+	for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
 		/* Clear status bits in HMER (HMER is special
 		 * writing to it *ands* bits
 		 */
@@ -199,27 +235,29 @@  static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
 
 		/* Check for error */
 		if (!(hmer & SPR_HMER_XSCOM_FAIL))
-			break;
+			return OPAL_SUCCESS;
 
 		/* Handle error and possibly eventually retry */
-		ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
-		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
-			return ret;
+		ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
+		if (ret != OPAL_BUSY)
+			break;
 	}
-	return OPAL_SUCCESS;
+
+	prerror("XSCOM: Read failed, ret =  %lld\n", ret);
+	return ret;
 }
 
 static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
 {
 	uint64_t hmer;
-	int64_t ret;
+	int64_t ret, retries = 0;
 
 	if (!xscom_gcid_ok(gcid)) {
 		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
 		return OPAL_PARAMETER;
 	}
 
-	for (;;) {
+	for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
 		/* Clear status bits in HMER (HMER is special
 		 * writing to it *ands* bits
 		 */
@@ -233,14 +271,16 @@  static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
 
 		/* Check for error */
 		if (!(hmer & SPR_HMER_XSCOM_FAIL))
-			break;
+			return OPAL_SUCCESS;
 
 		/* Handle error and possibly eventually retry */
-		ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
-		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
-			return ret;
+		ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
+		if (ret != OPAL_BUSY)
+			break;
 	}
-	return OPAL_SUCCESS;
+
+	prerror("XSCOM: Write failed, ret =  %lld\n", ret);
+	return ret;
 }
 
 /*
diff --git a/include/errorlog.h b/include/errorlog.h
index b8fca7d..0a4237f 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -276,6 +276,7 @@  enum opal_reasoncode {
 	OPAL_RC_XSCOM_RW		= OPAL_XS | 0x10,
 	OPAL_RC_XSCOM_INDIRECT_RW	= OPAL_XS | 0x11,
 	OPAL_RC_XSCOM_RESET		= OPAL_XS | 0x12,
+	OPAL_RC_XSCOM_BUSY		= OPAL_XS | 0x13,
 /* PCI */
 	OPAL_RC_PCI_INIT_SLOT   = OPAL_PC | 0x10,
 	OPAL_RC_PCI_ADD_SLOT    = OPAL_PC | 0x11,
diff --git a/include/xscom.h b/include/xscom.h
index 933af6a..1aee40e 100644
--- a/include/xscom.h
+++ b/include/xscom.h
@@ -167,6 +167,12 @@ 
 /* HB folks say: try 10 time for now */
 #define XSCOM_IND_MAX_RETRIES		10
 
+/* Max number of retries when XSCOM remains busy */
+#define XSCOM_BUSY_MAX_RETRIES		3000
+
+/* Retry count after which to reset XSCOM, if still busy */
+#define XSCOM_BUSY_RESET_THRESHOLD	1000
+
 /*
  * Error handling:
  *