diff mbox

hw/xscom: Reset XSCOM after finite number of retries when busy

Message ID 1463395432-10423-1-git-send-email-vipin@linux.vnet.ibm.com
State Superseded
Headers show

Commit Message

Vipin K Parashar May 16, 2016, 10:43 a.m. UTC
OPAL retries XSCOM read/write operations forever until they succeed.
If XSCOM remains busy for some reason, this causes XSCOM ops to hang.
Add logic to retry XSCOM operations at most XSCOM_OPS_MAX_RETRIES
times. Also add logic to reset XSCOM after XSCOM_BUSY_MAX_RETRIES
retries to unblock it if it remains busy for some reason.

Signed-off-by: Vipin K Parashar <vipin@linux.vnet.ibm.com>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
 hw/xscom.c         | 63 ++++++++++++++++++++++++++++++++++++++++--------------
 include/errorlog.h |  1 +
 include/xscom.h    |  6 ++++++
 3 files changed, 54 insertions(+), 16 deletions(-)

Comments

Vipin K Parashar May 16, 2016, 4:15 p.m. UTC | #1
Sent out a v2 of this patch, renaming the newly added macros to
slightly more intuitive names.

v2 uses XSCOM_BUSY_MAX_RETRIES for the total number of retries allowed
while XSCOM remains busy, and XSCOM_BUSY_RESET_THRESHOLD for the retry
count at which XSCOM is reset before the XSCOM operation is retried
again.

Regards,
Vipin

On Monday 16 May 2016 04:13 PM, Vipin K Parashar wrote:
> OPAL retries XSCOM read/write operations forever till it succeeds.
> In case XSCOM remains busy for some reason, it causes XSCOM ops to hang.
> Added logic to retry XSCOM operations only XSCOM_OPS_MAX_RETRIES number
> of times. Also added logic to reset XSCOM after XSCOM_BUSY_MAX_RETRIES
> number of retries to unblock it, if it remains busy for some reason.
>
> Signed-off-by: Vipin K Parashar <vipin@linux.vnet.ibm.com>
> Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
> ---
>   hw/xscom.c         | 63 ++++++++++++++++++++++++++++++++++++++++--------------
>   include/errorlog.h |  1 +
>   include/xscom.h    |  6 ++++++
>   3 files changed, 54 insertions(+), 16 deletions(-)
>
> diff --git a/hw/xscom.c b/hw/xscom.c
> index 84f72f5..04f1e33 100644
> --- a/hw/xscom.c
> +++ b/hw/xscom.c
> @@ -41,6 +41,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
>   		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
>   		OPAL_NA);
>
> +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
> +		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
> +		OPAL_NA);
> +
>   /* xscom details to trigger xstop */
>   static struct {
>   	uint64_t addr;
> @@ -119,7 +123,7 @@ static void xscom_reset(uint32_t gcid)
>   }
>
>   static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
> -			       bool is_write)
> +			      bool is_write, int64_t retries)
>   {
>   	unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
>
> @@ -127,9 +131,26 @@ static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
>   	 * recovery procedures
>   	 */
>   	switch(stat) {
> -	/* XSCOM blocked, just retry */
> +	/*
> +	 * XSCOM blocked, need to retry. Reset XSCOM after
> +	 * crossing retry threshold before retrying again.
> +	 */
>   	case 1:
> +		if (retries && !(retries  % XSCOM_BUSY_MAX_RETRIES)) {
> +			prlog(PR_NOTICE, "XSCOM: Busy!! Resetting after %d "
> +				"retries, Total retries  = %lld\n",
> +				XSCOM_BUSY_MAX_RETRIES, retries);
> +			xscom_reset(gcid);
> +		}
> +
> +		/* Log error if we have retried enough and its still busy */
> +		if (retries == XSCOM_OPS_MAX_RETRIES)
> +			log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
> +				"XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
> +				"stat=0x%x\n", is_write ? "write" : "read",
> +				gcid, pcb_addr, stat);
>   		return OPAL_BUSY;
> +
>   	/* CPU is asleep, don't retry */
>   	case 2:
>   		return OPAL_WRONG_STATE;
> @@ -177,15 +198,16 @@ static bool xscom_gcid_ok(uint32_t gcid)
>    */
>   static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>   {
> +	int i;
>   	uint64_t hmer;
> -	int64_t ret;
> +	int64_t ret, retries = 0;
>
>   	if (!xscom_gcid_ok(gcid)) {
>   		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
>   		return OPAL_PARAMETER;
>   	}
>
> -	for (;;) {
> +	for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) {
>   		/* Clear status bits in HMER (HMER is special
>   		 * writing to it *ands* bits
>   		 */
> @@ -199,27 +221,32 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>
>   		/* Check for error */
>   		if (!(hmer & SPR_HMER_XSCOM_FAIL))
> -			break;
> +			return OPAL_SUCCESS;
>
>   		/* Handle error and possibly eventually retry */
> -		ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
> -		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
> -			return ret;
> +		ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
> +		if (ret == OPAL_BUSY)
> +			retries++;
> +		else
> +			break;
>   	}
> -	return OPAL_SUCCESS;
> +
> +	prerror("XSCOM: Read failed, ret =  %lld\n", ret);
> +	return ret;
>   }
>
>   static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>   {
> +	int i;
>   	uint64_t hmer;
> -	int64_t ret;
> +	int64_t ret, retries = 0;
>
>   	if (!xscom_gcid_ok(gcid)) {
>   		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
>   		return OPAL_PARAMETER;
>   	}
>
> -	for (;;) {
> +	for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) {
>   		/* Clear status bits in HMER (HMER is special
>   		 * writing to it *ands* bits
>   		 */
> @@ -233,14 +260,18 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>
>   		/* Check for error */
>   		if (!(hmer & SPR_HMER_XSCOM_FAIL))
> -			break;
> +			return OPAL_SUCCESS;
>
>   		/* Handle error and possibly eventually retry */
> -		ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
> -		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
> -			return ret;
> +		ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
> +		if (ret == OPAL_BUSY)
> +			retries++;
> +		else
> +			break;
>   	}
> -	return OPAL_SUCCESS;
> +
> +	prerror("XSCOM: Write failed, ret =  %lld\n", ret);
> +	return ret;
>   }
>
>   /*
> diff --git a/include/errorlog.h b/include/errorlog.h
> index ed90dab..214aed2 100644
> --- a/include/errorlog.h
> +++ b/include/errorlog.h
> @@ -275,6 +275,7 @@ enum opal_reasoncode {
>   	OPAL_RC_XSCOM_RW		= OPAL_XS | 0x10,
>   	OPAL_RC_XSCOM_INDIRECT_RW	= OPAL_XS | 0x11,
>   	OPAL_RC_XSCOM_RESET		= OPAL_XS | 0x12,
> +	OPAL_RC_XSCOM_BUSY		= OPAL_XS | 0x13,
>   /* PCI */
>   	OPAL_RC_PCI_INIT_SLOT   = OPAL_PC | 0x10,
>   	OPAL_RC_PCI_ADD_SLOT    = OPAL_PC | 0x11,
> diff --git a/include/xscom.h b/include/xscom.h
> index 933af6a..2055608 100644
> --- a/include/xscom.h
> +++ b/include/xscom.h
> @@ -167,6 +167,12 @@
>   /* HB folks say: try 10 time for now */
>   #define XSCOM_IND_MAX_RETRIES		10
>
> +/* Max retry count for XSCOM ops */
> +#define XSCOM_OPS_MAX_RETRIES		3000
> +
> +/* Retry count after which to reset XSCOM, if still busy */
> +#define XSCOM_BUSY_MAX_RETRIES		1000
> +
>   /*
>    * Error handling:
>    *
diff mbox

Patch

diff --git a/hw/xscom.c b/hw/xscom.c
index 84f72f5..04f1e33 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -41,6 +41,10 @@  DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
 		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
 		OPAL_NA);
 
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+		OPAL_NA);
+
 /* xscom details to trigger xstop */
 static struct {
 	uint64_t addr;
@@ -119,7 +123,7 @@  static void xscom_reset(uint32_t gcid)
 }
 
 static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
-			       bool is_write)
+			      bool is_write, int64_t retries)
 {
 	unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
 
@@ -127,9 +131,26 @@  static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
 	 * recovery procedures
 	 */
 	switch(stat) {
-	/* XSCOM blocked, just retry */
+	/*
+	 * XSCOM blocked, need to retry. Reset XSCOM after
+	 * crossing retry threshold before retrying again.
+	 */
 	case 1:
+		if (retries && !(retries  % XSCOM_BUSY_MAX_RETRIES)) {
+			prlog(PR_NOTICE, "XSCOM: Busy!! Resetting after %d "
+				"retries, Total retries  = %lld\n",
+				XSCOM_BUSY_MAX_RETRIES, retries);
+			xscom_reset(gcid);
+		}
+
+		/* Log error if we have retried enough and its still busy */
+		if (retries == XSCOM_OPS_MAX_RETRIES)
+			log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
+				"XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
+				"stat=0x%x\n", is_write ? "write" : "read",
+				gcid, pcb_addr, stat);
 		return OPAL_BUSY;
+
 	/* CPU is asleep, don't retry */
 	case 2:
 		return OPAL_WRONG_STATE;
@@ -177,15 +198,16 @@  static bool xscom_gcid_ok(uint32_t gcid)
  */
 static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
 {
+	int i;
 	uint64_t hmer;
-	int64_t ret;
+	int64_t ret, retries = 0;
 
 	if (!xscom_gcid_ok(gcid)) {
 		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
 		return OPAL_PARAMETER;
 	}
 
-	for (;;) {
+	for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) {
 		/* Clear status bits in HMER (HMER is special
 		 * writing to it *ands* bits
 		 */
@@ -199,27 +221,32 @@  static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
 
 		/* Check for error */
 		if (!(hmer & SPR_HMER_XSCOM_FAIL))
-			break;
+			return OPAL_SUCCESS;
 
 		/* Handle error and possibly eventually retry */
-		ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
-		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
-			return ret;
+		ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
+		if (ret == OPAL_BUSY)
+			retries++;
+		else
+			break;
 	}
-	return OPAL_SUCCESS;
+
+	prerror("XSCOM: Read failed, ret =  %lld\n", ret);
+	return ret;
 }
 
 static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
 {
+	int i;
 	uint64_t hmer;
-	int64_t ret;
+	int64_t ret, retries = 0;
 
 	if (!xscom_gcid_ok(gcid)) {
 		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
 		return OPAL_PARAMETER;
 	}
 
-	for (;;) {
+	for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) {
 		/* Clear status bits in HMER (HMER is special
 		 * writing to it *ands* bits
 		 */
@@ -233,14 +260,18 @@  static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
 
 		/* Check for error */
 		if (!(hmer & SPR_HMER_XSCOM_FAIL))
-			break;
+			return OPAL_SUCCESS;
 
 		/* Handle error and possibly eventually retry */
-		ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
-		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
-			return ret;
+		ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
+		if (ret == OPAL_BUSY)
+			retries++;
+		else
+			break;
 	}
-	return OPAL_SUCCESS;
+
+	prerror("XSCOM: Write failed, ret =  %lld\n", ret);
+	return ret;
 }
 
 /*
diff --git a/include/errorlog.h b/include/errorlog.h
index ed90dab..214aed2 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -275,6 +275,7 @@  enum opal_reasoncode {
 	OPAL_RC_XSCOM_RW		= OPAL_XS | 0x10,
 	OPAL_RC_XSCOM_INDIRECT_RW	= OPAL_XS | 0x11,
 	OPAL_RC_XSCOM_RESET		= OPAL_XS | 0x12,
+	OPAL_RC_XSCOM_BUSY		= OPAL_XS | 0x13,
 /* PCI */
 	OPAL_RC_PCI_INIT_SLOT   = OPAL_PC | 0x10,
 	OPAL_RC_PCI_ADD_SLOT    = OPAL_PC | 0x11,
diff --git a/include/xscom.h b/include/xscom.h
index 933af6a..2055608 100644
--- a/include/xscom.h
+++ b/include/xscom.h
@@ -167,6 +167,12 @@ 
 /* HB folks say: try 10 time for now */
 #define XSCOM_IND_MAX_RETRIES		10
 
+/* Max retry count for XSCOM ops */
+#define XSCOM_OPS_MAX_RETRIES		3000
+
+/* Retry count after which to reset XSCOM, if still busy */
+#define XSCOM_BUSY_MAX_RETRIES		1000
+
 /*
  * Error handling:
  *