Message ID | 1463395432-10423-1-git-send-email-vipin@linux.vnet.ibm.com |
---|---|
State | Superseded |
Headers | show |
I have sent out a v2 of this patch that renames the newly added macros to slightly more intuitive names. In v2, XSCOM_BUSY_MAX_RETRIES is the total number of retries allowed while XSCOM remains busy (XSCOM_OPS_MAX_RETRIES in the v1 patch quoted below), and XSCOM_BUSY_RESET_THRESHOLD is the retry count at which XSCOM is reset before the operation is retried again (XSCOM_BUSY_MAX_RETRIES in v1). Regards, Vipin On Monday 16 May 2016 04:13 PM, Vipin K Parashar wrote: > OPAL retries XSCOM read/write operations forever till it succeeds. > In case XSCOM remains busy for some reason, it causes XSCOM ops to hang. > Added logic to retry XSCOM operations only XSCOM_OPS_MAX_RETRIES number > of times. Also added logic to reset XSCOM after XSCOM_BUSY_MAX_RETRIES > number of retries to unblock it, if it remains busy for some reason. > > Signed-off-by: Vipin K Parashar <vipin@linux.vnet.ibm.com> > Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> > --- > hw/xscom.c | 63 ++++++++++++++++++++++++++++++++++++++++-------------- > include/errorlog.h | 1 + > include/xscom.h | 6 ++++++ > 3 files changed, 54 insertions(+), 16 deletions(-) > > diff --git a/hw/xscom.c b/hw/xscom.c > index 84f72f5..04f1e33 100644 > --- a/hw/xscom.c > +++ b/hw/xscom.c > @@ -41,6 +41,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM, > OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, > OPAL_NA); > > +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM, > + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, > + OPAL_NA); > + > /* xscom details to trigger xstop */ > static struct { > uint64_t addr; > @@ -119,7 +123,7 @@ static void xscom_reset(uint32_t gcid) > } > > static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, > - bool is_write) > + bool is_write, int64_t retries) > { > unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer); > > @@ -127,9 +131,26 @@ static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, > * recovery procedures > */ > switch(stat) { > - /* XSCOM blocked, just retry */ > + /* > + * 
XSCOM blocked, need to retry. Reset XSCOM after > + * crossing retry threshold before retrying again. > + */ > case 1: > + if (retries && !(retries % XSCOM_BUSY_MAX_RETRIES)) { > + prlog(PR_NOTICE, "XSCOM: Busy!! Resetting after %d " > + "retries, Total retries = %lld\n", > + XSCOM_BUSY_MAX_RETRIES, retries); > + xscom_reset(gcid); > + } > + > + /* Log error if we have retried enough and its still busy */ > + if (retries == XSCOM_OPS_MAX_RETRIES) > + log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY), > + "XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x " > + "stat=0x%x\n", is_write ? "write" : "read", > + gcid, pcb_addr, stat); > return OPAL_BUSY; > + > /* CPU is asleep, don't retry */ > case 2: > return OPAL_WRONG_STATE; > @@ -177,15 +198,16 @@ static bool xscom_gcid_ok(uint32_t gcid) > */ > static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) > { > + int i; > uint64_t hmer; > - int64_t ret; > + int64_t ret, retries = 0; > > if (!xscom_gcid_ok(gcid)) { > prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); > return OPAL_PARAMETER; > } > > - for (;;) { > + for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) { > /* Clear status bits in HMER (HMER is special > * writing to it *ands* bits > */ > @@ -199,27 +221,32 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) > > /* Check for error */ > if (!(hmer & SPR_HMER_XSCOM_FAIL)) > - break; > + return OPAL_SUCCESS; > > /* Handle error and possibly eventually retry */ > - ret = xscom_handle_error(hmer, gcid, pcb_addr, false); > - if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE) > - return ret; > + ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries); > + if (ret == OPAL_BUSY) > + retries++; > + else > + break; > } > - return OPAL_SUCCESS; > + > + prerror("XSCOM: Read failed, ret = %lld\n", ret); > + return ret; > } > > static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) > { > + int i; > uint64_t hmer; > - int64_t ret; > + int64_t ret, retries = 0; > > if 
(!xscom_gcid_ok(gcid)) { > prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); > return OPAL_PARAMETER; > } > > - for (;;) { > + for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) { > /* Clear status bits in HMER (HMER is special > * writing to it *ands* bits > */ > @@ -233,14 +260,18 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) > > /* Check for error */ > if (!(hmer & SPR_HMER_XSCOM_FAIL)) > - break; > + return OPAL_SUCCESS; > > /* Handle error and possibly eventually retry */ > - ret = xscom_handle_error(hmer, gcid, pcb_addr, true); > - if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE) > - return ret; > + ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries); > + if (ret == OPAL_BUSY) > + retries++; > + else > + break; > } > - return OPAL_SUCCESS; > + > + prerror("XSCOM: Write failed, ret = %lld\n", ret); > + return ret; > } > > /* > diff --git a/include/errorlog.h b/include/errorlog.h > index ed90dab..214aed2 100644 > --- a/include/errorlog.h > +++ b/include/errorlog.h > @@ -275,6 +275,7 @@ enum opal_reasoncode { > OPAL_RC_XSCOM_RW = OPAL_XS | 0x10, > OPAL_RC_XSCOM_INDIRECT_RW = OPAL_XS | 0x11, > OPAL_RC_XSCOM_RESET = OPAL_XS | 0x12, > + OPAL_RC_XSCOM_BUSY = OPAL_XS | 0x13, > /* PCI */ > OPAL_RC_PCI_INIT_SLOT = OPAL_PC | 0x10, > OPAL_RC_PCI_ADD_SLOT = OPAL_PC | 0x11, > diff --git a/include/xscom.h b/include/xscom.h > index 933af6a..2055608 100644 > --- a/include/xscom.h > +++ b/include/xscom.h > @@ -167,6 +167,12 @@ > /* HB folks say: try 10 time for now */ > #define XSCOM_IND_MAX_RETRIES 10 > > +/* Max retry count for XSCOM ops */ > +#define XSCOM_OPS_MAX_RETRIES 3000 > + > +/* Retry count after which to reset XSCOM, if still busy */ > +#define XSCOM_BUSY_MAX_RETRIES 1000 > + > /* > * Error handling: > *
diff --git a/hw/xscom.c b/hw/xscom.c index 84f72f5..04f1e33 100644 --- a/hw/xscom.c +++ b/hw/xscom.c @@ -41,6 +41,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM, OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, OPAL_NA); +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA); + /* xscom details to trigger xstop */ static struct { uint64_t addr; @@ -119,7 +123,7 @@ static void xscom_reset(uint32_t gcid) } static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, - bool is_write) + bool is_write, int64_t retries) { unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer); @@ -127,9 +131,26 @@ static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, * recovery procedures */ switch(stat) { - /* XSCOM blocked, just retry */ + /* + * XSCOM blocked, need to retry. Reset XSCOM after + * crossing retry threshold before retrying again. + */ case 1: + if (retries && !(retries % XSCOM_BUSY_MAX_RETRIES)) { + prlog(PR_NOTICE, "XSCOM: Busy!! Resetting after %d " + "retries, Total retries = %lld\n", + XSCOM_BUSY_MAX_RETRIES, retries); + xscom_reset(gcid); + } + + /* Log error if we have retried enough and its still busy */ + if (retries == XSCOM_OPS_MAX_RETRIES) + log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY), + "XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x " + "stat=0x%x\n", is_write ? 
"write" : "read", + gcid, pcb_addr, stat); return OPAL_BUSY; + /* CPU is asleep, don't retry */ case 2: return OPAL_WRONG_STATE; @@ -177,15 +198,16 @@ static bool xscom_gcid_ok(uint32_t gcid) */ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) { + int i; uint64_t hmer; - int64_t ret; + int64_t ret, retries = 0; if (!xscom_gcid_ok(gcid)) { prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); return OPAL_PARAMETER; } - for (;;) { + for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) { /* Clear status bits in HMER (HMER is special * writing to it *ands* bits */ @@ -199,27 +221,32 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) /* Check for error */ if (!(hmer & SPR_HMER_XSCOM_FAIL)) - break; + return OPAL_SUCCESS; /* Handle error and possibly eventually retry */ - ret = xscom_handle_error(hmer, gcid, pcb_addr, false); - if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE) - return ret; + ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries); + if (ret == OPAL_BUSY) + retries++; + else + break; } - return OPAL_SUCCESS; + + prerror("XSCOM: Read failed, ret = %lld\n", ret); + return ret; } static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) { + int i; uint64_t hmer; - int64_t ret; + int64_t ret, retries = 0; if (!xscom_gcid_ok(gcid)) { prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); return OPAL_PARAMETER; } - for (;;) { + for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) { /* Clear status bits in HMER (HMER is special * writing to it *ands* bits */ @@ -233,14 +260,18 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) /* Check for error */ if (!(hmer & SPR_HMER_XSCOM_FAIL)) - break; + return OPAL_SUCCESS; /* Handle error and possibly eventually retry */ - ret = xscom_handle_error(hmer, gcid, pcb_addr, true); - if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE) - return ret; + ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries); + if (ret == OPAL_BUSY) 
+ retries++; + else + break; } - return OPAL_SUCCESS; + + prerror("XSCOM: Write failed, ret = %lld\n", ret); + return ret; } /* diff --git a/include/errorlog.h b/include/errorlog.h index ed90dab..214aed2 100644 --- a/include/errorlog.h +++ b/include/errorlog.h @@ -275,6 +275,7 @@ enum opal_reasoncode { OPAL_RC_XSCOM_RW = OPAL_XS | 0x10, OPAL_RC_XSCOM_INDIRECT_RW = OPAL_XS | 0x11, OPAL_RC_XSCOM_RESET = OPAL_XS | 0x12, + OPAL_RC_XSCOM_BUSY = OPAL_XS | 0x13, /* PCI */ OPAL_RC_PCI_INIT_SLOT = OPAL_PC | 0x10, OPAL_RC_PCI_ADD_SLOT = OPAL_PC | 0x11, diff --git a/include/xscom.h b/include/xscom.h index 933af6a..2055608 100644 --- a/include/xscom.h +++ b/include/xscom.h @@ -167,6 +167,12 @@ /* HB folks say: try 10 time for now */ #define XSCOM_IND_MAX_RETRIES 10 +/* Max retry count for XSCOM ops */ +#define XSCOM_OPS_MAX_RETRIES 3000 + +/* Retry count after which to reset XSCOM, if still busy */ +#define XSCOM_BUSY_MAX_RETRIES 1000 + /* * Error handling: *