Message ID | 20181207212811.12826-2-gpiccoli@canonical.com |
---|---|
State | New |
Headers | show |
Series | [SRU,X,1/1] nvme/pci: Poll CQ on timeout | expand |
On 12/7/18 10:28 PM, Guilherme G. Piccoli wrote: > From: Keith Busch <keith.busch@intel.com> > > BugLink: https://launchpad.net/bugs/1807393 > > If an IO timeout occurs, it's helpful to know if the controller did not > post a completion or the driver missed an interrupt. While we never expect > the latter, this patch will make it possible to tell the difference so > we don't have to guess. > > Signed-off-by: Keith Busch <keith.busch@intel.com> > Signed-off-by: Christoph Hellwig <hch@lst.de> > Tested-by: Johannes Thumshirn <jthumshirn@suse.de> > Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> > (backported from 7776db1ccc123d5944a8c170c9c45f7e91d49643 upstream) > [gpiccoli: context adjustment, fixed struct member access that changed] > Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com> Thanks for the additional info, Guilherme. Upstream fix with low regression potential: Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com> > --- > drivers/nvme/host/pci.c | 21 ++++++++++++++++++--- > 1 file changed, 18 insertions(+), 3 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 3752052ae20a..5d2a7ee2f922 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -929,10 +929,8 @@ static irqreturn_t nvme_irq_check(int irq, void *data) > return IRQ_NONE; > } > > -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) > +static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) > { > - struct nvme_queue *nvmeq = hctx->driver_data; > - > if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { > spin_lock_irq(&nvmeq->q_lock); > __nvme_process_cq(nvmeq, &tag); > @@ -945,6 +943,13 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) > return 0; > } > > +static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) > +{ > + struct nvme_queue *nvmeq = hctx->driver_data; > + > + return __nvme_poll(nvmeq, tag); > +} > + > static void nvme_async_event_work(struct work_struct *work) > { > struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work); > @@ -1045,6 +1050,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) > struct request *abort_req; > struct nvme_command cmd; > > + /* > + * Did we miss an interrupt? > + */ > + if (__nvme_poll(nvmeq, req->tag)) { > + dev_warn(dev->dev, > + "I/O %d QID %d timeout, completion polled\n", > + req->tag, nvmeq->qid); > + return BLK_EH_HANDLED; > + } > + > /* > * Shutdown immediately if controller times out while starting. The > * reset work will see the pci device disabled when it gets the forced
On 07.12.18 22:28, Guilherme G. Piccoli wrote: > From: Keith Busch <keith.busch@intel.com> > > BugLink: https://launchpad.net/bugs/1807393 > > If an IO timeout occurs, it's helpful to know if the controller did not > post a completion or the driver missed an interrupt. While we never expect > the latter, this patch will make it possible to tell the difference so > we don't have to guess. > > Signed-off-by: Keith Busch <keith.busch@intel.com> > Signed-off-by: Christoph Hellwig <hch@lst.de> > Tested-by: Johannes Thumshirn <jthumshirn@suse.de> > Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> > (backported from 7776db1ccc123d5944a8c170c9c45f7e91d49643 upstream) > [gpiccoli: context adjustment, fixed struct member access that changed] > Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com> Acked-by: Stefan Bader <stefan.bader@canonical.com> > --- > drivers/nvme/host/pci.c | 21 ++++++++++++++++++--- > 1 file changed, 18 insertions(+), 3 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 3752052ae20a..5d2a7ee2f922 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -929,10 +929,8 @@ static irqreturn_t nvme_irq_check(int irq, void *data) > return IRQ_NONE; > } > > -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) > +static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) > { > - struct nvme_queue *nvmeq = hctx->driver_data; > - > if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { > spin_lock_irq(&nvmeq->q_lock); > __nvme_process_cq(nvmeq, &tag); > @@ -945,6 +943,13 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) > return 0; > } > > +static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) > +{ > + struct nvme_queue *nvmeq = hctx->driver_data; > + > + return __nvme_poll(nvmeq, tag); > +} > + > static void nvme_async_event_work(struct work_struct *work) > { > struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work); > @@ -1045,6 +1050,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) > struct request *abort_req; > struct nvme_command cmd; > > + /* > + * Did we miss an interrupt? > + */ > + if (__nvme_poll(nvmeq, req->tag)) { > + dev_warn(dev->dev, > + "I/O %d QID %d timeout, completion polled\n", > + req->tag, nvmeq->qid); > + return BLK_EH_HANDLED; > + } > + > /* > * Shutdown immediately if controller times out while starting. The > * reset work will see the pci device disabled when it gets the forced >
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3752052ae20a..5d2a7ee2f922 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -929,10 +929,8 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_NONE; } -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) { - struct nvme_queue *nvmeq = hctx->driver_data; - if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { spin_lock_irq(&nvmeq->q_lock); __nvme_process_cq(nvmeq, &tag); @@ -945,6 +943,13 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } +static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + return __nvme_poll(nvmeq, tag); +} + static void nvme_async_event_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work); @@ -1045,6 +1050,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct request *abort_req; struct nvme_command cmd; + /* + * Did we miss an interrupt? + */ + if (__nvme_poll(nvmeq, req->tag)) { + dev_warn(dev->dev, + "I/O %d QID %d timeout, completion polled\n", + req->tag, nvmeq->qid); + return BLK_EH_HANDLED; + } + /* * Shutdown immediately if controller times out while starting. The * reset work will see the pci device disabled when it gets the forced