Message ID | 20180503214900.25332-1-dann.frazier@canonical.com |
---|---|
State | New |
Headers | show |
Series | [SRU,Bionic] scsi: libsas: defer ata device eh commands to libata | expand |
On 05/03/18 23:49, dann frazier wrote: > From: Jason Yan <yanaijie@huawei.com> > > BugLink: https://bugs.launchpad.net/bugs/1768971 > > When ata device doing EH, some commands still attached with tasks are > not passed to libata when abort failed or recover failed, so libata did > not handle these commands. After these commands done, sas task is freed, > but ata qc is not freed. This will cause ata qc leak and trigger a > warning like below: > > WARNING: CPU: 0 PID: 28512 at drivers/ata/libata-eh.c:4037 > ata_eh_finish+0xb4/0xcc > CPU: 0 PID: 28512 Comm: kworker/u32:2 Tainted: G W OE 4.14.0#1 > ...... > Call trace: > [<ffff0000088b7bd0>] ata_eh_finish+0xb4/0xcc > [<ffff0000088b8420>] ata_do_eh+0xc4/0xd8 > [<ffff0000088b8478>] ata_std_error_handler+0x44/0x8c > [<ffff0000088b8068>] ata_scsi_port_error_handler+0x480/0x694 > [<ffff000008875fc4>] async_sas_ata_eh+0x4c/0x80 > [<ffff0000080f6be8>] async_run_entry_fn+0x4c/0x170 > [<ffff0000080ebd70>] process_one_work+0x144/0x390 > [<ffff0000080ec100>] worker_thread+0x144/0x418 > [<ffff0000080f2c98>] kthread+0x10c/0x138 > [<ffff0000080855dc>] ret_from_fork+0x10/0x18 > > If ata qc leaked too many, ata tag allocation will fail and io blocked > for ever. > > As suggested by Dan Williams, defer ata device commands to libata and > merge sas_eh_finish_cmd() with sas_eh_defer_cmd(). libata will handle > ata qcs correctly after this. > > Signed-off-by: Jason Yan <yanaijie@huawei.com> > CC: Xiaofei Tan <tanxiaofei@huawei.com> > CC: John Garry <john.garry@huawei.com> > CC: Dan Williams <dan.j.williams@intel.com> > Reviewed-by: Dan Williams <dan.j.williams@intel.com> > Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> > (cherry picked from commit 318aaf34f1179b39fa9c30fa0f3288b645beee39) > Signed-off-by: dann frazier <dann.frazier@canonical.com> > --- > drivers/scsi/libsas/sas_scsi_host.c | 33 ++++++++++++----------------- > 1 file changed, 13 insertions(+), 20 deletions(-) > > diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c > index a68caa0d3fb5..00902c9a34b8 100644 > --- a/drivers/scsi/libsas/sas_scsi_host.c > +++ b/drivers/scsi/libsas/sas_scsi_host.c > @@ -222,6 +222,7 @@ int sas_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) > static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) > { > struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(cmd->device->host); > + struct domain_device *dev = cmd_to_domain_dev(cmd); > struct sas_task *task = TO_SAS_TASK(cmd); > > /* At this point, we only get called following an actual abort > @@ -230,6 +231,14 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) > */ > sas_end_task(cmd, task); > > + if (dev_is_sata(dev)) { > + /* defer commands to libata so that libata EH can > + * handle ata qcs correctly > + */ > + list_move_tail(&cmd->eh_entry, &sas_ha->eh_ata_q); > + return; > + } > + > /* now finish the command and move it on to the error > * handler done list, this also takes it off the > * error handler pending list. > @@ -237,22 +246,6 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) > scsi_eh_finish_cmd(cmd, &sas_ha->eh_done_q); > } > > -static void sas_eh_defer_cmd(struct scsi_cmnd *cmd) > -{ > - struct domain_device *dev = cmd_to_domain_dev(cmd); > - struct sas_ha_struct *ha = dev->port->ha; > - struct sas_task *task = TO_SAS_TASK(cmd); > - > - if (!dev_is_sata(dev)) { > - sas_eh_finish_cmd(cmd); > - return; > - } > - > - /* report the timeout to libata */ > - sas_end_task(cmd, task); > - list_move_tail(&cmd->eh_entry, &ha->eh_ata_q); > -} > - > static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd *my_cmd) > { > struct scsi_cmnd *cmd, *n; > @@ -260,7 +253,7 @@ static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd > list_for_each_entry_safe(cmd, n, error_q, eh_entry) { > if (cmd->device->sdev_target == my_cmd->device->sdev_target && > cmd->device->lun == my_cmd->device->lun) > - sas_eh_defer_cmd(cmd); > + sas_eh_finish_cmd(cmd); > } > } > > @@ -633,12 +626,12 @@ static void sas_eh_handle_sas_errors(struct Scsi_Host *shost, struct list_head * > case TASK_IS_DONE: > SAS_DPRINTK("%s: task 0x%p is done\n", __func__, > task); > - sas_eh_defer_cmd(cmd); > + sas_eh_finish_cmd(cmd); > continue; > case TASK_IS_ABORTED: > SAS_DPRINTK("%s: task 0x%p is aborted\n", > __func__, task); > - sas_eh_defer_cmd(cmd); > + sas_eh_finish_cmd(cmd); > continue; > case TASK_IS_AT_LU: > SAS_DPRINTK("task 0x%p is at LU: lu recover\n", task); > @@ -649,7 +642,7 @@ static void sas_eh_handle_sas_errors(struct Scsi_Host *shost, struct list_head * > "recovered\n", > SAS_ADDR(task->dev), > cmd->device->lun); > - sas_eh_defer_cmd(cmd); > + sas_eh_finish_cmd(cmd); > sas_scsi_clear_queue_lu(work_q, cmd); > goto Again; > } > Hi Dann, Same here, could you please add a "[Fix]" section on the bug description? Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com> Thanks, Kleber
On Tue, May 8, 2018 at 9:12 AM, Kleber Souza <kleber.souza@canonical.com> wrote: > On 05/03/18 23:49, dann frazier wrote: >> From: Jason Yan <yanaijie@huawei.com> >> >> BugLink: https://bugs.launchpad.net/bugs/1768971 >> >> When ata device doing EH, some commands still attached with tasks are >> not passed to libata when abort failed or recover failed, so libata did >> not handle these commands. After these commands done, sas task is freed, >> but ata qc is not freed. This will cause ata qc leak and trigger a >> warning like below: >> >> WARNING: CPU: 0 PID: 28512 at drivers/ata/libata-eh.c:4037 >> ata_eh_finish+0xb4/0xcc >> CPU: 0 PID: 28512 Comm: kworker/u32:2 Tainted: G W OE 4.14.0#1 >> ...... >> Call trace: >> [<ffff0000088b7bd0>] ata_eh_finish+0xb4/0xcc >> [<ffff0000088b8420>] ata_do_eh+0xc4/0xd8 >> [<ffff0000088b8478>] ata_std_error_handler+0x44/0x8c >> [<ffff0000088b8068>] ata_scsi_port_error_handler+0x480/0x694 >> [<ffff000008875fc4>] async_sas_ata_eh+0x4c/0x80 >> [<ffff0000080f6be8>] async_run_entry_fn+0x4c/0x170 >> [<ffff0000080ebd70>] process_one_work+0x144/0x390 >> [<ffff0000080ec100>] worker_thread+0x144/0x418 >> [<ffff0000080f2c98>] kthread+0x10c/0x138 >> [<ffff0000080855dc>] ret_from_fork+0x10/0x18 >> >> If ata qc leaked too many, ata tag allocation will fail and io blocked >> for ever. >> >> As suggested by Dan Williams, defer ata device commands to libata and >> merge sas_eh_finish_cmd() with sas_eh_defer_cmd(). libata will handle >> ata qcs correctly after this. >> >> Signed-off-by: Jason Yan <yanaijie@huawei.com> >> CC: Xiaofei Tan <tanxiaofei@huawei.com> >> CC: John Garry <john.garry@huawei.com> >> CC: Dan Williams <dan.j.williams@intel.com> >> Reviewed-by: Dan Williams <dan.j.williams@intel.com> >> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> >> (cherry picked from commit 318aaf34f1179b39fa9c30fa0f3288b645beee39) >> Signed-off-by: dann frazier <dann.frazier@canonical.com> >> --- >> drivers/scsi/libsas/sas_scsi_host.c | 33 ++++++++++++----------------- >> 1 file changed, 13 insertions(+), 20 deletions(-) >> >> diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c >> index a68caa0d3fb5..00902c9a34b8 100644 >> --- a/drivers/scsi/libsas/sas_scsi_host.c >> +++ b/drivers/scsi/libsas/sas_scsi_host.c >> @@ -222,6 +222,7 @@ int sas_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) >> static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) >> { >> struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(cmd->device->host); >> + struct domain_device *dev = cmd_to_domain_dev(cmd); >> struct sas_task *task = TO_SAS_TASK(cmd); >> >> /* At this point, we only get called following an actual abort >> @@ -230,6 +231,14 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) >> */ >> sas_end_task(cmd, task); >> >> + if (dev_is_sata(dev)) { >> + /* defer commands to libata so that libata EH can >> + * handle ata qcs correctly >> + */ >> + list_move_tail(&cmd->eh_entry, &sas_ha->eh_ata_q); >> + return; >> + } >> + >> /* now finish the command and move it on to the error >> * handler done list, this also takes it off the >> * error handler pending list. >> @@ -237,22 +246,6 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) >> scsi_eh_finish_cmd(cmd, &sas_ha->eh_done_q); >> } >> >> -static void sas_eh_defer_cmd(struct scsi_cmnd *cmd) >> -{ >> - struct domain_device *dev = cmd_to_domain_dev(cmd); >> - struct sas_ha_struct *ha = dev->port->ha; >> - struct sas_task *task = TO_SAS_TASK(cmd); >> - >> - if (!dev_is_sata(dev)) { >> - sas_eh_finish_cmd(cmd); >> - return; >> - } >> - >> - /* report the timeout to libata */ >> - sas_end_task(cmd, task); >> - list_move_tail(&cmd->eh_entry, &ha->eh_ata_q); >> -} >> - >> static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd *my_cmd) >> { >> struct scsi_cmnd *cmd, *n; >> @@ -260,7 +253,7 @@ static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd >> list_for_each_entry_safe(cmd, n, error_q, eh_entry) { >> if (cmd->device->sdev_target == my_cmd->device->sdev_target && >> cmd->device->lun == my_cmd->device->lun) >> - sas_eh_defer_cmd(cmd); >> + sas_eh_finish_cmd(cmd); >> } >> } >> >> @@ -633,12 +626,12 @@ static void sas_eh_handle_sas_errors(struct Scsi_Host *shost, struct list_head * >> case TASK_IS_DONE: >> SAS_DPRINTK("%s: task 0x%p is done\n", __func__, >> task); >> - sas_eh_defer_cmd(cmd); >> + sas_eh_finish_cmd(cmd); >> continue; >> case TASK_IS_ABORTED: >> SAS_DPRINTK("%s: task 0x%p is aborted\n", >> __func__, task); >> - sas_eh_defer_cmd(cmd); >> + sas_eh_finish_cmd(cmd); >> continue; >> case TASK_IS_AT_LU: >> SAS_DPRINTK("task 0x%p is at LU: lu recover\n", task); >> @@ -649,7 +642,7 @@ static void sas_eh_handle_sas_errors(struct Scsi_Host *shost, struct list_head * >> "recovered\n", >> SAS_ADDR(task->dev), >> cmd->device->lun); >> - sas_eh_defer_cmd(cmd); >> + sas_eh_finish_cmd(cmd); >> sas_scsi_clear_queue_lu(work_q, cmd); >> goto Again; >> } >> > > Hi Dann, > > Same here, could you please add a "[Fix]" section on the bug description? > > Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com> Done, ta! -dann
On 03.05.2018 23:49, dann frazier wrote: > From: Jason Yan <yanaijie@huawei.com> > > BugLink: https://bugs.launchpad.net/bugs/1768971 > > When ata device doing EH, some commands still attached with tasks are > not passed to libata when abort failed or recover failed, so libata did > not handle these commands. After these commands done, sas task is freed, > but ata qc is not freed. This will cause ata qc leak and trigger a > warning like below: > > WARNING: CPU: 0 PID: 28512 at drivers/ata/libata-eh.c:4037 > ata_eh_finish+0xb4/0xcc > CPU: 0 PID: 28512 Comm: kworker/u32:2 Tainted: G W OE 4.14.0#1 > ...... > Call trace: > [<ffff0000088b7bd0>] ata_eh_finish+0xb4/0xcc > [<ffff0000088b8420>] ata_do_eh+0xc4/0xd8 > [<ffff0000088b8478>] ata_std_error_handler+0x44/0x8c > [<ffff0000088b8068>] ata_scsi_port_error_handler+0x480/0x694 > [<ffff000008875fc4>] async_sas_ata_eh+0x4c/0x80 > [<ffff0000080f6be8>] async_run_entry_fn+0x4c/0x170 > [<ffff0000080ebd70>] process_one_work+0x144/0x390 > [<ffff0000080ec100>] worker_thread+0x144/0x418 > [<ffff0000080f2c98>] kthread+0x10c/0x138 > [<ffff0000080855dc>] ret_from_fork+0x10/0x18 > > If ata qc leaked too many, ata tag allocation will fail and io blocked > for ever. > > As suggested by Dan Williams, defer ata device commands to libata and > merge sas_eh_finish_cmd() with sas_eh_defer_cmd(). libata will handle > ata qcs correctly after this. > > Signed-off-by: Jason Yan <yanaijie@huawei.com> > CC: Xiaofei Tan <tanxiaofei@huawei.com> > CC: John Garry <john.garry@huawei.com> > CC: Dan Williams <dan.j.williams@intel.com> > Reviewed-by: Dan Williams <dan.j.williams@intel.com> > Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> > (cherry picked from commit 318aaf34f1179b39fa9c30fa0f3288b645beee39) > Signed-off-by: dann frazier <dann.frazier@canonical.com> Acked-by: Stefan Bader <stefan.bader@canonical.com> > --- > drivers/scsi/libsas/sas_scsi_host.c | 33 ++++++++++++----------------- > 1 file changed, 13 insertions(+), 20 deletions(-) > > diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c > index a68caa0d3fb5..00902c9a34b8 100644 > --- a/drivers/scsi/libsas/sas_scsi_host.c > +++ b/drivers/scsi/libsas/sas_scsi_host.c > @@ -222,6 +222,7 @@ int sas_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) > static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) > { > struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(cmd->device->host); > + struct domain_device *dev = cmd_to_domain_dev(cmd); > struct sas_task *task = TO_SAS_TASK(cmd); > > /* At this point, we only get called following an actual abort > @@ -230,6 +231,14 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) > */ > sas_end_task(cmd, task); > > + if (dev_is_sata(dev)) { > + /* defer commands to libata so that libata EH can > + * handle ata qcs correctly > + */ > + list_move_tail(&cmd->eh_entry, &sas_ha->eh_ata_q); > + return; > + } > + > /* now finish the command and move it on to the error > * handler done list, this also takes it off the > * error handler pending list. > @@ -237,22 +246,6 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) > scsi_eh_finish_cmd(cmd, &sas_ha->eh_done_q); > } > > -static void sas_eh_defer_cmd(struct scsi_cmnd *cmd) > -{ > - struct domain_device *dev = cmd_to_domain_dev(cmd); > - struct sas_ha_struct *ha = dev->port->ha; > - struct sas_task *task = TO_SAS_TASK(cmd); > - > - if (!dev_is_sata(dev)) { > - sas_eh_finish_cmd(cmd); > - return; > - } > - > - /* report the timeout to libata */ > - sas_end_task(cmd, task); > - list_move_tail(&cmd->eh_entry, &ha->eh_ata_q); > -} > - > static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd *my_cmd) > { > struct scsi_cmnd *cmd, *n; > @@ -260,7 +253,7 @@ static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd > list_for_each_entry_safe(cmd, n, error_q, eh_entry) { > if (cmd->device->sdev_target == my_cmd->device->sdev_target && > cmd->device->lun == my_cmd->device->lun) > - sas_eh_defer_cmd(cmd); > + sas_eh_finish_cmd(cmd); > } > } > > @@ -633,12 +626,12 @@ static void sas_eh_handle_sas_errors(struct Scsi_Host *shost, struct list_head * > case TASK_IS_DONE: > SAS_DPRINTK("%s: task 0x%p is done\n", __func__, > task); > - sas_eh_defer_cmd(cmd); > + sas_eh_finish_cmd(cmd); > continue; > case TASK_IS_ABORTED: > SAS_DPRINTK("%s: task 0x%p is aborted\n", > __func__, task); > - sas_eh_defer_cmd(cmd); > + sas_eh_finish_cmd(cmd); > continue; > case TASK_IS_AT_LU: > SAS_DPRINTK("task 0x%p is at LU: lu recover\n", task); > @@ -649,7 +642,7 @@ static void sas_eh_handle_sas_errors(struct Scsi_Host *shost, struct list_head * > "recovered\n", > SAS_ADDR(task->dev), > cmd->device->lun); > - sas_eh_defer_cmd(cmd); > + sas_eh_finish_cmd(cmd); > sas_scsi_clear_queue_lu(work_q, cmd); > goto Again; > } >
Applied to bionic master-next. -Stefan
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index a68caa0d3fb5..00902c9a34b8 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -222,6 +222,7 @@ int sas_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) { struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(cmd->device->host); + struct domain_device *dev = cmd_to_domain_dev(cmd); struct sas_task *task = TO_SAS_TASK(cmd); /* At this point, we only get called following an actual abort @@ -230,6 +231,14 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) */ sas_end_task(cmd, task); + if (dev_is_sata(dev)) { + /* defer commands to libata so that libata EH can + * handle ata qcs correctly + */ + list_move_tail(&cmd->eh_entry, &sas_ha->eh_ata_q); + return; + } + /* now finish the command and move it on to the error * handler done list, this also takes it off the * error handler pending list. @@ -237,22 +246,6 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd) scsi_eh_finish_cmd(cmd, &sas_ha->eh_done_q); } -static void sas_eh_defer_cmd(struct scsi_cmnd *cmd) -{ - struct domain_device *dev = cmd_to_domain_dev(cmd); - struct sas_ha_struct *ha = dev->port->ha; - struct sas_task *task = TO_SAS_TASK(cmd); - - if (!dev_is_sata(dev)) { - sas_eh_finish_cmd(cmd); - return; - } - - /* report the timeout to libata */ - sas_end_task(cmd, task); - list_move_tail(&cmd->eh_entry, &ha->eh_ata_q); -} - static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd *my_cmd) { struct scsi_cmnd *cmd, *n; @@ -260,7 +253,7 @@ static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd list_for_each_entry_safe(cmd, n, error_q, eh_entry) { if (cmd->device->sdev_target == my_cmd->device->sdev_target && cmd->device->lun == my_cmd->device->lun) - sas_eh_defer_cmd(cmd); + sas_eh_finish_cmd(cmd); } } @@ -633,12 +626,12 @@ static void sas_eh_handle_sas_errors(struct Scsi_Host *shost, struct list_head * case TASK_IS_DONE: SAS_DPRINTK("%s: task 0x%p is done\n", __func__, task); - sas_eh_defer_cmd(cmd); + sas_eh_finish_cmd(cmd); continue; case TASK_IS_ABORTED: SAS_DPRINTK("%s: task 0x%p is aborted\n", __func__, task); - sas_eh_defer_cmd(cmd); + sas_eh_finish_cmd(cmd); continue; case TASK_IS_AT_LU: SAS_DPRINTK("task 0x%p is at LU: lu recover\n", task); @@ -649,7 +642,7 @@ static void sas_eh_handle_sas_errors(struct Scsi_Host *shost, struct list_head * "recovered\n", SAS_ADDR(task->dev), cmd->device->lun); - sas_eh_defer_cmd(cmd); + sas_eh_finish_cmd(cmd); sas_scsi_clear_queue_lu(work_q, cmd); goto Again; }