diff mbox series

vhost-user-scsi: support reconnect to backend

Message ID 20230721105205.1714449-1-fengli@smartx.com
State New
Headers show
Series vhost-user-scsi: support reconnect to backend | expand

Commit Message

Li Feng July 21, 2023, 10:51 a.m. UTC
If the backend crashes and restarts, the device is broken.
This patch adds reconnect for vhost-user-scsi.

Tested with spdk backend.

Signed-off-by: Li Feng <fengli@smartx.com>
---
 hw/block/vhost-user-blk.c           |   2 -
 hw/scsi/vhost-scsi-common.c         |  27 ++---
 hw/scsi/vhost-user-scsi.c           | 163 +++++++++++++++++++++++++---
 include/hw/virtio/vhost-user-scsi.h |   3 +
 include/hw/virtio/vhost.h           |   2 +
 5 files changed, 165 insertions(+), 32 deletions(-)

Comments

Raphael Norwitz July 24, 2023, 5:21 p.m. UTC | #1
Very excited to see this. High level looks good modulo a few small things.

My major concern is around existing vhost-user-scsi backends which don’t support VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD. IMO we should hide the reconnect behavior behind a VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD check. We may want to do the same for vhost-user-blk.

The question is then what happens if the check is false. IIUC without an inflight FD, if a device processes requests out of order, it’s not safe to continue execution on reconnect, as there’s no way for the backend to know how to replay IO. Should we permanently wedge the device or have QEMU fail out? May be nice to have a toggle for this.

> On Jul 21, 2023, at 6:51 AM, Li Feng <fengli@smartx.com> wrote:
> 
> If the backend crashes and restarts, the device is broken.
> This patch adds reconnect for vhost-user-scsi.
> 
> Tested with spdk backend.
> 
> Signed-off-by: Li Feng <fengli@smartx.com>
> ---
> hw/block/vhost-user-blk.c           |   2 -
> hw/scsi/vhost-scsi-common.c         |  27 ++---
> hw/scsi/vhost-user-scsi.c           | 163 +++++++++++++++++++++++++---
> include/hw/virtio/vhost-user-scsi.h |   3 +
> include/hw/virtio/vhost.h           |   2 +
> 5 files changed, 165 insertions(+), 32 deletions(-)
> 
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index eecf3f7a81..f250c740b5 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -32,8 +32,6 @@
> #include "sysemu/sysemu.h"
> #include "sysemu/runstate.h"
> 
> -#define REALIZE_CONNECTION_RETRIES 3
> -
> static const int user_feature_bits[] = {
>     VIRTIO_BLK_F_SIZE_MAX,
>     VIRTIO_BLK_F_SEG_MAX,
> diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c

Why can’t all the vhost-scsi-common stuff be moved to a separate change?

Especially the stuff introduced for vhost-user-blk in 1b0063b3048af65dfaae6422a572c87db8575a92 should be moved out.

> index a06f01af26..08801886b8 100644
> --- a/hw/scsi/vhost-scsi-common.c
> +++ b/hw/scsi/vhost-scsi-common.c
> @@ -52,16 +52,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
> 
>     vsc->dev.acked_features = vdev->guest_features;
> 
> -    assert(vsc->inflight == NULL);
> -    vsc->inflight = g_new0(struct vhost_inflight, 1);
> -    ret = vhost_dev_get_inflight(&vsc->dev,
> -                                 vs->conf.virtqueue_size,
> -                                 vsc->inflight);
> +    ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
>     if (ret < 0) {
> -        error_report("Error get inflight: %d", -ret);
> +        error_report("Error setting inflight format: %d", -ret);
>         goto err_guest_notifiers;
>     }
> 
> +    if (!vsc->inflight->addr) {
> +        ret = vhost_dev_get_inflight(&vsc->dev,
> +                                    vs->conf.virtqueue_size,
> +                                    vsc->inflight);
> +        if (ret < 0) {
> +            error_report("Error get inflight: %d", -ret);
> +            goto err_guest_notifiers;
> +        }
> +    }
> +
>     ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
>     if (ret < 0) {
>         error_report("Error set inflight: %d", -ret);
> @@ -85,9 +91,6 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>     return ret;
> 
> err_guest_notifiers:
> -    g_free(vsc->inflight);
> -    vsc->inflight = NULL;
> -
>     k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false);
> err_host_notifiers:
>     vhost_dev_disable_notifiers(&vsc->dev, vdev);
> @@ -111,12 +114,6 @@ void vhost_scsi_common_stop(VHostSCSICommon *vsc)
>     }
>     assert(ret >= 0);
> 

In the vhost-scsi (kernel backend) path, what will cleanup vsc->inflight now?

> -    if (vsc->inflight) {
> -        vhost_dev_free_inflight(vsc->inflight);
> -        g_free(vsc->inflight);
> -        vsc->inflight = NULL;
> -    }
> -
>     vhost_dev_disable_notifiers(&vsc->dev, vdev);
> }
> 
> diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
> index ee99b19e7a..e0e88b0c42 100644
> --- a/hw/scsi/vhost-user-scsi.c
> +++ b/hw/scsi/vhost-user-scsi.c
> @@ -89,14 +89,126 @@ static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
> {
> }
> 
> +static int vhost_user_scsi_connect(DeviceState *dev, Error **errp)
> +{
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
> +    int ret = 0;
> +
> +    if (s->connected) {
> +        return 0;
> +    }
> +    s->connected = true;
> +
> +    vsc->dev.num_queues = vs->conf.num_queues;
> +    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
> +    vsc->dev.vqs = s->vhost_vqs;
> +    vsc->dev.vq_index = 0;
> +    vsc->dev.backend_features = 0;
> +
> +    ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
> +                         errp);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* restore vhost state */

Should this use virtio_device_should_start like vhost_user_blk?

> +    if (virtio_device_started(vdev, vdev->status)) {
> +        ret = vhost_scsi_common_start(vsc);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event);
> +
> +static void vhost_user_scsi_disconnect(DeviceState *dev)
> +{
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
> +

I don’t think we want to execute vhost_scsi_common_stop() if the device hasn’t been started. I remember that caused a number of races with the vhost_user_blk connecting/disconnecting on startup.

Let’s add a similar started_vu check?

> +    if (!s->connected) {
> +        return;
> +    }
> +    s->connected = false;
> +
> +    vhost_scsi_common_stop(vsc);
> +
> +    vhost_dev_cleanup(&vsc->dev);
> +
> +    /* Re-instate the event handler for new connections */
> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
> +                             vhost_user_scsi_event, NULL, dev, NULL, true);
> +}
> +
> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event)
> +{
> +    DeviceState *dev = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
> +    Error *local_err = NULL;
> +
> +    switch (event) {
> +    case CHR_EVENT_OPENED:
> +        if (vhost_user_scsi_connect(dev, &local_err) < 0) {
> +            error_report_err(local_err);
> +            qemu_chr_fe_disconnect(&vs->conf.chardev);
> +            return;
> +        }
> +        break;
> +    case CHR_EVENT_CLOSED:
> +        /* defer close until later to avoid circular close */
> +        vhost_user_async_close(dev, &vs->conf.chardev, &vsc->dev,
> +                               vhost_user_scsi_disconnect);
> +        break;
> +    case CHR_EVENT_BREAK:
> +    case CHR_EVENT_MUX_IN:
> +    case CHR_EVENT_MUX_OUT:
> +        /* Ignore */
> +        break;
> +    }
> +}
> +
> +static int vhost_user_scsi_realize_connect(VHostUserSCSI *s, Error **errp)
> +{
> +    DeviceState *dev = &s->parent_obj.parent_obj.parent_obj.parent_obj;
> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
> +    int ret;
> +
> +    s->connected = false;
> +
> +    ret = qemu_chr_fe_wait_connected(&vs->conf.chardev, errp);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = vhost_user_scsi_connect(dev, errp);
> +    if (ret < 0) {
> +        qemu_chr_fe_disconnect(&vs->conf.chardev);
> +        return ret;
> +    }
> +    assert(s->connected);
> +
> +    return 0;
> +}
> +
> static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
> {
>     VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>     VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>     VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
> -    struct vhost_virtqueue *vqs = NULL;
>     Error *err = NULL;
>     int ret;
> +    int retries = REALIZE_CONNECTION_RETRIES;
> 
>     if (!vs->conf.chardev.chr) {
>         error_setg(errp, "vhost-user-scsi: missing chardev");
> @@ -112,21 +224,31 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>     }
> 
>     if (!vhost_user_init(&s->vhost_user, &vs->conf.chardev, errp)) {

Why execute vhost_user_cleanup() if vhost_user_init() fails?

> -        goto free_virtio;
> +        goto free_vhost;
>     }
> 
> -    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
> -    vsc->dev.vqs = g_new0(struct vhost_virtqueue, vsc->dev.nvqs);
> -    vsc->dev.vq_index = 0;
> -    vsc->dev.backend_features = 0;
> -    vqs = vsc->dev.vqs;
> +    vsc->inflight = g_new0(struct vhost_inflight, 1);
> +    s->vhost_vqs = g_new0(struct vhost_virtqueue,
> +                          VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues);
> +
> +    assert(!*errp);
> +    do {
> +        if (*errp) {
> +            error_prepend(errp, "Reconnecting after error: ");
> +            error_report_err(*errp);
> +            *errp = NULL;
> +        }
> +        ret = vhost_user_scsi_realize_connect(s, errp);
> +    } while (ret < 0 && retries--);
> 
> -    ret = vhost_dev_init(&vsc->dev, &s->vhost_user,
> -                         VHOST_BACKEND_TYPE_USER, 0, errp);
>     if (ret < 0) {
> -        goto free_vhost;
> +        goto free_vqs;
>     }
> 
> +    /* we're fully initialized, now we can operate, so add the handler */
> +    qemu_chr_fe_set_handlers(&vs->conf.chardev,  NULL, NULL,
> +                             vhost_user_scsi_event, NULL, (void *)dev,
> +                             NULL, true);
>     /* Channel and lun both are 0 for bootable vhost-user-scsi disk */
>     vsc->channel = 0;
>     vsc->lun = 0;
> @@ -134,10 +256,15 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
> 
>     return;
> 
> +free_vqs:
> +    g_free(s->vhost_vqs);
> +    s->vhost_vqs = NULL;
> +    g_free(vsc->inflight);
> +    vsc->inflight = NULL;
> +
> free_vhost:
>     vhost_user_cleanup(&s->vhost_user);
> -    g_free(vqs);
> -free_virtio:
> +
>     virtio_scsi_common_unrealize(dev);
> }
> 
> @@ -146,16 +273,22 @@ static void vhost_user_scsi_unrealize(DeviceState *dev)
>     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>     VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>     VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
> -    struct vhost_virtqueue *vqs = vsc->dev.vqs;
> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
> 
>     /* This will stop the vhost backend. */
>     vhost_user_scsi_set_status(vdev, 0);
> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL, NULL, NULL, NULL,
> +                             NULL, false);
> 
>     vhost_dev_cleanup(&vsc->dev);
> -    g_free(vqs);

Nit: Why not put vhost_dev_free_inflight next to the remaining inflight cleanup?

> +    vhost_dev_free_inflight(vsc->inflight);
> +    g_free(s->vhost_vqs);
> +    s->vhost_vqs = NULL;
> +    g_free(vsc->inflight);
> +    vsc->inflight = NULL;
> 

Curiosity - why reorder here? Is something in vhost_user_cleanup() dependent on state freed in virtio_scsi_common_unrealize()?

If so, should that go as a standalone fix?

> -    virtio_scsi_common_unrealize(dev);
>     vhost_user_cleanup(&s->vhost_user);
> +    virtio_scsi_common_unrealize(dev);
> }
> 
> static Property vhost_user_scsi_properties[] = {
> diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h
> index 521b08e559..c66acc68b7 100644
> --- a/include/hw/virtio/vhost-user-scsi.h
> +++ b/include/hw/virtio/vhost-user-scsi.h
> @@ -29,6 +29,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VHostUserSCSI, VHOST_USER_SCSI)
> struct VHostUserSCSI {
>     VHostSCSICommon parent_obj;
>     VhostUserState vhost_user;

See above - we should probably have started_vu here/

Maybe we should have some shared struct with vhost_user_blk for connectivity params?

> +    bool connected;
> +
> +    struct vhost_virtqueue *vhost_vqs;
> };
> 
> #endif /* VHOST_USER_SCSI_H */
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index 6a173cb9fa..b904346fe1 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -8,6 +8,8 @@
> #define VHOST_F_DEVICE_IOTLB 63
> #define VHOST_USER_F_PROTOCOL_FEATURES 30
> 

Should the macro name indicate that this is for vhost-user?

VU_REALIZE_CONN_RETRIES? 

> +#define REALIZE_CONNECTION_RETRIES 3
> +
> /* Generic structures common for any vhost based device. */
> 
> struct vhost_inflight {
> -- 
> 2.41.0
>
Michael S. Tsirkin July 24, 2023, 8:30 p.m. UTC | #2
On Mon, Jul 24, 2023 at 05:21:37PM +0000, Raphael Norwitz wrote:
> Very excited to see this. High level looks good modulo a few small things.
> 
> My major concern is around existing vhost-user-scsi backends which don’t support VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD. IMO we should hide the reconnect behavior behind a VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD check. We may want to do the same for vhost-user-blk.
> 
> The question is then what happens if the check is false. IIUC without an inflight FD, if a device processes requests out of order, it’s not safe to continue execution on reconnect, as there’s no way for the backend to know how to replay IO. Should we permanently wedge the device or have QEMU fail out? May be nice to have a toggle for this.

No, device itself can store the state somewhere. And if it wants to,
it can check VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD and fail reconnect.
Li Feng July 25, 2023, 10:19 a.m. UTC | #3
Thanks for your comments.

> 2023年7月25日 上午1:21,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
> 
> Very excited to see this. High level looks good modulo a few small things.
> 
> My major concern is around existing vhost-user-scsi backends which don’t support VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD. IMO we should hide the reconnect behavior behind a VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD check. We may want to do the same for vhost-user-blk.
> 
> The question is then what happens if the check is false. IIUC without an inflight FD, if a device processes requests out of order, it’s not safe to continue execution on reconnect, as there’s no way for the backend to know how to replay IO. Should we permanently wedge the device or have QEMU fail out? May be nice to have a toggle for this.

Based on what MST said, is there anything else I need to do?
> 
>> On Jul 21, 2023, at 6:51 AM, Li Feng <fengli@smartx.com> wrote:
>> 
>> If the backend crashes and restarts, the device is broken.
>> This patch adds reconnect for vhost-user-scsi.
>> 
>> Tested with spdk backend.
>> 
>> Signed-off-by: Li Feng <fengli@smartx.com>
>> ---
>> hw/block/vhost-user-blk.c           |   2 -
>> hw/scsi/vhost-scsi-common.c         |  27 ++---
>> hw/scsi/vhost-user-scsi.c           | 163 +++++++++++++++++++++++++---
>> include/hw/virtio/vhost-user-scsi.h |   3 +
>> include/hw/virtio/vhost.h           |   2 +
>> 5 files changed, 165 insertions(+), 32 deletions(-)
>> 
>> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
>> index eecf3f7a81..f250c740b5 100644
>> --- a/hw/block/vhost-user-blk.c
>> +++ b/hw/block/vhost-user-blk.c
>> @@ -32,8 +32,6 @@
>> #include "sysemu/sysemu.h"
>> #include "sysemu/runstate.h"
>> 
>> -#define REALIZE_CONNECTION_RETRIES 3
>> -
>> static const int user_feature_bits[] = {
>>    VIRTIO_BLK_F_SIZE_MAX,
>>    VIRTIO_BLK_F_SEG_MAX,
>> diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c
> 
> Why can’t all the vhost-scsi-common stuff be moved to a separate change?

I will move this code to separate patch.
> 
> Especially the stuff introduced for vhost-user-blk in 1b0063b3048af65dfaae6422a572c87db8575a92 should be moved out.
OK.

> 
>> index a06f01af26..08801886b8 100644
>> --- a/hw/scsi/vhost-scsi-common.c
>> +++ b/hw/scsi/vhost-scsi-common.c
>> @@ -52,16 +52,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>> 
>>    vsc->dev.acked_features = vdev->guest_features;
>> 
>> -    assert(vsc->inflight == NULL);
>> -    vsc->inflight = g_new0(struct vhost_inflight, 1);
>> -    ret = vhost_dev_get_inflight(&vsc->dev,
>> -                                 vs->conf.virtqueue_size,
>> -                                 vsc->inflight);
>> +    ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
>>    if (ret < 0) {
>> -        error_report("Error get inflight: %d", -ret);
>> +        error_report("Error setting inflight format: %d", -ret);
>>        goto err_guest_notifiers;
>>    }
>> 
>> +    if (!vsc->inflight->addr) {
>> +        ret = vhost_dev_get_inflight(&vsc->dev,
>> +                                    vs->conf.virtqueue_size,
>> +                                    vsc->inflight);
>> +        if (ret < 0) {
>> +            error_report("Error get inflight: %d", -ret);
>> +            goto err_guest_notifiers;
>> +        }
>> +    }
>> +
>>    ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
>>    if (ret < 0) {
>>        error_report("Error set inflight: %d", -ret);
>> @@ -85,9 +91,6 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>    return ret;
>> 
>> err_guest_notifiers:
>> -    g_free(vsc->inflight);
>> -    vsc->inflight = NULL;
>> -
>>    k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false);
>> err_host_notifiers:
>>    vhost_dev_disable_notifiers(&vsc->dev, vdev);
>> @@ -111,12 +114,6 @@ void vhost_scsi_common_stop(VHostSCSICommon *vsc)
>>    }
>>    assert(ret >= 0);
>> 
> 
> In the vhost-scsi (kernel backend) path, what will cleanup vsc->inflight now?
OK, we should check the vsc->inflight if it is null, the vhost-scsi doesn’t allocate the
inflight object memory.

> 
>> -    if (vsc->inflight) {
>> -        vhost_dev_free_inflight(vsc->inflight);
>> -        g_free(vsc->inflight);
>> -        vsc->inflight = NULL;
>> -    }
>> -
>>    vhost_dev_disable_notifiers(&vsc->dev, vdev);
>> }
>> 
>> diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
>> index ee99b19e7a..e0e88b0c42 100644
>> --- a/hw/scsi/vhost-user-scsi.c
>> +++ b/hw/scsi/vhost-user-scsi.c
>> @@ -89,14 +89,126 @@ static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>> {
>> }
>> 
>> +static int vhost_user_scsi_connect(DeviceState *dev, Error **errp)
>> +{
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>> +    int ret = 0;
>> +
>> +    if (s->connected) {
>> +        return 0;
>> +    }
>> +    s->connected = true;
>> +
>> +    vsc->dev.num_queues = vs->conf.num_queues;
>> +    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>> +    vsc->dev.vqs = s->vhost_vqs;
>> +    vsc->dev.vq_index = 0;
>> +    vsc->dev.backend_features = 0;
>> +
>> +    ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
>> +                         errp);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    /* restore vhost state */
> 
> Should this use virtio_device_should_start like vhost_user_blk?
I will change this.
> 
>> +    if (virtio_device_started(vdev, vdev->status)) {
>> +        ret = vhost_scsi_common_start(vsc);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event);
>> +
>> +static void vhost_user_scsi_disconnect(DeviceState *dev)
>> +{
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>> +
> 
> I don’t think we want to execute vhost_scsi_common_stop() if the device hasn’t been started. I remember that caused a number of races with the vhost_user_blk connecting/disconnecting on startup.
> 
> Let’s add a similar started_vu check?
I will add it.
> 
>> +    if (!s->connected) {
>> +        return;
>> +    }
>> +    s->connected = false;
>> +
>> +    vhost_scsi_common_stop(vsc);
>> +
>> +    vhost_dev_cleanup(&vsc->dev);
>> +
>> +    /* Re-instate the event handler for new connections */
>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
>> +                             vhost_user_scsi_event, NULL, dev, NULL, true);
>> +}
>> +
>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event)
>> +{
>> +    DeviceState *dev = opaque;
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>> +    Error *local_err = NULL;
>> +
>> +    switch (event) {
>> +    case CHR_EVENT_OPENED:
>> +        if (vhost_user_scsi_connect(dev, &local_err) < 0) {
>> +            error_report_err(local_err);
>> +            qemu_chr_fe_disconnect(&vs->conf.chardev);
>> +            return;
>> +        }
>> +        break;
>> +    case CHR_EVENT_CLOSED:
>> +        /* defer close until later to avoid circular close */
>> +        vhost_user_async_close(dev, &vs->conf.chardev, &vsc->dev,
>> +                               vhost_user_scsi_disconnect);
>> +        break;
>> +    case CHR_EVENT_BREAK:
>> +    case CHR_EVENT_MUX_IN:
>> +    case CHR_EVENT_MUX_OUT:
>> +        /* Ignore */
>> +        break;
>> +    }
>> +}
>> +
>> +static int vhost_user_scsi_realize_connect(VHostUserSCSI *s, Error **errp)
>> +{
>> +    DeviceState *dev = &s->parent_obj.parent_obj.parent_obj.parent_obj;
>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>> +    int ret;
>> +
>> +    s->connected = false;
>> +
>> +    ret = qemu_chr_fe_wait_connected(&vs->conf.chardev, errp);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = vhost_user_scsi_connect(dev, errp);
>> +    if (ret < 0) {
>> +        qemu_chr_fe_disconnect(&vs->conf.chardev);
>> +        return ret;
>> +    }
>> +    assert(s->connected);
>> +
>> +    return 0;
>> +}
>> +
>> static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>> {
>>    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>    VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>> -    struct vhost_virtqueue *vqs = NULL;
>>    Error *err = NULL;
>>    int ret;
>> +    int retries = REALIZE_CONNECTION_RETRIES;
>> 
>>    if (!vs->conf.chardev.chr) {
>>        error_setg(errp, "vhost-user-scsi: missing chardev");
>> @@ -112,21 +224,31 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>    }
>> 
>>    if (!vhost_user_init(&s->vhost_user, &vs->conf.chardev, errp)) {
> 
> Why execute vhost_user_cleanup() if vhost_user_init() fails?
OK, move this line up in v2.

> 
>> -        goto free_virtio;
>> +        goto free_vhost;
>>    }
>> 
>> -    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>> -    vsc->dev.vqs = g_new0(struct vhost_virtqueue, vsc->dev.nvqs);
>> -    vsc->dev.vq_index = 0;
>> -    vsc->dev.backend_features = 0;
>> -    vqs = vsc->dev.vqs;
>> +    vsc->inflight = g_new0(struct vhost_inflight, 1);
>> +    s->vhost_vqs = g_new0(struct vhost_virtqueue,
>> +                          VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues);
>> +
>> +    assert(!*errp);
>> +    do {
>> +        if (*errp) {
>> +            error_prepend(errp, "Reconnecting after error: ");
>> +            error_report_err(*errp);
>> +            *errp = NULL;
>> +        }
>> +        ret = vhost_user_scsi_realize_connect(s, errp);
>> +    } while (ret < 0 && retries--);
>> 
>> -    ret = vhost_dev_init(&vsc->dev, &s->vhost_user,
>> -                         VHOST_BACKEND_TYPE_USER, 0, errp);
>>    if (ret < 0) {
>> -        goto free_vhost;
>> +        goto free_vqs;
>>    }
>> 
>> +    /* we're fully initialized, now we can operate, so add the handler */
>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev,  NULL, NULL,
>> +                             vhost_user_scsi_event, NULL, (void *)dev,
>> +                             NULL, true);
>>    /* Channel and lun both are 0 for bootable vhost-user-scsi disk */
>>    vsc->channel = 0;
>>    vsc->lun = 0;
>> @@ -134,10 +256,15 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>> 
>>    return;
>> 
>> +free_vqs:
>> +    g_free(s->vhost_vqs);
>> +    s->vhost_vqs = NULL;
>> +    g_free(vsc->inflight);
>> +    vsc->inflight = NULL;
>> +
>> free_vhost:
>>    vhost_user_cleanup(&s->vhost_user);
>> -    g_free(vqs);
>> -free_virtio:
>> +
>>    virtio_scsi_common_unrealize(dev);
>> }
>> 
>> @@ -146,16 +273,22 @@ static void vhost_user_scsi_unrealize(DeviceState *dev)
>>    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>    VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>> -    struct vhost_virtqueue *vqs = vsc->dev.vqs;
>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>> 
>>    /* This will stop the vhost backend. */
>>    vhost_user_scsi_set_status(vdev, 0);
>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL, NULL, NULL, NULL,
>> +                             NULL, false);
>> 
>>    vhost_dev_cleanup(&vsc->dev);
>> -    g_free(vqs);
> 
> Nit: Why not put vhost_dev_free_inflight next to the remaining inflight cleanup?
OK.
> 
>> +    vhost_dev_free_inflight(vsc->inflight);
>> +    g_free(s->vhost_vqs);
>> +    s->vhost_vqs = NULL;
>> +    g_free(vsc->inflight);
>> +    vsc->inflight = NULL;
>> 
> 
> Curiosity - why reorder here? Is something in vhost_user_cleanup() dependent on state freed in virtio_scsi_common_unrealize()?
> 
> If so, should that go as a standalone fix?

Because in vhost_user_scsi_realize, we initialize in order:
virtio_scsi_common_realize
vhost_user_init

And in the error handler of vhost_user_scsi_realize, the uninitialize in order:
vhost_user_cleanup
virtio_scsi_common_unrealize

I think in vhost_user_scsi_unrealize we should keep it the same order, right?

> 
>> -    virtio_scsi_common_unrealize(dev);
>>    vhost_user_cleanup(&s->vhost_user);
>> +    virtio_scsi_common_unrealize(dev);
>> }
>> 
>> static Property vhost_user_scsi_properties[] = {
>> diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h
>> index 521b08e559..c66acc68b7 100644
>> --- a/include/hw/virtio/vhost-user-scsi.h
>> +++ b/include/hw/virtio/vhost-user-scsi.h
>> @@ -29,6 +29,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VHostUserSCSI, VHOST_USER_SCSI)
>> struct VHostUserSCSI {
>>    VHostSCSICommon parent_obj;
>>    VhostUserState vhost_user;
> 
> See above - we should probably have started_vu here/
I will add it.
> 
> Maybe we should have some shared struct with vhost_user_blk for connectivity params?

In the future vhost-user-blk/scsi can be refactored to share the same code.
> 
>> +    bool connected;
>> +
>> +    struct vhost_virtqueue *vhost_vqs;
>> };
>> 
>> #endif /* VHOST_USER_SCSI_H */
>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>> index 6a173cb9fa..b904346fe1 100644
>> --- a/include/hw/virtio/vhost.h
>> +++ b/include/hw/virtio/vhost.h
>> @@ -8,6 +8,8 @@
>> #define VHOST_F_DEVICE_IOTLB 63
>> #define VHOST_USER_F_PROTOCOL_FEATURES 30
>> 
> 
> Should the macro name indicate that this is for vhost-user?
> 
> VU_REALIZE_CONN_RETRIES? 
I will rename it in v2.

> 
>> +#define REALIZE_CONNECTION_RETRIES 3
>> +
>> /* Generic structures common for any vhost based device. */
>> 
>> struct vhost_inflight {
>> -- 
>> 2.41.0
Raphael Norwitz July 27, 2023, 9:21 p.m. UTC | #4
> On Jul 25, 2023, at 6:19 AM, Li Feng <fengli@smartx.com> wrote:
> 
> Thanks for your comments.
> 
>> 2023年7月25日 上午1:21,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
>> 
>> Very excited to see this. High level looks good modulo a few small things.
>> 
>> My major concern is around existing vhost-user-scsi backends which don’t support VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD. IMO we should hide the reconnect behavior behind a VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD check. We may want to do the same for vhost-user-blk.
>> 
>> The question is then what happens if the check is false. IIUC without an inflight FD, if a device processes requests out of order, it’s not safe to continue execution on reconnect, as there’s no way for the backend to know how to replay IO. Should we permanently wedge the device or have QEMU fail out? May be nice to have a toggle for this.
> 
> Based on what MST said, is there anything else I need to do?

I don’t think so.

>> 
>>> On Jul 21, 2023, at 6:51 AM, Li Feng <fengli@smartx.com> wrote:
>>> 
>>> If the backend crashes and restarts, the device is broken.
>>> This patch adds reconnect for vhost-user-scsi.
>>> 
>>> Tested with spdk backend.
>>> 
>>> Signed-off-by: Li Feng <fengli@smartx.com>
>>> ---
>>> hw/block/vhost-user-blk.c           |   2 -
>>> hw/scsi/vhost-scsi-common.c         |  27 ++---
>>> hw/scsi/vhost-user-scsi.c           | 163 +++++++++++++++++++++++++---
>>> include/hw/virtio/vhost-user-scsi.h |   3 +
>>> include/hw/virtio/vhost.h           |   2 +
>>> 5 files changed, 165 insertions(+), 32 deletions(-)
>>> 
>>> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
>>> index eecf3f7a81..f250c740b5 100644
>>> --- a/hw/block/vhost-user-blk.c
>>> +++ b/hw/block/vhost-user-blk.c
>>> @@ -32,8 +32,6 @@
>>> #include "sysemu/sysemu.h"
>>> #include "sysemu/runstate.h"
>>> 
>>> -#define REALIZE_CONNECTION_RETRIES 3
>>> -
>>> static const int user_feature_bits[] = {
>>>    VIRTIO_BLK_F_SIZE_MAX,
>>>    VIRTIO_BLK_F_SEG_MAX,
>>> diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c
>> 
>> Why can’t all the vhost-scsi-common stuff be moved to a separate change?
> 
> I will move this code to separate patch.
>> 
>> Especially the stuff introduced for vhost-user-blk in 1b0063b3048af65dfaae6422a572c87db8575a92 should be moved out.
> OK.
> 
>> 
>>> index a06f01af26..08801886b8 100644
>>> --- a/hw/scsi/vhost-scsi-common.c
>>> +++ b/hw/scsi/vhost-scsi-common.c
>>> @@ -52,16 +52,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>> 
>>>    vsc->dev.acked_features = vdev->guest_features;
>>> 
>>> -    assert(vsc->inflight == NULL);
>>> -    vsc->inflight = g_new0(struct vhost_inflight, 1);
>>> -    ret = vhost_dev_get_inflight(&vsc->dev,
>>> -                                 vs->conf.virtqueue_size,
>>> -                                 vsc->inflight);
>>> +    ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
>>>    if (ret < 0) {
>>> -        error_report("Error get inflight: %d", -ret);
>>> +        error_report("Error setting inflight format: %d", -ret);
>>>        goto err_guest_notifiers;
>>>    }
>>> 
>>> +    if (!vsc->inflight->addr) {
>>> +        ret = vhost_dev_get_inflight(&vsc->dev,
>>> +                                    vs->conf.virtqueue_size,
>>> +                                    vsc->inflight);
>>> +        if (ret < 0) {
>>> +            error_report("Error get inflight: %d", -ret);
>>> +            goto err_guest_notifiers;
>>> +        }
>>> +    }
>>> +
>>>    ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
>>>    if (ret < 0) {
>>>        error_report("Error set inflight: %d", -ret);
>>> @@ -85,9 +91,6 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>>    return ret;
>>> 
>>> err_guest_notifiers:
>>> -    g_free(vsc->inflight);
>>> -    vsc->inflight = NULL;
>>> -
>>>    k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false);
>>> err_host_notifiers:
>>>    vhost_dev_disable_notifiers(&vsc->dev, vdev);
>>> @@ -111,12 +114,6 @@ void vhost_scsi_common_stop(VHostSCSICommon *vsc)
>>>    }
>>>    assert(ret >= 0);
>>> 
>> 
>> In the vhost-scsi (kernel backend) path, what will cleanup vsc->inflight now?
> OK, we should check the vsc->inflight if it is null, the vhost-scsi doesn’t allocate the
> inflight object memory.

Are you saying vhost-scsi never allocates inflight so we don’t need to check for it?

> 
>> 
>>> -    if (vsc->inflight) {
>>> -        vhost_dev_free_inflight(vsc->inflight);
>>> -        g_free(vsc->inflight);
>>> -        vsc->inflight = NULL;
>>> -    }
>>> -
>>>    vhost_dev_disable_notifiers(&vsc->dev, vdev);
>>> }
>>> 
>>> diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
>>> index ee99b19e7a..e0e88b0c42 100644
>>> --- a/hw/scsi/vhost-user-scsi.c
>>> +++ b/hw/scsi/vhost-user-scsi.c
>>> @@ -89,14 +89,126 @@ static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>>> {
>>> }
>>> 
>>> +static int vhost_user_scsi_connect(DeviceState *dev, Error **errp)
>>> +{
>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>> +    int ret = 0;
>>> +
>>> +    if (s->connected) {
>>> +        return 0;
>>> +    }
>>> +    s->connected = true;
>>> +
>>> +    vsc->dev.num_queues = vs->conf.num_queues;
>>> +    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>>> +    vsc->dev.vqs = s->vhost_vqs;
>>> +    vsc->dev.vq_index = 0;
>>> +    vsc->dev.backend_features = 0;
>>> +
>>> +    ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
>>> +                         errp);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    /* restore vhost state */
>> 
>> Should this use virtio_device_should_start like vhost_user_blk?
> I will change this.
>> 
>>> +    if (virtio_device_started(vdev, vdev->status)) {
>>> +        ret = vhost_scsi_common_start(vsc);
>>> +        if (ret < 0) {
>>> +            return ret;
>>> +        }
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event);
>>> +
>>> +static void vhost_user_scsi_disconnect(DeviceState *dev)
>>> +{
>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>> +
>> 
>> I don’t think we want to execute vhost_scsi_common_stop() if the device hasn’t been started. I remember that caused a number of races with the vhost_user_blk connecting/disconnecting on startup.
>> 
>> Let’s add a similar started_vu check?
> I will add it.
>> 
>>> +    if (!s->connected) {
>>> +        return;
>>> +    }
>>> +    s->connected = false;
>>> +
>>> +    vhost_scsi_common_stop(vsc);
>>> +
>>> +    vhost_dev_cleanup(&vsc->dev);
>>> +
>>> +    /* Re-instate the event handler for new connections */
>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
>>> +                             vhost_user_scsi_event, NULL, dev, NULL, true);
>>> +}
>>> +
>>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event)
>>> +{
>>> +    DeviceState *dev = opaque;
>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>> +    Error *local_err = NULL;
>>> +
>>> +    switch (event) {
>>> +    case CHR_EVENT_OPENED:
>>> +        if (vhost_user_scsi_connect(dev, &local_err) < 0) {
>>> +            error_report_err(local_err);
>>> +            qemu_chr_fe_disconnect(&vs->conf.chardev);
>>> +            return;
>>> +        }
>>> +        break;
>>> +    case CHR_EVENT_CLOSED:
>>> +        /* defer close until later to avoid circular close */
>>> +        vhost_user_async_close(dev, &vs->conf.chardev, &vsc->dev,
>>> +                               vhost_user_scsi_disconnect);
>>> +        break;
>>> +    case CHR_EVENT_BREAK:
>>> +    case CHR_EVENT_MUX_IN:
>>> +    case CHR_EVENT_MUX_OUT:
>>> +        /* Ignore */
>>> +        break;
>>> +    }
>>> +}
>>> +
>>> +static int vhost_user_scsi_realize_connect(VHostUserSCSI *s, Error **errp)
>>> +{
>>> +    DeviceState *dev = &s->parent_obj.parent_obj.parent_obj.parent_obj;
>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>> +    int ret;
>>> +
>>> +    s->connected = false;
>>> +
>>> +    ret = qemu_chr_fe_wait_connected(&vs->conf.chardev, errp);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    ret = vhost_user_scsi_connect(dev, errp);
>>> +    if (ret < 0) {
>>> +        qemu_chr_fe_disconnect(&vs->conf.chardev);
>>> +        return ret;
>>> +    }
>>> +    assert(s->connected);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>> {
>>>    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>    VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>>    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>> -    struct vhost_virtqueue *vqs = NULL;
>>>    Error *err = NULL;
>>>    int ret;
>>> +    int retries = REALIZE_CONNECTION_RETRIES;
>>> 
>>>    if (!vs->conf.chardev.chr) {
>>>        error_setg(errp, "vhost-user-scsi: missing chardev");
>>> @@ -112,21 +224,31 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>    }
>>> 
>>>    if (!vhost_user_init(&s->vhost_user, &vs->conf.chardev, errp)) {
>> 
>> Why execute vhost_user_cleanup() if vhost_user_init() fails?
> OK, move this line up in v2.
> 
>> 
>>> -        goto free_virtio;
>>> +        goto free_vhost;
>>>    }
>>> 
>>> -    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>>> -    vsc->dev.vqs = g_new0(struct vhost_virtqueue, vsc->dev.nvqs);
>>> -    vsc->dev.vq_index = 0;
>>> -    vsc->dev.backend_features = 0;
>>> -    vqs = vsc->dev.vqs;
>>> +    vsc->inflight = g_new0(struct vhost_inflight, 1);
>>> +    s->vhost_vqs = g_new0(struct vhost_virtqueue,
>>> +                          VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues);
>>> +
>>> +    assert(!*errp);
>>> +    do {
>>> +        if (*errp) {
>>> +            error_prepend(errp, "Reconnecting after error: ");
>>> +            error_report_err(*errp);
>>> +            *errp = NULL;
>>> +        }
>>> +        ret = vhost_user_scsi_realize_connect(s, errp);
>>> +    } while (ret < 0 && retries--);
>>> 
>>> -    ret = vhost_dev_init(&vsc->dev, &s->vhost_user,
>>> -                         VHOST_BACKEND_TYPE_USER, 0, errp);
>>>    if (ret < 0) {
>>> -        goto free_vhost;
>>> +        goto free_vqs;
>>>    }
>>> 
>>> +    /* we're fully initialized, now we can operate, so add the handler */
>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev,  NULL, NULL,
>>> +                             vhost_user_scsi_event, NULL, (void *)dev,
>>> +                             NULL, true);
>>>    /* Channel and lun both are 0 for bootable vhost-user-scsi disk */
>>>    vsc->channel = 0;
>>>    vsc->lun = 0;
>>> @@ -134,10 +256,15 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>> 
>>>    return;
>>> 
>>> +free_vqs:
>>> +    g_free(s->vhost_vqs);
>>> +    s->vhost_vqs = NULL;
>>> +    g_free(vsc->inflight);
>>> +    vsc->inflight = NULL;
>>> +
>>> free_vhost:
>>>    vhost_user_cleanup(&s->vhost_user);
>>> -    g_free(vqs);
>>> -free_virtio:
>>> +
>>>    virtio_scsi_common_unrealize(dev);
>>> }
>>> 
>>> @@ -146,16 +273,22 @@ static void vhost_user_scsi_unrealize(DeviceState *dev)
>>>    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>    VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>>    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>> -    struct vhost_virtqueue *vqs = vsc->dev.vqs;
>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>> 
>>>    /* This will stop the vhost backend. */
>>>    vhost_user_scsi_set_status(vdev, 0);
>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL, NULL, NULL, NULL,
>>> +                             NULL, false);
>>> 
>>>    vhost_dev_cleanup(&vsc->dev);
>>> -    g_free(vqs);
>> 
>> Nit: Why not put vhost_dev_free_inflight next to the remaining inflight cleanup?
> OK.
>> 
>>> +    vhost_dev_free_inflight(vsc->inflight);
>>> +    g_free(s->vhost_vqs);
>>> +    s->vhost_vqs = NULL;
>>> +    g_free(vsc->inflight);
>>> +    vsc->inflight = NULL;
>>> 
>> 
>> Curiosity - why reorder here? Is something in vhost_user_cleanup() dependent on state freed in virtio_scsi_common_unrealize()?
>> 
>> If so, should that go as a standalone fix?
> 
> Because in vhost_user_scsi_realize, we initialize in order:
> virtio_scsi_common_realize
> vhost_user_init
> 
> And in the error handler of vhost_user_scsi_realize, the uninitialize in order:
> vhost_user_cleanup
> virtio_scsi_common_unrealize
> 
> I think in vhost_user_scsi_unrealize we should keep it the same order, right?

I’m not saying it’s wrong. If there’s no dependency (i.e. this is not fixing a bug, just a stylistic improvement) it can stay in the same change.

> 
>> 
>>> -    virtio_scsi_common_unrealize(dev);
>>>    vhost_user_cleanup(&s->vhost_user);
>>> +    virtio_scsi_common_unrealize(dev);
>>> }
>>> 
>>> static Property vhost_user_scsi_properties[] = {
>>> diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h
>>> index 521b08e559..c66acc68b7 100644
>>> --- a/include/hw/virtio/vhost-user-scsi.h
>>> +++ b/include/hw/virtio/vhost-user-scsi.h
>>> @@ -29,6 +29,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VHostUserSCSI, VHOST_USER_SCSI)
>>> struct VHostUserSCSI {
>>>    VHostSCSICommon parent_obj;
>>>    VhostUserState vhost_user;
>> 
>> See above - we should probably have started_vu here/
> I will add it.
>> 
>> Maybe we should have some shared struct with vhost_user_blk for connectivity params?
> 
> In the future vhost-user-blk/scsi can be refactored to share the same code.

Sure - this can be done at some point in the future.

>> 
>>> +    bool connected;
>>> +
>>> +    struct vhost_virtqueue *vhost_vqs;
>>> };
>>> 
>>> #endif /* VHOST_USER_SCSI_H */
>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>> index 6a173cb9fa..b904346fe1 100644
>>> --- a/include/hw/virtio/vhost.h
>>> +++ b/include/hw/virtio/vhost.h
>>> @@ -8,6 +8,8 @@
>>> #define VHOST_F_DEVICE_IOTLB 63
>>> #define VHOST_USER_F_PROTOCOL_FEATURES 30
>>> 
>> 
>> Should the macro name indicate that this is for vhost-user?
>> 
>> VU_REALIZE_CONN_RETRIES? 
> I will rename it in v2.
> 
>> 
>>> +#define REALIZE_CONNECTION_RETRIES 3
>>> +
>>> /* Generic structures common for any vhost based device. */
>>> 
>>> struct vhost_inflight {
>>> -- 
>>> 2.41.0
Li Feng July 28, 2023, 7:48 a.m. UTC | #5
Thanks for your reply.

> 2023年7月28日 上午5:21,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
> 
> 
> 
>> On Jul 25, 2023, at 6:19 AM, Li Feng <fengli@smartx.com> wrote:
>> 
>> Thanks for your comments.
>> 
>>> 2023年7月25日 上午1:21,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
>>> 
>>> Very excited to see this. High level looks good modulo a few small things.
>>> 
>>> My major concern is around existing vhost-user-scsi backends which don’t support VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD. IMO we should hide the reconnect behavior behind a VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD check. We may want to do the same for vhost-user-blk.
>>> 
>>> The question is then what happens if the check is false. IIUC without an inflight FD, if a device processes requests out of order, it’s not safe to continue execution on reconnect, as there’s no way for the backend to know how to replay IO. Should we permanently wedge the device or have QEMU fail out? May be nice to have a toggle for this.
>> 
>> Based on what MST said, is there anything else I need to do?
> 
> I don’t think so.
> 
>>> 
>>>> On Jul 21, 2023, at 6:51 AM, Li Feng <fengli@smartx.com> wrote:
>>>> 
>>>> If the backend crashes and restarts, the device is broken.
>>>> This patch adds reconnect for vhost-user-scsi.
>>>> 
>>>> Tested with spdk backend.
>>>> 
>>>> Signed-off-by: Li Feng <fengli@smartx.com>
>>>> ---
>>>> hw/block/vhost-user-blk.c           |   2 -
>>>> hw/scsi/vhost-scsi-common.c         |  27 ++---
>>>> hw/scsi/vhost-user-scsi.c           | 163 +++++++++++++++++++++++++---
>>>> include/hw/virtio/vhost-user-scsi.h |   3 +
>>>> include/hw/virtio/vhost.h           |   2 +
>>>> 5 files changed, 165 insertions(+), 32 deletions(-)
>>>> 
>>>> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
>>>> index eecf3f7a81..f250c740b5 100644
>>>> --- a/hw/block/vhost-user-blk.c
>>>> +++ b/hw/block/vhost-user-blk.c
>>>> @@ -32,8 +32,6 @@
>>>> #include "sysemu/sysemu.h"
>>>> #include "sysemu/runstate.h"
>>>> 
>>>> -#define REALIZE_CONNECTION_RETRIES 3
>>>> -
>>>> static const int user_feature_bits[] = {
>>>>   VIRTIO_BLK_F_SIZE_MAX,
>>>>   VIRTIO_BLK_F_SEG_MAX,
>>>> diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c
>>> 
>>> Why can’t all the vhost-scsi-common stuff be moved to a separate change?
>> 
>> I will move this code to separate patch.
>>> 
>>> Especially the stuff introduced for vhost-user-blk in 1b0063b3048af65dfaae6422a572c87db8575a92 should be moved out.
>> OK.
>> 
>>> 
>>>> index a06f01af26..08801886b8 100644
>>>> --- a/hw/scsi/vhost-scsi-common.c
>>>> +++ b/hw/scsi/vhost-scsi-common.c
>>>> @@ -52,16 +52,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>>> 
>>>>   vsc->dev.acked_features = vdev->guest_features;
>>>> 
>>>> -    assert(vsc->inflight == NULL);
>>>> -    vsc->inflight = g_new0(struct vhost_inflight, 1);
>>>> -    ret = vhost_dev_get_inflight(&vsc->dev,
>>>> -                                 vs->conf.virtqueue_size,
>>>> -                                 vsc->inflight);
>>>> +    ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
>>>>   if (ret < 0) {
>>>> -        error_report("Error get inflight: %d", -ret);
>>>> +        error_report("Error setting inflight format: %d", -ret);
>>>>       goto err_guest_notifiers;
>>>>   }
>>>> 
>>>> +    if (!vsc->inflight->addr) {
>>>> +        ret = vhost_dev_get_inflight(&vsc->dev,
>>>> +                                    vs->conf.virtqueue_size,
>>>> +                                    vsc->inflight);
>>>> +        if (ret < 0) {
>>>> +            error_report("Error get inflight: %d", -ret);
>>>> +            goto err_guest_notifiers;
>>>> +        }
>>>> +    }
>>>> +
>>>>   ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
>>>>   if (ret < 0) {
>>>>       error_report("Error set inflight: %d", -ret);
>>>> @@ -85,9 +91,6 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>>>   return ret;
>>>> 
>>>> err_guest_notifiers:
>>>> -    g_free(vsc->inflight);
>>>> -    vsc->inflight = NULL;
>>>> -
>>>>   k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false);
>>>> err_host_notifiers:
>>>>   vhost_dev_disable_notifiers(&vsc->dev, vdev);
>>>> @@ -111,12 +114,6 @@ void vhost_scsi_common_stop(VHostSCSICommon *vsc)
>>>>   }
>>>>   assert(ret >= 0);
>>>> 
>>> 
>>> In the vhost-scsi (kernel backend) path, what will cleanup vsc->inflight now?
>> OK, we should check the vsc->inflight if it is null, the vhost-scsi doesn’t allocate the
>> inflight object memory.
> 
> Are you saying vhost-scsi never allocates inflight so we don’t need to check for it?
We have checked the vsc->inflight, and only if allocated, we send the get/set_inflight_fd.
This works with vhost-user-scsi/vhost-scsi both.
> 
>> 
>>> 
>>>> -    if (vsc->inflight) {
>>>> -        vhost_dev_free_inflight(vsc->inflight);
>>>> -        g_free(vsc->inflight);
>>>> -        vsc->inflight = NULL;
>>>> -    }
>>>> -
>>>>   vhost_dev_disable_notifiers(&vsc->dev, vdev);
>>>> }
>>>> 
>>>> diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
>>>> index ee99b19e7a..e0e88b0c42 100644
>>>> --- a/hw/scsi/vhost-user-scsi.c
>>>> +++ b/hw/scsi/vhost-user-scsi.c
>>>> @@ -89,14 +89,126 @@ static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>>>> {
>>>> }
>>>> 
>>>> +static int vhost_user_scsi_connect(DeviceState *dev, Error **errp)
>>>> +{
>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>> +    int ret = 0;
>>>> +
>>>> +    if (s->connected) {
>>>> +        return 0;
>>>> +    }
>>>> +    s->connected = true;
>>>> +
>>>> +    vsc->dev.num_queues = vs->conf.num_queues;
>>>> +    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>>>> +    vsc->dev.vqs = s->vhost_vqs;
>>>> +    vsc->dev.vq_index = 0;
>>>> +    vsc->dev.backend_features = 0;
>>>> +
>>>> +    ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
>>>> +                         errp);
>>>> +    if (ret < 0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    /* restore vhost state */
>>> 
>>> Should this use virtio_device_should_start like vhost_user_blk?
>> I will change this.
>>> 
>>>> +    if (virtio_device_started(vdev, vdev->status)) {
>>>> +        ret = vhost_scsi_common_start(vsc);
>>>> +        if (ret < 0) {
>>>> +            return ret;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event);
>>>> +
>>>> +static void vhost_user_scsi_disconnect(DeviceState *dev)
>>>> +{
>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>> +
>>> 
>>> I don’t think we want to execute vhost_scsi_common_stop() if the device hasn’t been started. I remember that caused a number of races with the vhost_user_blk connecting/disconnecting on startup.
>>> 
>>> Let’s add a similar started_vu check?
>> I will add it.
>>> 
>>>> +    if (!s->connected) {
>>>> +        return;
>>>> +    }
>>>> +    s->connected = false;
>>>> +
>>>> +    vhost_scsi_common_stop(vsc);
>>>> +
>>>> +    vhost_dev_cleanup(&vsc->dev);
>>>> +
>>>> +    /* Re-instate the event handler for new connections */
>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
>>>> +                             vhost_user_scsi_event, NULL, dev, NULL, true);
>>>> +}
>>>> +
>>>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event)
>>>> +{
>>>> +    DeviceState *dev = opaque;
>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>> +    Error *local_err = NULL;
>>>> +
>>>> +    switch (event) {
>>>> +    case CHR_EVENT_OPENED:
>>>> +        if (vhost_user_scsi_connect(dev, &local_err) < 0) {
>>>> +            error_report_err(local_err);
>>>> +            qemu_chr_fe_disconnect(&vs->conf.chardev);
>>>> +            return;
>>>> +        }
>>>> +        break;
>>>> +    case CHR_EVENT_CLOSED:
>>>> +        /* defer close until later to avoid circular close */
>>>> +        vhost_user_async_close(dev, &vs->conf.chardev, &vsc->dev,
>>>> +                               vhost_user_scsi_disconnect);
>>>> +        break;
>>>> +    case CHR_EVENT_BREAK:
>>>> +    case CHR_EVENT_MUX_IN:
>>>> +    case CHR_EVENT_MUX_OUT:
>>>> +        /* Ignore */
>>>> +        break;
>>>> +    }
>>>> +}
>>>> +
>>>> +static int vhost_user_scsi_realize_connect(VHostUserSCSI *s, Error **errp)
>>>> +{
>>>> +    DeviceState *dev = &s->parent_obj.parent_obj.parent_obj.parent_obj;
>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>> +    int ret;
>>>> +
>>>> +    s->connected = false;
>>>> +
>>>> +    ret = qemu_chr_fe_wait_connected(&vs->conf.chardev, errp);
>>>> +    if (ret < 0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = vhost_user_scsi_connect(dev, errp);
>>>> +    if (ret < 0) {
>>>> +        qemu_chr_fe_disconnect(&vs->conf.chardev);
>>>> +        return ret;
>>>> +    }
>>>> +    assert(s->connected);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>> {
>>>>   VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>   VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>>>   VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>> -    struct vhost_virtqueue *vqs = NULL;
>>>>   Error *err = NULL;
>>>>   int ret;
>>>> +    int retries = REALIZE_CONNECTION_RETRIES;
>>>> 
>>>>   if (!vs->conf.chardev.chr) {
>>>>       error_setg(errp, "vhost-user-scsi: missing chardev");
>>>> @@ -112,21 +224,31 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>>   }
>>>> 
>>>>   if (!vhost_user_init(&s->vhost_user, &vs->conf.chardev, errp)) {
>>> 
>>> Why execute vhost_user_cleanup() if vhost_user_init() fails?
>> OK, move this line up in v2.
>> 
>>> 
>>>> -        goto free_virtio;
>>>> +        goto free_vhost;
>>>>   }
>>>> 
>>>> -    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>>>> -    vsc->dev.vqs = g_new0(struct vhost_virtqueue, vsc->dev.nvqs);
>>>> -    vsc->dev.vq_index = 0;
>>>> -    vsc->dev.backend_features = 0;
>>>> -    vqs = vsc->dev.vqs;
>>>> +    vsc->inflight = g_new0(struct vhost_inflight, 1);
>>>> +    s->vhost_vqs = g_new0(struct vhost_virtqueue,
>>>> +                          VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues);
>>>> +
>>>> +    assert(!*errp);
>>>> +    do {
>>>> +        if (*errp) {
>>>> +            error_prepend(errp, "Reconnecting after error: ");
>>>> +            error_report_err(*errp);
>>>> +            *errp = NULL;
>>>> +        }
>>>> +        ret = vhost_user_scsi_realize_connect(s, errp);
>>>> +    } while (ret < 0 && retries--);
>>>> 
>>>> -    ret = vhost_dev_init(&vsc->dev, &s->vhost_user,
>>>> -                         VHOST_BACKEND_TYPE_USER, 0, errp);
>>>>   if (ret < 0) {
>>>> -        goto free_vhost;
>>>> +        goto free_vqs;
>>>>   }
>>>> 
>>>> +    /* we're fully initialized, now we can operate, so add the handler */
>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev,  NULL, NULL,
>>>> +                             vhost_user_scsi_event, NULL, (void *)dev,
>>>> +                             NULL, true);
>>>>   /* Channel and lun both are 0 for bootable vhost-user-scsi disk */
>>>>   vsc->channel = 0;
>>>>   vsc->lun = 0;
>>>> @@ -134,10 +256,15 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>> 
>>>>   return;
>>>> 
>>>> +free_vqs:
>>>> +    g_free(s->vhost_vqs);
>>>> +    s->vhost_vqs = NULL;
>>>> +    g_free(vsc->inflight);
>>>> +    vsc->inflight = NULL;
>>>> +
>>>> free_vhost:
>>>>   vhost_user_cleanup(&s->vhost_user);
>>>> -    g_free(vqs);
>>>> -free_virtio:
>>>> +
>>>>   virtio_scsi_common_unrealize(dev);
>>>> }
>>>> 
>>>> @@ -146,16 +273,22 @@ static void vhost_user_scsi_unrealize(DeviceState *dev)
>>>>   VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>   VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>>>   VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>> -    struct vhost_virtqueue *vqs = vsc->dev.vqs;
>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>> 
>>>>   /* This will stop the vhost backend. */
>>>>   vhost_user_scsi_set_status(vdev, 0);
>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL, NULL, NULL, NULL,
>>>> +                             NULL, false);
>>>> 
>>>>   vhost_dev_cleanup(&vsc->dev);
>>>> -    g_free(vqs);
>>> 
>>> Nit: Why not put vhost_dev_free_inflight next to the remaining inflight cleanup?
>> OK.
>>> 
>>>> +    vhost_dev_free_inflight(vsc->inflight);
>>>> +    g_free(s->vhost_vqs);
>>>> +    s->vhost_vqs = NULL;
>>>> +    g_free(vsc->inflight);
>>>> +    vsc->inflight = NULL;
>>>> 
>>> 
>>> Curiosity - why reorder here? Is something in vhost_user_cleanup() dependent on state freed in virtio_scsi_common_unrealize()?
>>> 
>>> If so, should that go as a standalone fix?
>> 
>> Because in vhost_user_scsi_realize, we initialize in order:
>> virtio_scsi_common_realize
>> vhost_user_init
>> 
>> And in the error handler of vhost_user_scsi_realize, the uninitialize in order:
>> vhost_user_cleanup
>> virtio_scsi_common_unrealize
>> 
>> I think in vhost_user_scsi_unrealize we should keep it the same order, right?
> 
> I’m not saying it’s wrong. If there’s no dependency (i.e. this is not fixing a bug, just a stylistic improvement) it can stay in the same change.
OK.
> 
>> 
>>> 
>>>> -    virtio_scsi_common_unrealize(dev);
>>>>   vhost_user_cleanup(&s->vhost_user);
>>>> +    virtio_scsi_common_unrealize(dev);
>>>> }
>>>> 
>>>> static Property vhost_user_scsi_properties[] = {
>>>> diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h
>>>> index 521b08e559..c66acc68b7 100644
>>>> --- a/include/hw/virtio/vhost-user-scsi.h
>>>> +++ b/include/hw/virtio/vhost-user-scsi.h
>>>> @@ -29,6 +29,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VHostUserSCSI, VHOST_USER_SCSI)
>>>> struct VHostUserSCSI {
>>>>   VHostSCSICommon parent_obj;
>>>>   VhostUserState vhost_user;
>>> 
>>> See above - we should probably have started_vu here/
>> I will add it.
>>> 
>>> Maybe we should have some shared struct with vhost_user_blk for connectivity params?
>> 
>> In the future vhost-user-blk/scsi can be refactored to share the same code.
> 
> Sure - this can be done at some point in the future.
> 
>>> 
>>>> +    bool connected;
>>>> +
>>>> +    struct vhost_virtqueue *vhost_vqs;
>>>> };
>>>> 
>>>> #endif /* VHOST_USER_SCSI_H */
>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>> index 6a173cb9fa..b904346fe1 100644
>>>> --- a/include/hw/virtio/vhost.h
>>>> +++ b/include/hw/virtio/vhost.h
>>>> @@ -8,6 +8,8 @@
>>>> #define VHOST_F_DEVICE_IOTLB 63
>>>> #define VHOST_USER_F_PROTOCOL_FEATURES 30
>>>> 
>>> 
>>> Should the macro name indicate that this is for vhost-user?
>>> 
>>> VU_REALIZE_CONN_RETRIES? 
>> I will rename it in v2.
>> 
>>> 
>>>> +#define REALIZE_CONNECTION_RETRIES 3
>>>> +
>>>> /* Generic structures common for any vhost based device. */
>>>> 
>>>> struct vhost_inflight {
>>>> -- 
>>>> 2.41.0

Any comments about other patches?
Raphael Norwitz July 30, 2023, 10:09 p.m. UTC | #6
> On Jul 28, 2023, at 3:48 AM, Li Feng <fengli@smartx.com> wrote:
> 
> Thanks for your reply.
> 
>> 2023年7月28日 上午5:21,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
>> 
>> 
>> 
>>> On Jul 25, 2023, at 6:19 AM, Li Feng <fengli@smartx.com> wrote:
>>> 
>>> Thanks for your comments.
>>> 
>>>> 2023年7月25日 上午1:21,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
>>>> 
>>>> Very excited to see this. High level looks good modulo a few small things.
>>>> 
>>>> My major concern is around existing vhost-user-scsi backends which don’t support VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD. IMO we should hide the reconnect behavior behind a VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD check. We may want to do the same for vhost-user-blk.
>>>> 
>>>> The question is then what happens if the check is false. IIUC without an inflight FD, if a device processes requests out of order, it’s not safe to continue execution on reconnect, as there’s no way for the backend to know how to replay IO. Should we permanently wedge the device or have QEMU fail out? May be nice to have a toggle for this.
>>> 
>>> Based on what MST said, is there anything else I need to do?
>> 
>> I don’t think so.
>> 
>>>> 
>>>>> On Jul 21, 2023, at 6:51 AM, Li Feng <fengli@smartx.com> wrote:
>>>>> 
>>>>> If the backend crashes and restarts, the device is broken.
>>>>> This patch adds reconnect for vhost-user-scsi.
>>>>> 
>>>>> Tested with spdk backend.
>>>>> 
>>>>> Signed-off-by: Li Feng <fengli@smartx.com>
>>>>> ---
>>>>> hw/block/vhost-user-blk.c           |   2 -
>>>>> hw/scsi/vhost-scsi-common.c         |  27 ++---
>>>>> hw/scsi/vhost-user-scsi.c           | 163 +++++++++++++++++++++++++---
>>>>> include/hw/virtio/vhost-user-scsi.h |   3 +
>>>>> include/hw/virtio/vhost.h           |   2 +
>>>>> 5 files changed, 165 insertions(+), 32 deletions(-)
>>>>> 
>>>>> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
>>>>> index eecf3f7a81..f250c740b5 100644
>>>>> --- a/hw/block/vhost-user-blk.c
>>>>> +++ b/hw/block/vhost-user-blk.c
>>>>> @@ -32,8 +32,6 @@
>>>>> #include "sysemu/sysemu.h"
>>>>> #include "sysemu/runstate.h"
>>>>> 
>>>>> -#define REALIZE_CONNECTION_RETRIES 3
>>>>> -
>>>>> static const int user_feature_bits[] = {
>>>>>   VIRTIO_BLK_F_SIZE_MAX,
>>>>>   VIRTIO_BLK_F_SEG_MAX,
>>>>> diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c
>>>> 
>>>> Why can’t all the vhost-scsi-common stuff be moved to a separate change?
>>> 
>>> I will move this code to separate patch.
>>>> 
>>>> Especially the stuff introduced for vhost-user-blk in 1b0063b3048af65dfaae6422a572c87db8575a92 should be moved out.
>>> OK.
>>> 
>>>> 
>>>>> index a06f01af26..08801886b8 100644
>>>>> --- a/hw/scsi/vhost-scsi-common.c
>>>>> +++ b/hw/scsi/vhost-scsi-common.c
>>>>> @@ -52,16 +52,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>>>> 
>>>>>   vsc->dev.acked_features = vdev->guest_features;
>>>>> 
>>>>> -    assert(vsc->inflight == NULL);
>>>>> -    vsc->inflight = g_new0(struct vhost_inflight, 1);
>>>>> -    ret = vhost_dev_get_inflight(&vsc->dev,
>>>>> -                                 vs->conf.virtqueue_size,
>>>>> -                                 vsc->inflight);
>>>>> +    ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
>>>>>   if (ret < 0) {
>>>>> -        error_report("Error get inflight: %d", -ret);
>>>>> +        error_report("Error setting inflight format: %d", -ret);
>>>>>       goto err_guest_notifiers;
>>>>>   }
>>>>> 
>>>>> +    if (!vsc->inflight->addr) {
>>>>> +        ret = vhost_dev_get_inflight(&vsc->dev,
>>>>> +                                    vs->conf.virtqueue_size,
>>>>> +                                    vsc->inflight);
>>>>> +        if (ret < 0) {
>>>>> +            error_report("Error get inflight: %d", -ret);
>>>>> +            goto err_guest_notifiers;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>>   ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
>>>>>   if (ret < 0) {
>>>>>       error_report("Error set inflight: %d", -ret);
>>>>> @@ -85,9 +91,6 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>>>>   return ret;
>>>>> 
>>>>> err_guest_notifiers:
>>>>> -    g_free(vsc->inflight);
>>>>> -    vsc->inflight = NULL;
>>>>> -
>>>>>   k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false);
>>>>> err_host_notifiers:
>>>>>   vhost_dev_disable_notifiers(&vsc->dev, vdev);
>>>>> @@ -111,12 +114,6 @@ void vhost_scsi_common_stop(VHostSCSICommon *vsc)
>>>>>   }
>>>>>   assert(ret >= 0);
>>>>> 
>>>> 
>>>> In the vhost-scsi (kernel backend) path, what will cleanup vsc->inflight now?
>>> OK, we should check the vsc->inflight if it is null, the vhost-scsi doesn’t allocate the
>>> inflight object memory.
>> 
>> Are you saying vhost-scsi never allocates inflight so we don’t need to check for it?
> We have checked the vsc->inflight, and only if allocated, we send the get/set_inflight_fd.
> This works with vhost-user-scsi/vhost-scsi both.

So then it sounds like this code introduces a resource leak. g_free(vsc->inflight) should be added to the vhost-scsi code in vhost_scsi_stop().

>> 
>>> 
>>>> 
>>>>> -    if (vsc->inflight) {
>>>>> -        vhost_dev_free_inflight(vsc->inflight);
>>>>> -        g_free(vsc->inflight);
>>>>> -        vsc->inflight = NULL;
>>>>> -    }
>>>>> -
>>>>>   vhost_dev_disable_notifiers(&vsc->dev, vdev);
>>>>> }
>>>>> 
>>>>> diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
>>>>> index ee99b19e7a..e0e88b0c42 100644
>>>>> --- a/hw/scsi/vhost-user-scsi.c
>>>>> +++ b/hw/scsi/vhost-user-scsi.c
>>>>> @@ -89,14 +89,126 @@ static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>>>>> {
>>>>> }
>>>>> 
>>>>> +static int vhost_user_scsi_connect(DeviceState *dev, Error **errp)
>>>>> +{
>>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>> +    int ret = 0;
>>>>> +
>>>>> +    if (s->connected) {
>>>>> +        return 0;
>>>>> +    }
>>>>> +    s->connected = true;
>>>>> +
>>>>> +    vsc->dev.num_queues = vs->conf.num_queues;
>>>>> +    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>>>>> +    vsc->dev.vqs = s->vhost_vqs;
>>>>> +    vsc->dev.vq_index = 0;
>>>>> +    vsc->dev.backend_features = 0;
>>>>> +
>>>>> +    ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
>>>>> +                         errp);
>>>>> +    if (ret < 0) {
>>>>> +        return ret;
>>>>> +    }
>>>>> +
>>>>> +    /* restore vhost state */
>>>> 
>>>> Should this use virtio_device_should_start like vhost_user_blk?
>>> I will change this.
>>>> 
>>>>> +    if (virtio_device_started(vdev, vdev->status)) {
>>>>> +        ret = vhost_scsi_common_start(vsc);
>>>>> +        if (ret < 0) {
>>>>> +            return ret;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event);
>>>>> +
>>>>> +static void vhost_user_scsi_disconnect(DeviceState *dev)
>>>>> +{
>>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>> +
>>>> 
>>>> I don’t think we want to execute vhost_scsi_common_stop() if the device hasn’t been started. I remember that caused a number of races with the vhost_user_blk connecting/disconnecting on startup.
>>>> 
>>>> Let’s add a similar started_vu check?
>>> I will add it.
>>>> 
>>>>> +    if (!s->connected) {
>>>>> +        return;
>>>>> +    }
>>>>> +    s->connected = false;
>>>>> +
>>>>> +    vhost_scsi_common_stop(vsc);
>>>>> +
>>>>> +    vhost_dev_cleanup(&vsc->dev);
>>>>> +
>>>>> +    /* Re-instate the event handler for new connections */
>>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
>>>>> +                             vhost_user_scsi_event, NULL, dev, NULL, true);
>>>>> +}
>>>>> +
>>>>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event)
>>>>> +{
>>>>> +    DeviceState *dev = opaque;
>>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>> +    Error *local_err = NULL;
>>>>> +
>>>>> +    switch (event) {
>>>>> +    case CHR_EVENT_OPENED:
>>>>> +        if (vhost_user_scsi_connect(dev, &local_err) < 0) {
>>>>> +            error_report_err(local_err);
>>>>> +            qemu_chr_fe_disconnect(&vs->conf.chardev);
>>>>> +            return;
>>>>> +        }
>>>>> +        break;
>>>>> +    case CHR_EVENT_CLOSED:
>>>>> +        /* defer close until later to avoid circular close */
>>>>> +        vhost_user_async_close(dev, &vs->conf.chardev, &vsc->dev,
>>>>> +                               vhost_user_scsi_disconnect);
>>>>> +        break;
>>>>> +    case CHR_EVENT_BREAK:
>>>>> +    case CHR_EVENT_MUX_IN:
>>>>> +    case CHR_EVENT_MUX_OUT:
>>>>> +        /* Ignore */
>>>>> +        break;
>>>>> +    }
>>>>> +}
>>>>> +
>>>>> +static int vhost_user_scsi_realize_connect(VHostUserSCSI *s, Error **errp)
>>>>> +{
>>>>> +    DeviceState *dev = &s->parent_obj.parent_obj.parent_obj.parent_obj;
>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>> +    int ret;
>>>>> +
>>>>> +    s->connected = false;
>>>>> +
>>>>> +    ret = qemu_chr_fe_wait_connected(&vs->conf.chardev, errp);
>>>>> +    if (ret < 0) {
>>>>> +        return ret;
>>>>> +    }
>>>>> +
>>>>> +    ret = vhost_user_scsi_connect(dev, errp);
>>>>> +    if (ret < 0) {
>>>>> +        qemu_chr_fe_disconnect(&vs->conf.chardev);
>>>>> +        return ret;
>>>>> +    }
>>>>> +    assert(s->connected);
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>>> {
>>>>>   VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>>   VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>>>>   VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>> -    struct vhost_virtqueue *vqs = NULL;
>>>>>   Error *err = NULL;
>>>>>   int ret;
>>>>> +    int retries = REALIZE_CONNECTION_RETRIES;
>>>>> 
>>>>>   if (!vs->conf.chardev.chr) {
>>>>>       error_setg(errp, "vhost-user-scsi: missing chardev");
>>>>> @@ -112,21 +224,31 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>>>   }
>>>>> 
>>>>>   if (!vhost_user_init(&s->vhost_user, &vs->conf.chardev, errp)) {
>>>> 
>>>> Why execute vhost_user_cleanup() if vhost_user_init() fails?
>>> OK, move this line up in v2.
>>> 
>>>> 
>>>>> -        goto free_virtio;
>>>>> +        goto free_vhost;
>>>>>   }
>>>>> 
>>>>> -    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>>>>> -    vsc->dev.vqs = g_new0(struct vhost_virtqueue, vsc->dev.nvqs);
>>>>> -    vsc->dev.vq_index = 0;
>>>>> -    vsc->dev.backend_features = 0;
>>>>> -    vqs = vsc->dev.vqs;
>>>>> +    vsc->inflight = g_new0(struct vhost_inflight, 1);
>>>>> +    s->vhost_vqs = g_new0(struct vhost_virtqueue,
>>>>> +                          VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues);
>>>>> +
>>>>> +    assert(!*errp);
>>>>> +    do {
>>>>> +        if (*errp) {
>>>>> +            error_prepend(errp, "Reconnecting after error: ");
>>>>> +            error_report_err(*errp);
>>>>> +            *errp = NULL;
>>>>> +        }
>>>>> +        ret = vhost_user_scsi_realize_connect(s, errp);
>>>>> +    } while (ret < 0 && retries--);
>>>>> 
>>>>> -    ret = vhost_dev_init(&vsc->dev, &s->vhost_user,
>>>>> -                         VHOST_BACKEND_TYPE_USER, 0, errp);
>>>>>   if (ret < 0) {
>>>>> -        goto free_vhost;
>>>>> +        goto free_vqs;
>>>>>   }
>>>>> 
>>>>> +    /* we're fully initialized, now we can operate, so add the handler */
>>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev,  NULL, NULL,
>>>>> +                             vhost_user_scsi_event, NULL, (void *)dev,
>>>>> +                             NULL, true);
>>>>>   /* Channel and lun both are 0 for bootable vhost-user-scsi disk */
>>>>>   vsc->channel = 0;
>>>>>   vsc->lun = 0;
>>>>> @@ -134,10 +256,15 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>>> 
>>>>>   return;
>>>>> 
>>>>> +free_vqs:
>>>>> +    g_free(s->vhost_vqs);
>>>>> +    s->vhost_vqs = NULL;
>>>>> +    g_free(vsc->inflight);
>>>>> +    vsc->inflight = NULL;
>>>>> +
>>>>> free_vhost:
>>>>>   vhost_user_cleanup(&s->vhost_user);
>>>>> -    g_free(vqs);
>>>>> -free_virtio:
>>>>> +
>>>>>   virtio_scsi_common_unrealize(dev);
>>>>> }
>>>>> 
>>>>> @@ -146,16 +273,22 @@ static void vhost_user_scsi_unrealize(DeviceState *dev)
>>>>>   VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>>   VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>>>>   VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>> -    struct vhost_virtqueue *vqs = vsc->dev.vqs;
>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>> 
>>>>>   /* This will stop the vhost backend. */
>>>>>   vhost_user_scsi_set_status(vdev, 0);
>>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL, NULL, NULL, NULL,
>>>>> +                             NULL, false);
>>>>> 
>>>>>   vhost_dev_cleanup(&vsc->dev);
>>>>> -    g_free(vqs);
>>>> 
>>>> Nit: Why not put vhost_dev_free_inflight next to the remaining inflight cleanup?
>>> OK.
>>>> 
>>>>> +    vhost_dev_free_inflight(vsc->inflight);
>>>>> +    g_free(s->vhost_vqs);
>>>>> +    s->vhost_vqs = NULL;
>>>>> +    g_free(vsc->inflight);
>>>>> +    vsc->inflight = NULL;
>>>>> 
>>>> 
>>>> Curiosity - why reorder here? Is something in vhost_user_cleanup() dependent on state freed in virtio_scsi_common_unrealize()?
>>>> 
>>>> If so, should that go as a standalone fix?
>>> 
>>> Because in vhost_user_scsi_realize, we initialize in order:
>>> virtio_scsi_common_realize
>>> vhost_user_init
>>> 
>>> And in the error handler of vhost_user_scsi_realize, the uninitialize in order:
>>> vhost_user_cleanup
>>> virtio_scsi_common_unrealize
>>> 
>>> I think in vhost_user_scsi_unrealize we should keep it the same order, right?
>> 
>> I’m not saying it’s wrong. If there’s no dependency (i.e. this is not fixing a bug, just a stylistic improvement) it can stay in the same change.
> OK.
>> 
>>> 
>>>> 
>>>>> -    virtio_scsi_common_unrealize(dev);
>>>>>   vhost_user_cleanup(&s->vhost_user);
>>>>> +    virtio_scsi_common_unrealize(dev);
>>>>> }
>>>>> 
>>>>> static Property vhost_user_scsi_properties[] = {
>>>>> diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h
>>>>> index 521b08e559..c66acc68b7 100644
>>>>> --- a/include/hw/virtio/vhost-user-scsi.h
>>>>> +++ b/include/hw/virtio/vhost-user-scsi.h
>>>>> @@ -29,6 +29,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VHostUserSCSI, VHOST_USER_SCSI)
>>>>> struct VHostUserSCSI {
>>>>>   VHostSCSICommon parent_obj;
>>>>>   VhostUserState vhost_user;
>>>> 
>>>> See above - we should probably have started_vu here/
>>> I will add it.
>>>> 
>>>> Maybe we should have some shared struct with vhost_user_blk for connectivity params?
>>> 
>>> In the future vhost-user-blk/scsi can be refactored to share the same code.
>> 
>> Sure - this can be done at some point in the future.
>> 
>>>> 
>>>>> +    bool connected;
>>>>> +
>>>>> +    struct vhost_virtqueue *vhost_vqs;
>>>>> };
>>>>> 
>>>>> #endif /* VHOST_USER_SCSI_H */
>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>>> index 6a173cb9fa..b904346fe1 100644
>>>>> --- a/include/hw/virtio/vhost.h
>>>>> +++ b/include/hw/virtio/vhost.h
>>>>> @@ -8,6 +8,8 @@
>>>>> #define VHOST_F_DEVICE_IOTLB 63
>>>>> #define VHOST_USER_F_PROTOCOL_FEATURES 30
>>>>> 
>>>> 
>>>> Should the macro name indicate that this is for vhost-user?
>>>> 
>>>> VU_REALIZE_CONN_RETRIES? 
>>> I will rename it in v2.
>>> 
>>>> 
>>>>> +#define REALIZE_CONNECTION_RETRIES 3
>>>>> +
>>>>> /* Generic structures common for any vhost based device. */
>>>>> 
>>>>> struct vhost_inflight {
>>>>> -- 
>>>>> 2.41.0
> 
> Any comments about other patches?

I’ll send shortly.
Li Feng July 31, 2023, 11:32 a.m. UTC | #7
> 2023年7月31日 06:09,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
> 
> 
> 
>> On Jul 28, 2023, at 3:48 AM, Li Feng <fengli@smartx.com> wrote:
>> 
>> Thanks for your reply.
>> 
>>> 2023年7月28日 上午5:21,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
>>> 
>>> 
>>> 
>>>> On Jul 25, 2023, at 6:19 AM, Li Feng <fengli@smartx.com> wrote:
>>>> 
>>>> Thanks for your comments.
>>>> 
>>>>> 2023年7月25日 上午1:21,Raphael Norwitz <raphael.norwitz@nutanix.com> 写道:
>>>>> 
>>>>> Very excited to see this. High level looks good modulo a few small things.
>>>>> 
>>>>> My major concern is around existing vhost-user-scsi backends which don’t support VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD. IMO we should hide the reconnect behavior behind a VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD check. We may want to do the same for vhost-user-blk.
>>>>> 
>>>>> The question is then what happens if the check is false. IIUC without an inflight FD, if a device processes requests out of order, it’s not safe to continue execution on reconnect, as there’s no way for the backend to know how to replay IO. Should we permanently wedge the device or have QEMU fail out? May be nice to have a toggle for this.
>>>> 
>>>> Based on what MST said, is there anything else I need to do?
>>> 
>>> I don’t think so.
>>> 
>>>>> 
>>>>>> On Jul 21, 2023, at 6:51 AM, Li Feng <fengli@smartx.com> wrote:
>>>>>> 
>>>>>> If the backend crashes and restarts, the device is broken.
>>>>>> This patch adds reconnect for vhost-user-scsi.
>>>>>> 
>>>>>> Tested with spdk backend.
>>>>>> 
>>>>>> Signed-off-by: Li Feng <fengli@smartx.com>
>>>>>> ---
>>>>>> hw/block/vhost-user-blk.c           |   2 -
>>>>>> hw/scsi/vhost-scsi-common.c         |  27 ++---
>>>>>> hw/scsi/vhost-user-scsi.c           | 163 +++++++++++++++++++++++++---
>>>>>> include/hw/virtio/vhost-user-scsi.h |   3 +
>>>>>> include/hw/virtio/vhost.h           |   2 +
>>>>>> 5 files changed, 165 insertions(+), 32 deletions(-)
>>>>>> 
>>>>>> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
>>>>>> index eecf3f7a81..f250c740b5 100644
>>>>>> --- a/hw/block/vhost-user-blk.c
>>>>>> +++ b/hw/block/vhost-user-blk.c
>>>>>> @@ -32,8 +32,6 @@
>>>>>> #include "sysemu/sysemu.h"
>>>>>> #include "sysemu/runstate.h"
>>>>>> 
>>>>>> -#define REALIZE_CONNECTION_RETRIES 3
>>>>>> -
>>>>>> static const int user_feature_bits[] = {
>>>>>>  VIRTIO_BLK_F_SIZE_MAX,
>>>>>>  VIRTIO_BLK_F_SEG_MAX,
>>>>>> diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c
>>>>> 
>>>>> Why can’t all the vhost-scsi-common stuff be moved to a separate change?
>>>> 
>>>> I will move this code to separate patch.
>>>>> 
>>>>> Especially the stuff introduced for vhost-user-blk in 1b0063b3048af65dfaae6422a572c87db8575a92 should be moved out.
>>>> OK.
>>>> 
>>>>> 
>>>>>> index a06f01af26..08801886b8 100644
>>>>>> --- a/hw/scsi/vhost-scsi-common.c
>>>>>> +++ b/hw/scsi/vhost-scsi-common.c
>>>>>> @@ -52,16 +52,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>>>>> 
>>>>>>  vsc->dev.acked_features = vdev->guest_features;
>>>>>> 
>>>>>> -    assert(vsc->inflight == NULL);
>>>>>> -    vsc->inflight = g_new0(struct vhost_inflight, 1);
>>>>>> -    ret = vhost_dev_get_inflight(&vsc->dev,
>>>>>> -                                 vs->conf.virtqueue_size,
>>>>>> -                                 vsc->inflight);
>>>>>> +    ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
>>>>>>  if (ret < 0) {
>>>>>> -        error_report("Error get inflight: %d", -ret);
>>>>>> +        error_report("Error setting inflight format: %d", -ret);
>>>>>>      goto err_guest_notifiers;
>>>>>>  }
>>>>>> 
>>>>>> +    if (!vsc->inflight->addr) {
>>>>>> +        ret = vhost_dev_get_inflight(&vsc->dev,
>>>>>> +                                    vs->conf.virtqueue_size,
>>>>>> +                                    vsc->inflight);
>>>>>> +        if (ret < 0) {
>>>>>> +            error_report("Error get inflight: %d", -ret);
>>>>>> +            goto err_guest_notifiers;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>>  ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
>>>>>>  if (ret < 0) {
>>>>>>      error_report("Error set inflight: %d", -ret);
>>>>>> @@ -85,9 +91,6 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
>>>>>>  return ret;
>>>>>> 
>>>>>> err_guest_notifiers:
>>>>>> -    g_free(vsc->inflight);
>>>>>> -    vsc->inflight = NULL;
>>>>>> -
>>>>>>  k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false);
>>>>>> err_host_notifiers:
>>>>>>  vhost_dev_disable_notifiers(&vsc->dev, vdev);
>>>>>> @@ -111,12 +114,6 @@ void vhost_scsi_common_stop(VHostSCSICommon *vsc)
>>>>>>  }
>>>>>>  assert(ret >= 0);
>>>>>> 
>>>>> 
>>>>> In the vhost-scsi (kernel backend) path, what will cleanup vsc->inflight now?
>>>> OK, we should check the vsc->inflight if it is null, the vhost-scsi doesn’t allocate the
>>>> inflight object memory.
>>> 
>>> Are you saying vhost-scsi never allocates inflight so we don’t need to check for it?
>> We have checked the vsc->inflight, and only if allocated, we send the get/set_inflight_fd.
>> This works with vhost-user-scsi/vhost-scsi both.
> 
> So then it sounds like this code introduces a resource leak. g_free(vsc->inflight) should be added to the vhost-scsi code in vhost_scsi_stop().

No, the vhost-scsi doesn’t need ‘inflight', it doesn’t allocate the inflight memory.

The rule is ‘who allocates, who free it’.

> 
>>> 
>>>> 
>>>>> 
>>>>>> -    if (vsc->inflight) {
>>>>>> -        vhost_dev_free_inflight(vsc->inflight);
>>>>>> -        g_free(vsc->inflight);
>>>>>> -        vsc->inflight = NULL;
>>>>>> -    }
>>>>>> -
>>>>>>  vhost_dev_disable_notifiers(&vsc->dev, vdev);
>>>>>> }
>>>>>> 
>>>>>> diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
>>>>>> index ee99b19e7a..e0e88b0c42 100644
>>>>>> --- a/hw/scsi/vhost-user-scsi.c
>>>>>> +++ b/hw/scsi/vhost-user-scsi.c
>>>>>> @@ -89,14 +89,126 @@ static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>>>>>> {
>>>>>> }
>>>>>> 
>>>>>> +static int vhost_user_scsi_connect(DeviceState *dev, Error **errp)
>>>>>> +{
>>>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>>> +    int ret = 0;
>>>>>> +
>>>>>> +    if (s->connected) {
>>>>>> +        return 0;
>>>>>> +    }
>>>>>> +    s->connected = true;
>>>>>> +
>>>>>> +    vsc->dev.num_queues = vs->conf.num_queues;
>>>>>> +    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>>>>>> +    vsc->dev.vqs = s->vhost_vqs;
>>>>>> +    vsc->dev.vq_index = 0;
>>>>>> +    vsc->dev.backend_features = 0;
>>>>>> +
>>>>>> +    ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
>>>>>> +                         errp);
>>>>>> +    if (ret < 0) {
>>>>>> +        return ret;
>>>>>> +    }
>>>>>> +
>>>>>> +    /* restore vhost state */
>>>>> 
>>>>> Should this use virtio_device_should_start like vhost_user_blk?
>>>> I will change this.
>>>>> 
>>>>>> +    if (virtio_device_started(vdev, vdev->status)) {
>>>>>> +        ret = vhost_scsi_common_start(vsc);
>>>>>> +        if (ret < 0) {
>>>>>> +            return ret;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event);
>>>>>> +
>>>>>> +static void vhost_user_scsi_disconnect(DeviceState *dev)
>>>>>> +{
>>>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>>> +
>>>>> 
>>>>> I don’t think we want to execute vhost_scsi_common_stop() if the device hasn’t been started. I remember that caused a number of races with the vhost_user_blk connecting/disconnecting on startup.
>>>>> 
>>>>> Let’s add a similar started_vu check?
>>>> I will add it.
>>>>> 
>>>>>> +    if (!s->connected) {
>>>>>> +        return;
>>>>>> +    }
>>>>>> +    s->connected = false;
>>>>>> +
>>>>>> +    vhost_scsi_common_stop(vsc);
>>>>>> +
>>>>>> +    vhost_dev_cleanup(&vsc->dev);
>>>>>> +
>>>>>> +    /* Re-instate the event handler for new connections */
>>>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
>>>>>> +                             vhost_user_scsi_event, NULL, dev, NULL, true);
>>>>>> +}
>>>>>> +
>>>>>> +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event)
>>>>>> +{
>>>>>> +    DeviceState *dev = opaque;
>>>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>>> +    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
>>>>>> +    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>>> +    Error *local_err = NULL;
>>>>>> +
>>>>>> +    switch (event) {
>>>>>> +    case CHR_EVENT_OPENED:
>>>>>> +        if (vhost_user_scsi_connect(dev, &local_err) < 0) {
>>>>>> +            error_report_err(local_err);
>>>>>> +            qemu_chr_fe_disconnect(&vs->conf.chardev);
>>>>>> +            return;
>>>>>> +        }
>>>>>> +        break;
>>>>>> +    case CHR_EVENT_CLOSED:
>>>>>> +        /* defer close until later to avoid circular close */
>>>>>> +        vhost_user_async_close(dev, &vs->conf.chardev, &vsc->dev,
>>>>>> +                               vhost_user_scsi_disconnect);
>>>>>> +        break;
>>>>>> +    case CHR_EVENT_BREAK:
>>>>>> +    case CHR_EVENT_MUX_IN:
>>>>>> +    case CHR_EVENT_MUX_OUT:
>>>>>> +        /* Ignore */
>>>>>> +        break;
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +static int vhost_user_scsi_realize_connect(VHostUserSCSI *s, Error **errp)
>>>>>> +{
>>>>>> +    DeviceState *dev = &s->parent_obj.parent_obj.parent_obj.parent_obj;
>>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    s->connected = false;
>>>>>> +
>>>>>> +    ret = qemu_chr_fe_wait_connected(&vs->conf.chardev, errp);
>>>>>> +    if (ret < 0) {
>>>>>> +        return ret;
>>>>>> +    }
>>>>>> +
>>>>>> +    ret = vhost_user_scsi_connect(dev, errp);
>>>>>> +    if (ret < 0) {
>>>>>> +        qemu_chr_fe_disconnect(&vs->conf.chardev);
>>>>>> +        return ret;
>>>>>> +    }
>>>>>> +    assert(s->connected);
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>>>> {
>>>>>>  VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>>>  VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>>>>>  VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>>> -    struct vhost_virtqueue *vqs = NULL;
>>>>>>  Error *err = NULL;
>>>>>>  int ret;
>>>>>> +    int retries = REALIZE_CONNECTION_RETRIES;
>>>>>> 
>>>>>>  if (!vs->conf.chardev.chr) {
>>>>>>      error_setg(errp, "vhost-user-scsi: missing chardev");
>>>>>> @@ -112,21 +224,31 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>>>>  }
>>>>>> 
>>>>>>  if (!vhost_user_init(&s->vhost_user, &vs->conf.chardev, errp)) {
>>>>> 
>>>>> Why execute vhost_user_cleanup() if vhost_user_init() fails?
>>>> OK, move this line up in v2.
>>>> 
>>>>> 
>>>>>> -        goto free_virtio;
>>>>>> +        goto free_vhost;
>>>>>>  }
>>>>>> 
>>>>>> -    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
>>>>>> -    vsc->dev.vqs = g_new0(struct vhost_virtqueue, vsc->dev.nvqs);
>>>>>> -    vsc->dev.vq_index = 0;
>>>>>> -    vsc->dev.backend_features = 0;
>>>>>> -    vqs = vsc->dev.vqs;
>>>>>> +    vsc->inflight = g_new0(struct vhost_inflight, 1);
>>>>>> +    s->vhost_vqs = g_new0(struct vhost_virtqueue,
>>>>>> +                          VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues);
>>>>>> +
>>>>>> +    assert(!*errp);
>>>>>> +    do {
>>>>>> +        if (*errp) {
>>>>>> +            error_prepend(errp, "Reconnecting after error: ");
>>>>>> +            error_report_err(*errp);
>>>>>> +            *errp = NULL;
>>>>>> +        }
>>>>>> +        ret = vhost_user_scsi_realize_connect(s, errp);
>>>>>> +    } while (ret < 0 && retries--);
>>>>>> 
>>>>>> -    ret = vhost_dev_init(&vsc->dev, &s->vhost_user,
>>>>>> -                         VHOST_BACKEND_TYPE_USER, 0, errp);
>>>>>>  if (ret < 0) {
>>>>>> -        goto free_vhost;
>>>>>> +        goto free_vqs;
>>>>>>  }
>>>>>> 
>>>>>> +    /* we're fully initialized, now we can operate, so add the handler */
>>>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev,  NULL, NULL,
>>>>>> +                             vhost_user_scsi_event, NULL, (void *)dev,
>>>>>> +                             NULL, true);
>>>>>>  /* Channel and lun both are 0 for bootable vhost-user-scsi disk */
>>>>>>  vsc->channel = 0;
>>>>>>  vsc->lun = 0;
>>>>>> @@ -134,10 +256,15 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
>>>>>> 
>>>>>>  return;
>>>>>> 
>>>>>> +free_vqs:
>>>>>> +    g_free(s->vhost_vqs);
>>>>>> +    s->vhost_vqs = NULL;
>>>>>> +    g_free(vsc->inflight);
>>>>>> +    vsc->inflight = NULL;
>>>>>> +
>>>>>> free_vhost:
>>>>>>  vhost_user_cleanup(&s->vhost_user);
>>>>>> -    g_free(vqs);
>>>>>> -free_virtio:
>>>>>> +
>>>>>>  virtio_scsi_common_unrealize(dev);
>>>>>> }
>>>>>> 
>>>>>> @@ -146,16 +273,22 @@ static void vhost_user_scsi_unrealize(DeviceState *dev)
>>>>>>  VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>>>>>>  VHostUserSCSI *s = VHOST_USER_SCSI(dev);
>>>>>>  VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
>>>>>> -    struct vhost_virtqueue *vqs = vsc->dev.vqs;
>>>>>> +    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
>>>>>> 
>>>>>>  /* This will stop the vhost backend. */
>>>>>>  vhost_user_scsi_set_status(vdev, 0);
>>>>>> +    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL, NULL, NULL, NULL,
>>>>>> +                             NULL, false);
>>>>>> 
>>>>>>  vhost_dev_cleanup(&vsc->dev);
>>>>>> -    g_free(vqs);
>>>>> 
>>>>> Nit: Why not put vhost_dev_free_inflight next to the remaining inflight cleanup?
>>>> OK.
>>>>> 
>>>>>> +    vhost_dev_free_inflight(vsc->inflight);
>>>>>> +    g_free(s->vhost_vqs);
>>>>>> +    s->vhost_vqs = NULL;
>>>>>> +    g_free(vsc->inflight);
>>>>>> +    vsc->inflight = NULL;
>>>>>> 
>>>>> 
>>>>> Curiosity - why reorder here? Is something in vhost_user_cleanup() dependent on state freed in virtio_scsi_common_unrealize()?
>>>>> 
>>>>> If so, should that go as a standalone fix?
>>>> 
>>>> Because in vhost_user_scsi_realize, we initialize in order:
>>>> virtio_scsi_common_realize
>>>> vhost_user_init
>>>> 
>>>> And in the error handler of vhost_user_scsi_realize, the uninitialize in order:
>>>> vhost_user_cleanup
>>>> virtio_scsi_common_unrealize
>>>> 
>>>> I think in vhost_user_scsi_unrealize we should keep it the same order, right?
>>> 
>>> I’m not saying it’s wrong. If there’s no dependency (i.e. this is not fixing a bug, just a stylistic improvement) it can stay in the same change.
>> OK.
>>> 
>>>> 
>>>>> 
>>>>>> -    virtio_scsi_common_unrealize(dev);
>>>>>>  vhost_user_cleanup(&s->vhost_user);
>>>>>> +    virtio_scsi_common_unrealize(dev);
>>>>>> }
>>>>>> 
>>>>>> static Property vhost_user_scsi_properties[] = {
>>>>>> diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h
>>>>>> index 521b08e559..c66acc68b7 100644
>>>>>> --- a/include/hw/virtio/vhost-user-scsi.h
>>>>>> +++ b/include/hw/virtio/vhost-user-scsi.h
>>>>>> @@ -29,6 +29,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VHostUserSCSI, VHOST_USER_SCSI)
>>>>>> struct VHostUserSCSI {
>>>>>>  VHostSCSICommon parent_obj;
>>>>>>  VhostUserState vhost_user;
>>>>> 
>>>>> See above - we should probably have started_vu here/
>>>> I will add it.
>>>>> 
>>>>> Maybe we should have some shared struct with vhost_user_blk for connectivity params?
>>>> 
>>>> In the future vhost-user-blk/scsi can be refactored to share the same code.
>>> 
>>> Sure - this can be done at some point in the future.
>>> 
>>>>> 
>>>>>> +    bool connected;
>>>>>> +
>>>>>> +    struct vhost_virtqueue *vhost_vqs;
>>>>>> };
>>>>>> 
>>>>>> #endif /* VHOST_USER_SCSI_H */
>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>>>> index 6a173cb9fa..b904346fe1 100644
>>>>>> --- a/include/hw/virtio/vhost.h
>>>>>> +++ b/include/hw/virtio/vhost.h
>>>>>> @@ -8,6 +8,8 @@
>>>>>> #define VHOST_F_DEVICE_IOTLB 63
>>>>>> #define VHOST_USER_F_PROTOCOL_FEATURES 30
>>>>>> 
>>>>> 
>>>>> Should the macro name indicate that this is for vhost-user?
>>>>> 
>>>>> VU_REALIZE_CONN_RETRIES? 
>>>> I will rename it in v2.
>>>> 
>>>>> 
>>>>>> +#define REALIZE_CONNECTION_RETRIES 3
>>>>>> +
>>>>>> /* Generic structures common for any vhost based device. */
>>>>>> 
>>>>>> struct vhost_inflight {
>>>>>> -- 
>>>>>> 2.41.0
>> 
>> Any comments about other patches?
> 
> I’ll send shortly.
diff mbox series

Patch

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index eecf3f7a81..f250c740b5 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -32,8 +32,6 @@ 
 #include "sysemu/sysemu.h"
 #include "sysemu/runstate.h"
 
-#define REALIZE_CONNECTION_RETRIES 3
-
 static const int user_feature_bits[] = {
     VIRTIO_BLK_F_SIZE_MAX,
     VIRTIO_BLK_F_SEG_MAX,
diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c
index a06f01af26..08801886b8 100644
--- a/hw/scsi/vhost-scsi-common.c
+++ b/hw/scsi/vhost-scsi-common.c
@@ -52,16 +52,22 @@  int vhost_scsi_common_start(VHostSCSICommon *vsc)
 
     vsc->dev.acked_features = vdev->guest_features;
 
-    assert(vsc->inflight == NULL);
-    vsc->inflight = g_new0(struct vhost_inflight, 1);
-    ret = vhost_dev_get_inflight(&vsc->dev,
-                                 vs->conf.virtqueue_size,
-                                 vsc->inflight);
+    ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
     if (ret < 0) {
-        error_report("Error get inflight: %d", -ret);
+        error_report("Error setting inflight format: %d", -ret);
         goto err_guest_notifiers;
     }
 
+    if (!vsc->inflight->addr) {
+        ret = vhost_dev_get_inflight(&vsc->dev,
+                                    vs->conf.virtqueue_size,
+                                    vsc->inflight);
+        if (ret < 0) {
+            error_report("Error get inflight: %d", -ret);
+            goto err_guest_notifiers;
+        }
+    }
+
     ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
     if (ret < 0) {
         error_report("Error set inflight: %d", -ret);
@@ -85,9 +91,6 @@  int vhost_scsi_common_start(VHostSCSICommon *vsc)
     return ret;
 
 err_guest_notifiers:
-    g_free(vsc->inflight);
-    vsc->inflight = NULL;
-
     k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false);
 err_host_notifiers:
     vhost_dev_disable_notifiers(&vsc->dev, vdev);
@@ -111,12 +114,6 @@  void vhost_scsi_common_stop(VHostSCSICommon *vsc)
     }
     assert(ret >= 0);
 
-    if (vsc->inflight) {
-        vhost_dev_free_inflight(vsc->inflight);
-        g_free(vsc->inflight);
-        vsc->inflight = NULL;
-    }
-
     vhost_dev_disable_notifiers(&vsc->dev, vdev);
 }
 
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index ee99b19e7a..e0e88b0c42 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -89,14 +89,126 @@  static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
 }
 
+static int vhost_user_scsi_connect(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
+    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
+    int ret = 0;
+
+    if (s->connected) {
+        return 0;
+    }
+    s->connected = true;
+
+    vsc->dev.num_queues = vs->conf.num_queues;
+    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
+    vsc->dev.vqs = s->vhost_vqs;
+    vsc->dev.vq_index = 0;
+    vsc->dev.backend_features = 0;
+
+    ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
+                         errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* restore vhost state */
+    if (virtio_device_started(vdev, vdev->status)) {
+        ret = vhost_scsi_common_start(vsc);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event);
+
+static void vhost_user_scsi_disconnect(DeviceState *dev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
+    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
+
+    if (!s->connected) {
+        return;
+    }
+    s->connected = false;
+
+    vhost_scsi_common_stop(vsc);
+
+    vhost_dev_cleanup(&vsc->dev);
+
+    /* Re-instate the event handler for new connections */
+    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
+                             vhost_user_scsi_event, NULL, dev, NULL, true);
+}
+
+static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event)
+{
+    DeviceState *dev = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
+    VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
+    Error *local_err = NULL;
+
+    switch (event) {
+    case CHR_EVENT_OPENED:
+        if (vhost_user_scsi_connect(dev, &local_err) < 0) {
+            error_report_err(local_err);
+            qemu_chr_fe_disconnect(&vs->conf.chardev);
+            return;
+        }
+        break;
+    case CHR_EVENT_CLOSED:
+        /* defer close until later to avoid circular close */
+        vhost_user_async_close(dev, &vs->conf.chardev, &vsc->dev,
+                               vhost_user_scsi_disconnect);
+        break;
+    case CHR_EVENT_BREAK:
+    case CHR_EVENT_MUX_IN:
+    case CHR_EVENT_MUX_OUT:
+        /* Ignore */
+        break;
+    }
+}
+
+static int vhost_user_scsi_realize_connect(VHostUserSCSI *s, Error **errp)
+{
+    DeviceState *dev = &s->parent_obj.parent_obj.parent_obj.parent_obj;
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
+    int ret;
+
+    s->connected = false;
+
+    ret = qemu_chr_fe_wait_connected(&vs->conf.chardev, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = vhost_user_scsi_connect(dev, errp);
+    if (ret < 0) {
+        qemu_chr_fe_disconnect(&vs->conf.chardev);
+        return ret;
+    }
+    assert(s->connected);
+
+    return 0;
+}
+
 static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
 {
     VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
     VHostUserSCSI *s = VHOST_USER_SCSI(dev);
     VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
-    struct vhost_virtqueue *vqs = NULL;
     Error *err = NULL;
     int ret;
+    int retries = REALIZE_CONNECTION_RETRIES;
 
     if (!vs->conf.chardev.chr) {
         error_setg(errp, "vhost-user-scsi: missing chardev");
@@ -112,21 +224,31 @@  static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
     }
 
     if (!vhost_user_init(&s->vhost_user, &vs->conf.chardev, errp)) {
-        goto free_virtio;
+        goto free_vhost;
     }
 
-    vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
-    vsc->dev.vqs = g_new0(struct vhost_virtqueue, vsc->dev.nvqs);
-    vsc->dev.vq_index = 0;
-    vsc->dev.backend_features = 0;
-    vqs = vsc->dev.vqs;
+    vsc->inflight = g_new0(struct vhost_inflight, 1);
+    s->vhost_vqs = g_new0(struct vhost_virtqueue,
+                          VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues);
+
+    assert(!*errp);
+    do {
+        if (*errp) {
+            error_prepend(errp, "Reconnecting after error: ");
+            error_report_err(*errp);
+            *errp = NULL;
+        }
+        ret = vhost_user_scsi_realize_connect(s, errp);
+    } while (ret < 0 && retries--);
 
-    ret = vhost_dev_init(&vsc->dev, &s->vhost_user,
-                         VHOST_BACKEND_TYPE_USER, 0, errp);
     if (ret < 0) {
-        goto free_vhost;
+        goto free_vqs;
     }
 
+    /* we're fully initialized, now we can operate, so add the handler */
+    qemu_chr_fe_set_handlers(&vs->conf.chardev,  NULL, NULL,
+                             vhost_user_scsi_event, NULL, (void *)dev,
+                             NULL, true);
     /* Channel and lun both are 0 for bootable vhost-user-scsi disk */
     vsc->channel = 0;
     vsc->lun = 0;
@@ -134,10 +256,15 @@  static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
 
     return;
 
+free_vqs:
+    g_free(s->vhost_vqs);
+    s->vhost_vqs = NULL;
+    g_free(vsc->inflight);
+    vsc->inflight = NULL;
+
 free_vhost:
     vhost_user_cleanup(&s->vhost_user);
-    g_free(vqs);
-free_virtio:
+
     virtio_scsi_common_unrealize(dev);
 }
 
@@ -146,16 +273,22 @@  static void vhost_user_scsi_unrealize(DeviceState *dev)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VHostUserSCSI *s = VHOST_USER_SCSI(dev);
     VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
-    struct vhost_virtqueue *vqs = vsc->dev.vqs;
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
 
     /* This will stop the vhost backend. */
     vhost_user_scsi_set_status(vdev, 0);
+    qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL, NULL, NULL, NULL,
+                             NULL, false);
 
     vhost_dev_cleanup(&vsc->dev);
-    g_free(vqs);
+    vhost_dev_free_inflight(vsc->inflight);
+    g_free(s->vhost_vqs);
+    s->vhost_vqs = NULL;
+    g_free(vsc->inflight);
+    vsc->inflight = NULL;
 
-    virtio_scsi_common_unrealize(dev);
     vhost_user_cleanup(&s->vhost_user);
+    virtio_scsi_common_unrealize(dev);
 }
 
 static Property vhost_user_scsi_properties[] = {
diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h
index 521b08e559..c66acc68b7 100644
--- a/include/hw/virtio/vhost-user-scsi.h
+++ b/include/hw/virtio/vhost-user-scsi.h
@@ -29,6 +29,9 @@  OBJECT_DECLARE_SIMPLE_TYPE(VHostUserSCSI, VHOST_USER_SCSI)
 struct VHostUserSCSI {
     VHostSCSICommon parent_obj;
     VhostUserState vhost_user;
+    bool connected;
+
+    struct vhost_virtqueue *vhost_vqs;
 };
 
 #endif /* VHOST_USER_SCSI_H */
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 6a173cb9fa..b904346fe1 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -8,6 +8,8 @@ 
 #define VHOST_F_DEVICE_IOTLB 63
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
 
+#define REALIZE_CONNECTION_RETRIES 3
+
 /* Generic structures common for any vhost based device. */
 
 struct vhost_inflight {