[RFC,3/3] vhost-user: support programming VFIO group in master

Message ID 20180723045956.27521-4-tiwei.bie@intel.com
State New
Series Supporting programming IOMMU in QEMU (vDPA/vhost-user)

Commit Message

Tiwei Bie July 23, 2018, 4:59 a.m. UTC
Introduce a slave message to allow the slave to share its
VFIO group fd with the master, so that QEMU can do the
IOMMU programming for this group based on the virtio
device's DMA address space.

Vhost backends which support vDPA can leverage this
message to ask the master to do the IOMMU programming
in QEMU for the vDPA device in the backend.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 docs/interop/vhost-user.txt    | 16 ++++++++++++++
 hw/virtio/vhost-user.c         | 40 ++++++++++++++++++++++++++++++++++
 include/hw/virtio/vhost-user.h |  2 ++
 3 files changed, 58 insertions(+)

Comments

Michael S. Tsirkin July 23, 2018, 9:19 a.m. UTC | #1
On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote:
> Introduce a slave message to allow slave to share its
> VFIO group fd to master and do the IOMMU programming
> based on virtio device's DMA address space for this
> group in QEMU.
> 
> For the vhost backends which support vDPA, they could
> leverage this message to ask master to do the IOMMU
> programming in QEMU for the vDPA device in backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  docs/interop/vhost-user.txt    | 16 ++++++++++++++
>  hw/virtio/vhost-user.c         | 40 ++++++++++++++++++++++++++++++++++
>  include/hw/virtio/vhost-user.h |  2 ++
>  3 files changed, 58 insertions(+)
> 
> diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> index f59667f498..a57a8f9451 100644
> --- a/docs/interop/vhost-user.txt
> +++ b/docs/interop/vhost-user.txt
> @@ -397,6 +397,7 @@ Protocol features
>  #define VHOST_USER_PROTOCOL_F_CONFIG         9
>  #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
>  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
> +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP     12
>  
>  Master message types
>  --------------------
> @@ -815,6 +816,21 @@ Slave message types
>        This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
>        protocol feature has been successfully negotiated.
>  
> + * VHOST_USER_SLAVE_VFIO_GROUP_MSG
> +
> +      Id: 4
> +      Equivalent ioctl: N/A
> +      Slave payload: N/A
> +      Master payload: N/A
> +
> +      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
> +      could send this request to share its VFIO group fd via ancillary data
> +      to master. After receiving this request from slave, master will close
> +      the existing VFIO group if any and do the DMA programming based on the
> +      virtio device's DMA address space for the new group if the request is
> +      sent with a file descriptor.
> +

Should it also clear out any mappings that were set on the old fd
before closing it?

Also, should we limit when this message can be received?
If not, you need to re-program mappings again whenever
we get a new fd.

> +
>  VHOST_USER_PROTOCOL_F_REPLY_ACK:
>  -------------------------------
>  The original vhost-user specification only demands replies for certain
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index b041343632..db958e24c7 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
>      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12,
>      VHOST_USER_PROTOCOL_F_MAX
>  };
>  
> @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest {
>      VHOST_USER_SLAVE_IOTLB_MSG = 1,
>      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
>      VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> +    VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4,
>      VHOST_USER_SLAVE_MAX
>  }  VhostUserSlaveRequest;
>  
> @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
>      return 0;
>  }
>  
> +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> +                                              int *fd)
> +{
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserState *user = u->user;
> +    VirtIODevice *vdev = dev->vdev;
> +    int groupfd = fd[0];
> +    VFIOGroup *group;
> +
> +    if (!virtio_has_feature(dev->protocol_features,
> +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> +        vdev == NULL) {
> +        return -1;
> +    }
> +
> +    if (user->vfio_group) {
> +        vfio_put_group(user->vfio_group);
> +        user->vfio_group = NULL;
> +    }
> +
> +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> +    if (group == NULL) {
> +        return -1;
> +    }
> +
> +    if (group->fd != groupfd) {
> +        close(groupfd);
> +    }
> +
> +    user->vfio_group = group;
> +    fd[0] = -1;
> +
> +    return 0;
> +}
> +

What will cause propagating groups to this vfio fd?


>  static void slave_read(void *opaque)
>  {
>      struct vhost_dev *dev = opaque;
> @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque)
>          ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
>                                                            fd[0]);
>          break;
> +    case VHOST_USER_SLAVE_VFIO_GROUP_MSG:
> +        ret = vhost_user_slave_handle_vfio_group(dev, fd);
> +        break;
>      default:
>          error_report("Received unexpected msg type.");
>          ret = -EINVAL;
> diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> index fd660393a0..9e11473274 100644
> --- a/include/hw/virtio/vhost-user.h
> +++ b/include/hw/virtio/vhost-user.h
> @@ -10,6 +10,7 @@
>  
>  #include "chardev/char-fe.h"
>  #include "hw/virtio/virtio.h"
> +#include "hw/vfio/vfio-common.h"
>  
>  typedef struct VhostUserHostNotifier {
>      MemoryRegion mr;
> @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier {
>  typedef struct VhostUserState {
>      CharBackend *chr;
>      VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> +    VFIOGroup *vfio_group;
>  } VhostUserState;
>  
>  VhostUserState *vhost_user_init(void);
> -- 
> 2.18.0
Michael S. Tsirkin July 23, 2018, 9:20 a.m. UTC | #2
On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote:
> Introduce a slave message to allow slave to share its
> VFIO group fd to master and do the IOMMU programming
> based on virtio device's DMA address space for this
> group in QEMU.
> 
> For the vhost backends which support vDPA, they could
> leverage this message to ask master to do the IOMMU
> programming in QEMU for the vDPA device in backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  docs/interop/vhost-user.txt    | 16 ++++++++++++++
>  hw/virtio/vhost-user.c         | 40 ++++++++++++++++++++++++++++++++++
>  include/hw/virtio/vhost-user.h |  2 ++
>  3 files changed, 58 insertions(+)
> 
> diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> index f59667f498..a57a8f9451 100644
> --- a/docs/interop/vhost-user.txt
> +++ b/docs/interop/vhost-user.txt
> @@ -397,6 +397,7 @@ Protocol features
>  #define VHOST_USER_PROTOCOL_F_CONFIG         9
>  #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
>  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
> +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP     12
>  
>  Master message types
>  --------------------
> @@ -815,6 +816,21 @@ Slave message types
>        This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
>        protocol feature has been successfully negotiated.
>  
> + * VHOST_USER_SLAVE_VFIO_GROUP_MSG
> +
> +      Id: 4
> +      Equivalent ioctl: N/A
> +      Slave payload: N/A
> +      Master payload: N/A
> +
> +      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
> +      could send this request to share its VFIO group fd via ancillary data
> +      to master. After receiving this request from slave, master will close
> +      the existing VFIO group if any and do the DMA programming based on the
> +      virtio device's DMA address space for the new group if the request is
> +      sent with a file descriptor.
> +
> +
>  VHOST_USER_PROTOCOL_F_REPLY_ACK:
>  -------------------------------
>  The original vhost-user specification only demands replies for certain
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index b041343632..db958e24c7 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
>      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12,
>      VHOST_USER_PROTOCOL_F_MAX
>  };
>  
> @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest {
>      VHOST_USER_SLAVE_IOTLB_MSG = 1,
>      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
>      VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> +    VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4,
>      VHOST_USER_SLAVE_MAX
>  }  VhostUserSlaveRequest;
>  
> @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
>      return 0;
>  }
>  
> +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> +                                              int *fd)
> +{
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserState *user = u->user;
> +    VirtIODevice *vdev = dev->vdev;
> +    int groupfd = fd[0];
> +    VFIOGroup *group;
> +
> +    if (!virtio_has_feature(dev->protocol_features,
> +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> +        vdev == NULL) {
> +        return -1;
> +    }
> +
> +    if (user->vfio_group) {
> +        vfio_put_group(user->vfio_group);
> +        user->vfio_group = NULL;

Seems to create a window where mappings are invalid
even if the same fd is re-sent. Is that OK?

> +    }
> +
> +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> +    if (group == NULL) {
> +        return -1;
> +    }
> +
> +    if (group->fd != groupfd) {
> +        close(groupfd);
> +    }
> +
> +    user->vfio_group = group;
> +    fd[0] = -1;
> +
> +    return 0;
> +}
> +
>  static void slave_read(void *opaque)
>  {
>      struct vhost_dev *dev = opaque;
> @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque)
>          ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
>                                                            fd[0]);
>          break;
> +    case VHOST_USER_SLAVE_VFIO_GROUP_MSG:
> +        ret = vhost_user_slave_handle_vfio_group(dev, fd);
> +        break;
>      default:
>          error_report("Received unexpected msg type.");
>          ret = -EINVAL;
> diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> index fd660393a0..9e11473274 100644
> --- a/include/hw/virtio/vhost-user.h
> +++ b/include/hw/virtio/vhost-user.h
> @@ -10,6 +10,7 @@
>  
>  #include "chardev/char-fe.h"
>  #include "hw/virtio/virtio.h"
> +#include "hw/vfio/vfio-common.h"
>  
>  typedef struct VhostUserHostNotifier {
>      MemoryRegion mr;
> @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier {
>  typedef struct VhostUserState {
>      CharBackend *chr;
>      VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> +    VFIOGroup *vfio_group;
>  } VhostUserState;
>  
>  VhostUserState *vhost_user_init(void);
> -- 
> 2.18.0
Tiwei Bie July 23, 2018, 11:59 a.m. UTC | #3
On Mon, Jul 23, 2018 at 12:19:04PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote:
[...]
> > @@ -815,6 +816,21 @@ Slave message types
> >        This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
> >        protocol feature has been successfully negotiated.
> >  
> > + * VHOST_USER_SLAVE_VFIO_GROUP_MSG
> > +
> > +      Id: 4
> > +      Equivalent ioctl: N/A
> > +      Slave payload: N/A
> > +      Master payload: N/A
> > +
> > +      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
> > +      could send this request to share its VFIO group fd via ancillary data
> > +      to master. After receiving this request from slave, master will close
> > +      the existing VFIO group if any and do the DMA programming based on the
> > +      virtio device's DMA address space for the new group if the request is
> > +      sent with a file descriptor.
> > +
> 
> Should it also clear out any mappings that were set on the old fd
> before closing it?

Yes, more precisely, it will do the following:

1. Delete VFIO group from KVM device fd;
2. Unset the container fd for this group fd;
3. Close this VFIO group fd;

Should I include the above details in this doc?
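
For reference, a rough sketch of those three steps at the kernel-UAPI
level (illustrative only, not the actual vfio_put_group() code;
kvm_vfio_dev_fd is assumed to be the fd of the KVM "vfio" pseudo-device):

    #include <unistd.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <linux/vfio.h>

    /* Illustrative teardown of a no-longer-used VFIO group fd. */
    static void drop_vfio_group(int kvm_vfio_dev_fd, int groupfd)
    {
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr  = KVM_DEV_VFIO_GROUP_DEL,
            .addr  = (uint64_t)(uintptr_t)&groupfd,
        };

        ioctl(kvm_vfio_dev_fd, KVM_SET_DEVICE_ATTR, &attr); /* 1. delete from KVM     */
        ioctl(groupfd, VFIO_GROUP_UNSET_CONTAINER);         /* 2. unset the container */
        close(groupfd);                                     /* 3. close the group fd  */
    }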

> 
> Also should we limit when can this message be received?
> If not you need to re-program mappings again whenever
> we get a new fd.

To keep things simple, this proposal requires the slave
to assume the mappings are invalid before receiving the
REPLY from master when the slave sends this message to
master, and master will destroy the existing VFIO group
if any and do the setup for the (new) VFIO group if the
message carries a fd.

So if a VFIO group fd has been sent and the device has
been started, before sending a VFIO group fd (could be
the same fd that has been sent), the slave should stop
the device first and shouldn't assume the mappings are
valid before receiving the REPLY.
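
As an illustration of that sequence on the slave side, here is a minimal
sketch, assuming the usual vhost-user framing (u32 request, u32 flags,
u32 size) on the slave channel and a need_reply flag bit; apart from the
message id defined in this patch, the names and constants are illustrative:

    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    #define VHOST_USER_SLAVE_VFIO_GROUP_MSG  4        /* id from this patch       */
    #define VHOST_USER_VERSION               0x1      /* assumed protocol version */
    #define VHOST_USER_NEED_REPLY            (1 << 3) /* assumed need_reply bit   */

    struct vhost_user_hdr {
        uint32_t request;
        uint32_t flags;
        uint32_t size;
    };

    /* Slave side: stop the device, then send the group fd and wait for REPLY. */
    static int send_vfio_group_fd(int slave_sock, int groupfd)
    {
        struct vhost_user_hdr hdr = {
            .request = VHOST_USER_SLAVE_VFIO_GROUP_MSG,
            .flags   = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
            .size    = 0,                 /* no payload, fd goes as ancillary data */
        };
        char ctrl[CMSG_SPACE(sizeof(int))] = { 0 };
        struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
        struct msghdr msg = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type  = SCM_RIGHTS;
        cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &groupfd, sizeof(int));

        if (sendmsg(slave_sock, &msg, 0) < 0) {
            return -1;
        }
        /* Mappings must be treated as invalid until the master's REPLY arrives. */
        return 0;
    }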

> 
> > +
[...]
> >  
> > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > +                                              int *fd)
> > +{
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserState *user = u->user;
> > +    VirtIODevice *vdev = dev->vdev;
> > +    int groupfd = fd[0];
> > +    VFIOGroup *group;
> > +
> > +    if (!virtio_has_feature(dev->protocol_features,
> > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > +        vdev == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (user->vfio_group) {
> > +        vfio_put_group(user->vfio_group);
> > +        user->vfio_group = NULL;
> > +    }
> > +
> > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > +    if (group == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (group->fd != groupfd) {
> > +        close(groupfd);
> > +    }
> > +
> > +    user->vfio_group = group;
> > +    fd[0] = -1;
> > +
> > +    return 0;
> > +}
> > +
> 
> What will cause propagating groups to this vfio fd?

Do you mean when a VFIOGroup will be created for this
VFIO group fd?

A VFIOGroup will be created if there is no existing
VFIOGroup that references the same group id.
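
In other words, something with roughly this shape (illustrative only, not
the actual patch; deriving the group id from the fd is left out, and
vfio_group_list, VFIOGroup and the container setup are the existing
hw/vfio/common.c internals):

    static VFIOGroup *get_group_from_fd_sketch(int groupfd, int groupid,
                                               AddressSpace *as)
    {
        VFIOGroup *group;

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == groupid) {
                return group;              /* same group id: reuse it */
            }
        }

        group = g_new0(VFIOGroup, 1);
        group->fd = groupfd;
        group->groupid = groupid;
        QLIST_INIT(&group->device_list);
        /* ... connect the group to a container for "as" (which also registers
         *     the MemoryListener and adds the group to KVM) ... */
        QLIST_INSERT_HEAD(&vfio_group_list, group, next);
        return group;
    }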

Best regards,
Tiwei Bie

> 
> 
[...]
Tiwei Bie July 23, 2018, 12:04 p.m. UTC | #4
On Mon, Jul 23, 2018 at 12:20:12PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote:
[...]
> >  
> > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > +                                              int *fd)
> > +{
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserState *user = u->user;
> > +    VirtIODevice *vdev = dev->vdev;
> > +    int groupfd = fd[0];
> > +    VFIOGroup *group;
> > +
> > +    if (!virtio_has_feature(dev->protocol_features,
> > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > +        vdev == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (user->vfio_group) {
> > +        vfio_put_group(user->vfio_group);
> > +        user->vfio_group = NULL;
> 
> Seems to create a window where mappings are invalid
> even if the same fd is re-sent. Is that OK?

Yeah, there will be a window in which the mappings are invalid
when the same fd is re-sent. Based on the proposal [1]
for this patch, that should be OK.

[1] http://lists.gnu.org/archive/html/qemu-devel/2018-07/msg04335.html
"""
To keep things simple, this proposal requires the slave
to assume the mappings are invalid before receiving the
REPLY from master when the slave sends this message to
master, and master will destroy the existing VFIO group
if any and do the setup for the (new) VFIO group if the
message carries a fd.

So if a VFIO group fd has been sent and the device has
been started, before sending a VFIO group fd (could be
the same fd that has been sent), the slave should stop
the device first and shouldn't assume the mappings are
valid before receiving the REPLY.
"""

Best regards,
Tiwei Bie

> 
> > +    }
[...]
Alex Williamson July 26, 2018, 8:45 p.m. UTC | #5
On Mon, 23 Jul 2018 12:59:56 +0800
Tiwei Bie <tiwei.bie@intel.com> wrote:

> Introduce a slave message to allow slave to share its
> VFIO group fd to master and do the IOMMU programming
> based on virtio device's DMA address space for this
> group in QEMU.
> 
> For the vhost backends which support vDPA, they could
> leverage this message to ask master to do the IOMMU
> programming in QEMU for the vDPA device in backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  docs/interop/vhost-user.txt    | 16 ++++++++++++++
>  hw/virtio/vhost-user.c         | 40 ++++++++++++++++++++++++++++++++++
>  include/hw/virtio/vhost-user.h |  2 ++
>  3 files changed, 58 insertions(+)
> 
> diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> index f59667f498..a57a8f9451 100644
> --- a/docs/interop/vhost-user.txt
> +++ b/docs/interop/vhost-user.txt
> @@ -397,6 +397,7 @@ Protocol features
>  #define VHOST_USER_PROTOCOL_F_CONFIG         9
>  #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
>  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
> +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP     12
>  
>  Master message types
>  --------------------
> @@ -815,6 +816,21 @@ Slave message types
>        This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
>        protocol feature has been successfully negotiated.
>  
> + * VHOST_USER_SLAVE_VFIO_GROUP_MSG
> +
> +      Id: 4
> +      Equivalent ioctl: N/A
> +      Slave payload: N/A
> +      Master payload: N/A
> +
> +      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
> +      could send this request to share its VFIO group fd via ancillary data
> +      to master. After receiving this request from slave, master will close
> +      the existing VFIO group if any and do the DMA programming based on the
> +      virtio device's DMA address space for the new group if the request is
> +      sent with a file descriptor.
> +
> +
>  VHOST_USER_PROTOCOL_F_REPLY_ACK:
>  -------------------------------
>  The original vhost-user specification only demands replies for certain
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index b041343632..db958e24c7 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
>      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12,
>      VHOST_USER_PROTOCOL_F_MAX
>  };
>  
> @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest {
>      VHOST_USER_SLAVE_IOTLB_MSG = 1,
>      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
>      VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> +    VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4,
>      VHOST_USER_SLAVE_MAX
>  }  VhostUserSlaveRequest;
>  
> @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
>      return 0;
>  }
>  
> +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> +                                              int *fd)
> +{
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserState *user = u->user;
> +    VirtIODevice *vdev = dev->vdev;
> +    int groupfd = fd[0];
> +    VFIOGroup *group;
> +
> +    if (!virtio_has_feature(dev->protocol_features,
> +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> +        vdev == NULL) {
> +        return -1;
> +    }
> +
> +    if (user->vfio_group) {
> +        vfio_put_group(user->vfio_group);
> +        user->vfio_group = NULL;
> +    }
> +
> +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> +    if (group == NULL) {
> +        return -1;
> +    }
> +
> +    if (group->fd != groupfd) {
> +        close(groupfd);
> +    }
> +
> +    user->vfio_group = group;
> +    fd[0] = -1;
> +
> +    return 0;
> +}

This all looks very sketchy, we're reaching into vfio internal state
and arbitrarily releasing data structures, reusing existing ones, and
maybe creating new ones.  We know that a vfio group only allows a
single open, right?  So if you have a groupfd, vfio will never have
that same group opened already.  Is that the goal?  It's not obvious in
the code.  I don't really understand what vhost goes on to do with this
group, but I'm pretty sure the existing state machine in vfio isn't
designed for it.  Thanks,

Alex

> +
>  static void slave_read(void *opaque)
>  {
>      struct vhost_dev *dev = opaque;
> @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque)
>          ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
>                                                            fd[0]);
>          break;
> +    case VHOST_USER_SLAVE_VFIO_GROUP_MSG:
> +        ret = vhost_user_slave_handle_vfio_group(dev, fd);
> +        break;
>      default:
>          error_report("Received unexpected msg type.");
>          ret = -EINVAL;
> diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> index fd660393a0..9e11473274 100644
> --- a/include/hw/virtio/vhost-user.h
> +++ b/include/hw/virtio/vhost-user.h
> @@ -10,6 +10,7 @@
>  
>  #include "chardev/char-fe.h"
>  #include "hw/virtio/virtio.h"
> +#include "hw/vfio/vfio-common.h"
>  
>  typedef struct VhostUserHostNotifier {
>      MemoryRegion mr;
> @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier {
>  typedef struct VhostUserState {
>      CharBackend *chr;
>      VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> +    VFIOGroup *vfio_group;
>  } VhostUserState;
>  
>  VhostUserState *vhost_user_init(void);
Tiwei Bie July 27, 2018, 1:58 a.m. UTC | #6
On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> On Mon, 23 Jul 2018 12:59:56 +0800
> Tiwei Bie <tiwei.bie@intel.com> wrote:
> 
[...]
> >  
> > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > +                                              int *fd)
> > +{
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserState *user = u->user;
> > +    VirtIODevice *vdev = dev->vdev;
> > +    int groupfd = fd[0];
> > +    VFIOGroup *group;
> > +
> > +    if (!virtio_has_feature(dev->protocol_features,
> > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > +        vdev == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (user->vfio_group) {
> > +        vfio_put_group(user->vfio_group);
> > +        user->vfio_group = NULL;
> > +    }
> > +

The check below should be added here (I missed it in this
patch, sorry..):

    if (groupfd < 0) {
        return 0;
    }

> > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > +    if (group == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (group->fd != groupfd) {
> > +        close(groupfd);
> > +    }
> > +
> > +    user->vfio_group = group;
> > +    fd[0] = -1;
> > +
> > +    return 0;
> > +}
> 
> This all looks very sketchy, we're reaching into vfio internal state
> and arbitrarily releasing data structures, reusing existing ones, and
> maybe creating new ones.  We know that a vfio group only allows a
> single open, right?

Yeah, exactly!

When the vhost-user backend process has opened a VFIO group fd,
QEMU won't be able to open this group file via open() any more.
So the vhost-user backend process will share the VFIO group fd
with QEMU via the UNIX socket. And that's why we're introducing a
new API:

	vfio_get_group_from_fd(groupfd, ...);

for vfio/common.c to get (setup) a VFIOGroup structure. (Because
in this case, vfio/common.c can't open this group file via open(),
and what we have is a VFIO group fd shared by another process via
UNIX socket). And ...

> So if you have a groupfd, vfio will never have
> that same group opened already.

... if the same vhost-user backend process shares the same VFIO
group fd more than once via different vhost-user slave channels
to different vhost-user frontends in the same QEMU process, the
same VFIOGroup could exist already.

This case could happen when e.g. there are two vhost accelerator
devices in the same VFIO group, and they are managed by the same
vhost-user backend process, and used to accelerate two different
virtio devices in QEMU. In this case, the same VFIO group fd could
be sent twice.

If we need to support this case, more changes in vfio/common.c
will be needed, e.g. 1) add a refcnt in VFIOGroup to support
getting and putting a VFIOGroup structure multiple times,
2) add a lock to make vfio_get_group*() and vfio_put_group()
thread-safe.
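
Just to illustrate what 1) and 2) could look like (purely a sketch;
"refcnt" and the lock would be new additions, not existing hw/vfio
fields or APIs):

    #include "qemu/osdep.h"
    #include "qemu/thread.h"
    #include "hw/vfio/vfio-common.h"

    static QemuMutex vfio_group_lock;       /* new, assumed initialized at startup */

    /* Sketch: take an extra reference on a VFIOGroup. */
    static VFIOGroup *vfio_group_ref(VFIOGroup *group)
    {
        qemu_mutex_lock(&vfio_group_lock);
        group->refcnt++;                    /* "refcnt" would be a new field */
        qemu_mutex_unlock(&vfio_group_lock);
        return group;
    }

    /* Sketch: drop a reference; only tear the group down on the last put. */
    static void vfio_group_unref(VFIOGroup *group)
    {
        bool last;

        qemu_mutex_lock(&vfio_group_lock);
        last = (--group->refcnt == 0);
        qemu_mutex_unlock(&vfio_group_lock);

        if (last) {
            /* existing teardown: disconnect the container, delete the group
             * from KVM, close the group fd, free the VFIOGroup */
        }
    }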

> Is that the goal?  It's not obvious in
> the code.  I don't really understand what vhost goes on to do with this
> group,

The goal is that we have a VFIO group opened by an external
vhost-user backend process; this process will manage the VFIO
device, but wants QEMU to manage the VFIO group, including:

1 - program the DMA mappings for this VFIO group based on
    the front-end device's dma_as in QEMU.

2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
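
At the kernel-UAPI level, those two items roughly correspond to the
following (illustrative only; in QEMU this is driven by hw/vfio's
container and MemoryListener rather than open-coded like this):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <linux/vfio.h>

    /* 1. map one region of the device's DMA address space into the group's
     *    container (in QEMU the container's MemoryListener does this). */
    static int map_region(int container_fd, uint64_t iova, void *vaddr,
                          uint64_t size)
    {
        struct vfio_iommu_type1_dma_map map = {
            .argsz = sizeof(map),
            .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
            .vaddr = (uint64_t)(uintptr_t)vaddr,
            .iova  = iova,                 /* address as seen in vdev->dma_as */
            .size  = size,
        };
        return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
    }

    /* 2. add the group to the KVM "vfio" pseudo-device. */
    static int add_group_to_kvm(int kvm_vfio_dev_fd, int groupfd)
    {
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr  = KVM_DEV_VFIO_GROUP_ADD,
            .addr  = (uint64_t)(uintptr_t)&groupfd,
        };
        return ioctl(kvm_vfio_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
    }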

Vhost-user in QEMU, as the vhost-user frontend, just wants
hw/vfio to do the above things after receiving a VFIO group fd
from the vhost-user backend process.

Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
by calling vfio_get_group_from_fd() or put this VFIOGroup by
calling vfio_put_group() when it's not needed anymore, and
won't do anything else.

Vhost-user will only deal with the VFIOGroups allocated by
itself, and won't influence any other VFIOGroups used by the
VFIO passthrough devices (-device vfio-pci), because the same
VFIO group file can only be opened by one process via open().
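
For illustration of that single-open property (the group number "42"
below is made up):

    #include <fcntl.h>

    /* The VFIO group character device allows a single open at a time, so the
     * second open() fails while fd1 is still open. */
    int fd1 = open("/dev/vfio/42", O_RDWR);
    int fd2 = open("/dev/vfio/42", O_RDWR);   /* fails, typically with EBUSY */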

> but I'm pretty sure the existing state machine in vfio isn't
> designed for it.  Thanks,

Yeah, more changes (e.g. refcnt, ...) in hw/vfio are needed to
make it work robustly. Hmm.. Is the above idea (e.g. how vhost-user
is going to use VFIOGroup and how we are going to extend hw/vfio)
acceptable to you? Thanks!

Best regards,
Tiwei Bie
Alex Williamson July 27, 2018, 8:03 p.m. UTC | #7
On Fri, 27 Jul 2018 09:58:05 +0800
Tiwei Bie <tiwei.bie@intel.com> wrote:

> On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> > On Mon, 23 Jul 2018 12:59:56 +0800
> > Tiwei Bie <tiwei.bie@intel.com> wrote:
> >   
> [...]
> > >  
> > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > +                                              int *fd)
> > > +{
> > > +    struct vhost_user *u = dev->opaque;
> > > +    VhostUserState *user = u->user;
> > > +    VirtIODevice *vdev = dev->vdev;
> > > +    int groupfd = fd[0];
> > > +    VFIOGroup *group;
> > > +
> > > +    if (!virtio_has_feature(dev->protocol_features,
> > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > +        vdev == NULL) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (user->vfio_group) {
> > > +        vfio_put_group(user->vfio_group);
> > > +        user->vfio_group = NULL;
> > > +    }
> > > +  
> 
> There should be a below check here (I missed it in this
> patch, sorry..):
> 
>     if (groupfd < 0) {
>         return 0;
>     }
> 
> > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > +    if (group == NULL) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (group->fd != groupfd) {
> > > +        close(groupfd);
> > > +    }
> > > +
> > > +    user->vfio_group = group;
> > > +    fd[0] = -1;
> > > +
> > > +    return 0;
> > > +}  
> > 
> > This all looks very sketchy, we're reaching into vfio internal state
> > and arbitrarily releasing data structures, reusing existing ones, and
> > maybe creating new ones.  We know that a vfio group only allows a
> > single open, right?  
> 
> Yeah, exactly!
> 
> When the vhost-user backend process has opened a VFIO group fd,
> QEMU won't be able to open this group file via open() any more.
> So vhost-user backend process will share the VFIO group fd to
> QEMU via the UNIX socket. And that's why we're introducing a
> new API:
> 
> 	vfio_get_group_from_fd(groupfd, ...);
> 
> for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> in this case, vfio/common.c can't open this group file via open(),
> and what we have is a VFIO group fd shared by another process via
> UNIX socket). And ...
> 
> > So if you have a groupfd, vfio will never have
> > that same group opened already.  
> 
> ... if the same vhost-user backend process shares the same VFIO
> group fd more than one times via different vhost-user slave channels
> to different vhost-user frontends in the same QEMU process, the
> same VFIOGroup could exist already.
> 
> This case could happen when e.g. there are two vhost accelerator
> devices in the same VFIO group, and they are managed by the same
> vhost-user backend process, and used to accelerate two different
> virtio devices in QEMU. In this case, the same VFIO group fd could
> be sent twice.
> 
> If we need to support this case, more changes in vfio/common.c
> will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> getting and putting a VFIOGroup structure multiple times,
> 2) add a lock to make vfio_get_group*() and vfio_put_group()
> thread-safe.

This is the sort of case where it seems like we're just grabbing
internal interfaces and using them from other modules.  VFIOGroups have
a device_list and currently handles multiple puts by testing whether
that device list is empty (ie. we already have a refcnt effectively).
Now we have a group user that has no devices, which is not something
that it seems like we've designed or audited the code for handling.
IOW, while the header file lives in a common location, it's not really
intended to be an API within QEMU and it makes me a bit uncomfortable
to use it as such.
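
For reference, the existing "effective refcount" looks roughly like this
(paraphrased, not the exact hw/vfio/common.c code):

    /* The group is only torn down once no VFIODevice on its device_list
     * uses it any more. */
    void vfio_put_group(VFIOGroup *group)
    {
        if (!group || !QLIST_EMPTY(&group->device_list)) {
            return;                    /* still referenced by some device */
        }
        /* ... disconnect container, delete from KVM, close fd, g_free() ... */
    }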

> > Is that the goal?  It's not obvious in
> > the code.  I don't really understand what vhost goes on to do with this
> > group,  
> 
> The goal is that, we have a VFIO group opened by an external
> vhost-user backend process, this process will manage the VFIO
> device, but want QEMU to manage the VFIO group, including:
> 
> 1 - program the DMA mappings for this VFIO group based on
>     the front-end device's dma_as in QEMU.
> 
> 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> 
> Vhost-user in QEMU as the vhost-user frontend just wants
> hw/vfio to do above things after receiving a VFIO group fd
> from the vhost-user backend process.
> 
> Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> by calling vfio_get_group_from_fd() or put this VFIOGroup by
> calling vfio_put_group() when it's not needed anymore, and
> won't do anything else.
> 
> Vhost-user will only deal with the VFIOGroups allocated by
> itself, and won't influence any other VFIOGroups used by the
> VFIO passthru devices (-device vfio-pci). Because the same
> VFIO group file can only be opened by one process via open().

But there's nothing here that guarantees the reverse.  vhost cannot
open the group of an existing vfio-pci device, but vfio-pci can re-use
the group that vhost has already opened.  This is again where it seems
like we're trying to cherry pick from an internal API and leaving some
loose ends dangling.

> > but I'm pretty sure the existing state machine in vfio isn't
> > designed for it.  Thanks,  
> 
> Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> is going to use VFIOGroup and how we are going to extend hw/vfio)
> acceptable to you? Thanks!

I certainly would not want to duplicate managing the group, container,
and MemoryListener, but I think we need a more formal interface with at
least some notion of reference counting beyond the device list or
explicit knowledge of an external user.  I'm also curious if it really
makes sense that the vhost backend is opening the group rather than
letting QEMU do it.  It seems that vhost wants to deal with the device
and we can never have symmetry in the scenario above, where a group
might contain multiple devices and order matters because of where the
group is opened.  If we still had a notion of a VFIODevice for an
external user, then the device_list refcnt would just work.  Thanks,

Alex
Tiwei Bie July 30, 2018, 8:10 a.m. UTC | #8
On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote:
> On Fri, 27 Jul 2018 09:58:05 +0800
> Tiwei Bie <tiwei.bie@intel.com> wrote:
> 
> > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> > > On Mon, 23 Jul 2018 12:59:56 +0800
> > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > >   
> > [...]
> > > >  
> > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > > +                                              int *fd)
> > > > +{
> > > > +    struct vhost_user *u = dev->opaque;
> > > > +    VhostUserState *user = u->user;
> > > > +    VirtIODevice *vdev = dev->vdev;
> > > > +    int groupfd = fd[0];
> > > > +    VFIOGroup *group;
> > > > +
> > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > > +        vdev == NULL) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (user->vfio_group) {
> > > > +        vfio_put_group(user->vfio_group);
> > > > +        user->vfio_group = NULL;
> > > > +    }
> > > > +  
> > 
> > There should be a below check here (I missed it in this
> > patch, sorry..):
> > 
> >     if (groupfd < 0) {
> >         return 0;
> >     }
> > 
> > > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > > +    if (group == NULL) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (group->fd != groupfd) {
> > > > +        close(groupfd);
> > > > +    }
> > > > +
> > > > +    user->vfio_group = group;
> > > > +    fd[0] = -1;
> > > > +
> > > > +    return 0;
> > > > +}  
> > > 
> > > This all looks very sketchy, we're reaching into vfio internal state
> > > and arbitrarily releasing data structures, reusing existing ones, and
> > > maybe creating new ones.  We know that a vfio group only allows a
> > > single open, right?  
> > 
> > Yeah, exactly!
> > 
> > When the vhost-user backend process has opened a VFIO group fd,
> > QEMU won't be able to open this group file via open() any more.
> > So vhost-user backend process will share the VFIO group fd to
> > QEMU via the UNIX socket. And that's why we're introducing a
> > new API:
> > 
> > 	vfio_get_group_from_fd(groupfd, ...);
> > 
> > for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> > in this case, vfio/common.c can't open this group file via open(),
> > and what we have is a VFIO group fd shared by another process via
> > UNIX socket). And ...
> > 
> > > So if you have a groupfd, vfio will never have
> > > that same group opened already.  
> > 
> > ... if the same vhost-user backend process shares the same VFIO
> > group fd more than one times via different vhost-user slave channels
> > to different vhost-user frontends in the same QEMU process, the
> > same VFIOGroup could exist already.
> > 
> > This case could happen when e.g. there are two vhost accelerator
> > devices in the same VFIO group, and they are managed by the same
> > vhost-user backend process, and used to accelerate two different
> > virtio devices in QEMU. In this case, the same VFIO group fd could
> > be sent twice.
> > 
> > If we need to support this case, more changes in vfio/common.c
> > will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> > getting and putting a VFIOGroup structure multiple times,
> > 2) add a lock to make vfio_get_group*() and vfio_put_group()
> > thread-safe.
> 
> This is the sort of case where it seems like we're just grabbing
> internal interfaces and using them from other modules.  VFIOGroups have
> a device_list and currently handles multiple puts by testing whether
> that device list is empty (ie. we already have a refcnt effectively).
> Now we have a group user that has no devices, which is not something
> that it seems like we've designed or audited the code for handling.
> IOW, while the header file lives in a common location, it's not really
> intended to be an API within QEMU and it makes me a bit uncomfortable
> to use it as such.
> 
> > > Is that the goal?  It's not obvious in
> > > the code.  I don't really understand what vhost goes on to do with this
> > > group,  
> > 
> > The goal is that, we have a VFIO group opened by an external
> > vhost-user backend process, this process will manage the VFIO
> > device, but want QEMU to manage the VFIO group, including:
> > 
> > 1 - program the DMA mappings for this VFIO group based on
> >     the front-end device's dma_as in QEMU.
> > 
> > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> > 
> > Vhost-user in QEMU as the vhost-user frontend just wants
> > hw/vfio to do above things after receiving a VFIO group fd
> > from the vhost-user backend process.
> > 
> > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> > by calling vfio_get_group_from_fd() or put this VFIOGroup by
> > calling vfio_put_group() when it's not needed anymore, and
> > won't do anything else.
> > 
> > Vhost-user will only deal with the VFIOGroups allocated by
> > itself, and won't influence any other VFIOGroups used by the
> > VFIO passthru devices (-device vfio-pci). Because the same
> > VFIO group file can only be opened by one process via open().
> 
> But there's nothing here that guarantees the reverse.  vhost cannot
> open the group of an existing vfio-pci device, but vfio-pci can re-use
> the group that vhost has already opened.  This is again where it seems
> like we're trying to cherry pick from an internal API and leaving some
> loose ends dangling.
> 
> > > but I'm pretty sure the existing state machine in vfio isn't
> > > designed for it.  Thanks,  
> > 
> > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> > is going to use VFIOGroup and how we are going to extend hw/vfio)
> > acceptable to you? Thanks!
> 
> I certainly would not want to duplicate managing the group, container,
> and MemoryListener, but I think we need a more formal interface with at
> least some notion of reference counting beyond the device list or
> explicit knowledge of an external user.

Got it! I'll try to address this. Thanks!

> I'm also curious if it really
> makes sense that the vhost backend is opening the group rather than
> letting QEMU do it.  It seems that vhost wants to deal with the device
> and we can never have symmetry in the scenario above, where a group
> might contain multiple devices and order matters because of where the
> group is opened.  If we still had a notion of a VFIODevice for an
> external user, then the device_list refcnt would just work.  Thanks,

The vhost-user backend will open the VFIO fds, but QEMU
won't, because QEMU just has a vhost-user UNIX socket
path that it should connect to or listen on. So QEMU
doesn't know which group and device it should open.
The vhost accelerator devices are probed and managed
in the vhost-user backend process, and the vhost-user
backend process will bind the vhost-user instances
to vhost accelerator devices based on some sort of
user input.

Best regards,
Tiwei Bie
Michael S. Tsirkin July 30, 2018, 9:30 a.m. UTC | #9
On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote:
> On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote:
> > On Fri, 27 Jul 2018 09:58:05 +0800
> > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > 
> > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> > > > On Mon, 23 Jul 2018 12:59:56 +0800
> > > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > > >   
> > > [...]
> > > > >  
> > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > > > +                                              int *fd)
> > > > > +{
> > > > > +    struct vhost_user *u = dev->opaque;
> > > > > +    VhostUserState *user = u->user;
> > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > +    int groupfd = fd[0];
> > > > > +    VFIOGroup *group;
> > > > > +
> > > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > > > +        vdev == NULL) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (user->vfio_group) {
> > > > > +        vfio_put_group(user->vfio_group);
> > > > > +        user->vfio_group = NULL;
> > > > > +    }
> > > > > +  
> > > 
> > > There should be a below check here (I missed it in this
> > > patch, sorry..):
> > > 
> > >     if (groupfd < 0) {
> > >         return 0;
> > >     }
> > > 
> > > > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > > > +    if (group == NULL) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (group->fd != groupfd) {
> > > > > +        close(groupfd);
> > > > > +    }
> > > > > +
> > > > > +    user->vfio_group = group;
> > > > > +    fd[0] = -1;
> > > > > +
> > > > > +    return 0;
> > > > > +}  
> > > > 
> > > > This all looks very sketchy, we're reaching into vfio internal state
> > > > and arbitrarily releasing data structures, reusing existing ones, and
> > > > maybe creating new ones.  We know that a vfio group only allows a
> > > > single open, right?  
> > > 
> > > Yeah, exactly!
> > > 
> > > When the vhost-user backend process has opened a VFIO group fd,
> > > QEMU won't be able to open this group file via open() any more.
> > > So vhost-user backend process will share the VFIO group fd to
> > > QEMU via the UNIX socket. And that's why we're introducing a
> > > new API:
> > > 
> > > 	vfio_get_group_from_fd(groupfd, ...);
> > > 
> > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> > > in this case, vfio/common.c can't open this group file via open(),
> > > and what we have is a VFIO group fd shared by another process via
> > > UNIX socket). And ...
> > > 
> > > > So if you have a groupfd, vfio will never have
> > > > that same group opened already.  
> > > 
> > > ... if the same vhost-user backend process shares the same VFIO
> > > group fd more than one times via different vhost-user slave channels
> > > to different vhost-user frontends in the same QEMU process, the
> > > same VFIOGroup could exist already.
> > > 
> > > This case could happen when e.g. there are two vhost accelerator
> > > devices in the same VFIO group, and they are managed by the same
> > > vhost-user backend process, and used to accelerate two different
> > > virtio devices in QEMU. In this case, the same VFIO group fd could
> > > be sent twice.
> > > 
> > > If we need to support this case, more changes in vfio/common.c
> > > will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> > > getting and putting a VFIOGroup structure multiple times,
> > > 2) add a lock to make vfio_get_group*() and vfio_put_group()
> > > thread-safe.
> > 
> > This is the sort of case where it seems like we're just grabbing
> > internal interfaces and using them from other modules.  VFIOGroups have
> > a device_list and currently handles multiple puts by testing whether
> > that device list is empty (ie. we already have a refcnt effectively).
> > Now we have a group user that has no devices, which is not something
> > that it seems like we've designed or audited the code for handling.
> > IOW, while the header file lives in a common location, it's not really
> > intended to be an API within QEMU and it makes me a bit uncomfortable
> > to use it as such.
> > 
> > > > Is that the goal?  It's not obvious in
> > > > the code.  I don't really understand what vhost goes on to do with this
> > > > group,  
> > > 
> > > The goal is that, we have a VFIO group opened by an external
> > > vhost-user backend process, this process will manage the VFIO
> > > device, but want QEMU to manage the VFIO group, including:
> > > 
> > > 1 - program the DMA mappings for this VFIO group based on
> > >     the front-end device's dma_as in QEMU.
> > > 
> > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> > > 
> > > Vhost-user in QEMU as the vhost-user frontend just wants
> > > hw/vfio to do above things after receiving a VFIO group fd
> > > from the vhost-user backend process.
> > > 
> > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> > > by calling vfio_get_group_from_fd() or put this VFIOGroup by
> > > calling vfio_put_group() when it's not needed anymore, and
> > > won't do anything else.
> > > 
> > > Vhost-user will only deal with the VFIOGroups allocated by
> > > itself, and won't influence any other VFIOGroups used by the
> > > VFIO passthru devices (-device vfio-pci). Because the same
> > > VFIO group file can only be opened by one process via open().
> > 
> > But there's nothing here that guarantees the reverse.  vhost cannot
> > open the group of an existing vfio-pci device, but vfio-pci can re-use
> > the group that vhost has already opened.  This is again where it seems
> > like we're trying to cherry pick from an internal API and leaving some
> > loose ends dangling.
> > 
> > > > but I'm pretty sure the existing state machine in vfio isn't
> > > > designed for it.  Thanks,  
> > > 
> > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> > > is going to use VFIOGroup and how we are going to extend hw/vfio)
> > > acceptable to you? Thanks!
> > 
> > I certainly would not want to duplicate managing the group, container,
> > and MemoryListener, but I think we need a more formal interface with at
> > least some notion of reference counting beyond the device list or
> > explicit knowledge of an external user.
> 
> Got it! I'll try to address this. Thanks!
> 
> > I'm also curious if it really
> > makes sense that the vhost backend is opening the group rather than
> > letting QEMU do it.  It seems that vhost wants to deal with the device
> > and we can never have symmetry in the scenario above, where a group
> > might contain multiple devices and order matters because of where the
> > group is opened.  If we still had a notion of a VFIODevice for an
> > external user, then the device_list refcnt would just work.  Thanks,
> 
> Vhost-user backend will but QEMU won't open the VFIO
> fds, because QEMU just has a vhost-user UNIX socket
> path that it should connect to or listen on. So QEMU
> doesn't know which group and device it should open.
> The vhost accelerator devices are probed and managed
> in the vhost-user backend process. And the vhost-user
> backend process will bind the vhost-user instances
> and vhost accelerator devices based on some sort of
> user's input.
> 
> Best regards,
> Tiwei Bie

Is the reason it's done like this because the
backend is sharing the device with multiple QEMUs?

I generally wonder how restarts of the backend are handled
with this approach: closing the VFIO device tends to reset
the whole device.
Alex Williamson July 30, 2018, 4:20 p.m. UTC | #10
On Mon, 30 Jul 2018 12:30:58 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote:
> > On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote:  
> > > On Fri, 27 Jul 2018 09:58:05 +0800
> > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > >   
> > > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:  
> > > > > On Mon, 23 Jul 2018 12:59:56 +0800
> > > > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > > > >     
> > > > [...]  
> > > > > >  
> > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > > > > +                                              int *fd)
> > > > > > +{
> > > > > > +    struct vhost_user *u = dev->opaque;
> > > > > > +    VhostUserState *user = u->user;
> > > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > > +    int groupfd = fd[0];
> > > > > > +    VFIOGroup *group;
> > > > > > +
> > > > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > > > > +        vdev == NULL) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (user->vfio_group) {
> > > > > > +        vfio_put_group(user->vfio_group);
> > > > > > +        user->vfio_group = NULL;
> > > > > > +    }
> > > > > > +    
> > > > 
> > > > There should be a below check here (I missed it in this
> > > > patch, sorry..):
> > > > 
> > > >     if (groupfd < 0) {
> > > >         return 0;
> > > >     }
> > > >   
> > > > > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > > > > +    if (group == NULL) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (group->fd != groupfd) {
> > > > > > +        close(groupfd);
> > > > > > +    }
> > > > > > +
> > > > > > +    user->vfio_group = group;
> > > > > > +    fd[0] = -1;
> > > > > > +
> > > > > > +    return 0;
> > > > > > +}    
> > > > > 
> > > > > This all looks very sketchy, we're reaching into vfio internal state
> > > > > and arbitrarily releasing data structures, reusing existing ones, and
> > > > > maybe creating new ones.  We know that a vfio group only allows a
> > > > > single open, right?    
> > > > 
> > > > Yeah, exactly!
> > > > 
> > > > When the vhost-user backend process has opened a VFIO group fd,
> > > > QEMU won't be able to open this group file via open() any more.
> > > > So vhost-user backend process will share the VFIO group fd to
> > > > QEMU via the UNIX socket. And that's why we're introducing a
> > > > new API:
> > > > 
> > > > 	vfio_get_group_from_fd(groupfd, ...);
> > > > 
> > > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> > > > in this case, vfio/common.c can't open this group file via open(),
> > > > and what we have is a VFIO group fd shared by another process via
> > > > UNIX socket). And ...
> > > >   
> > > > > So if you have a groupfd, vfio will never have
> > > > > that same group opened already.    
> > > > 
> > > > ... if the same vhost-user backend process shares the same VFIO
> > > > group fd more than one times via different vhost-user slave channels
> > > > to different vhost-user frontends in the same QEMU process, the
> > > > same VFIOGroup could exist already.
> > > > 
> > > > This case could happen when e.g. there are two vhost accelerator
> > > > devices in the same VFIO group, and they are managed by the same
> > > > vhost-user backend process, and used to accelerate two different
> > > > virtio devices in QEMU. In this case, the same VFIO group fd could
> > > > be sent twice.
> > > > 
> > > > If we need to support this case, more changes in vfio/common.c
> > > > will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> > > > getting and putting a VFIOGroup structure multiple times,
> > > > 2) add a lock to make vfio_get_group*() and vfio_put_group()
> > > > thread-safe.  
> > > 
> > > This is the sort of case where it seems like we're just grabbing
> > > internal interfaces and using them from other modules.  VFIOGroups have
> > > a device_list and currently handles multiple puts by testing whether
> > > that device list is empty (ie. we already have a refcnt effectively).
> > > Now we have a group user that has no devices, which is not something
> > > that it seems like we've designed or audited the code for handling.
> > > IOW, while the header file lives in a common location, it's not really
> > > intended to be an API within QEMU and it makes me a bit uncomfortable
> > > to use it as such.
> > >   
> > > > > Is that the goal?  It's not obvious in
> > > > > the code.  I don't really understand what vhost goes on to do with this
> > > > > group,    
> > > > 
> > > > The goal is that, we have a VFIO group opened by an external
> > > > vhost-user backend process, this process will manage the VFIO
> > > > device, but want QEMU to manage the VFIO group, including:
> > > > 
> > > > 1 - program the DMA mappings for this VFIO group based on
> > > >     the front-end device's dma_as in QEMU.
> > > > 
> > > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> > > > 
> > > > Vhost-user in QEMU as the vhost-user frontend just wants
> > > > hw/vfio to do above things after receiving a VFIO group fd
> > > > from the vhost-user backend process.
> > > > 
> > > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> > > > by calling vfio_get_group_from_fd() or put this VFIOGroup by
> > > > calling vfio_put_group() when it's not needed anymore, and
> > > > won't do anything else.
> > > > 
> > > > Vhost-user will only deal with the VFIOGroups allocated by
> > > > itself, and won't influence any other VFIOGroups used by the
> > > > VFIO passthru devices (-device vfio-pci). Because the same
> > > > VFIO group file can only be opened by one process via open().  
> > > 
> > > But there's nothing here that guarantees the reverse.  vhost cannot
> > > open the group of an existing vfio-pci device, but vfio-pci can re-use
> > > the group that vhost has already opened.  This is again where it seems
> > > like we're trying to cherry pick from an internal API and leaving some
> > > loose ends dangling.
> > >   
> > > > > but I'm pretty sure the existing state machine in vfio isn't
> > > > > designed for it.  Thanks,    
> > > > 
> > > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> > > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> > > > is going to use VFIOGroup and how we are going to extend hw/vfio)
> > > > acceptable to you? Thanks!  
> > > 
> > > I certainly would not want to duplicate managing the group, container,
> > > and MemoryListener, but I think we need a more formal interface with at
> > > least some notion of reference counting beyond the device list or
> > > explicit knowledge of an external user.  
> > 
> > Got it! I'll try to address this. Thanks!
> >   
> > > I'm also curious if it really
> > > makes sense that the vhost backend is opening the group rather than
> > > letting QEMU do it.  It seems that vhost wants to deal with the device
> > > and we can never have symmetry in the scenario above, where a group
> > > might contain multiple devices and order matters because of where the
> > > group is opened.  If we still had a notion of a VFIODevice for an
> > > external user, then the device_list refcnt would just work.  Thanks,  
> > 
> > Vhost-user backend will but QEMU won't open the VFIO
> > fds, because QEMU just has a vhost-user UNIX socket
> > path that it should connect to or listen on. So QEMU
> > doesn't know which group and device it should open.
> > The vhost accelerator devices are probed and managed
> > in the vhost-user backend process. And the vhost-user
> > backend process will bind the vhost-user instances
> > and vhost accelerator devices based on some sort of
> > user's input.
> > 
> > Best regards,
> > Tiwei Bie  
> 
> Is the reason it's done like this because the
> backend is sharing the device with multiple QEMUs?

Each QEMU instance that gets passed a vfio group would attempt to add
it to a vfio container (ie. IOMMU domain), so this mechanism cannot
support multiple QEMU instances attached to the same group.  Beyond
that, the address space of each device would need to be unique within
the instance as we only have a single IOVA space for the group.  Thanks,

Alex
Tiwei Bie July 31, 2018, 7:47 a.m. UTC | #11
On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote:
> > On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote:
> > > On Fri, 27 Jul 2018 09:58:05 +0800
> > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > > 
> > > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> > > > > On Mon, 23 Jul 2018 12:59:56 +0800
> > > > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > > > >   
> > > > [...]
> > > > > >  
> > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > > > > +                                              int *fd)
> > > > > > +{
> > > > > > +    struct vhost_user *u = dev->opaque;
> > > > > > +    VhostUserState *user = u->user;
> > > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > > +    int groupfd = fd[0];
> > > > > > +    VFIOGroup *group;
> > > > > > +
> > > > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > > > > +        vdev == NULL) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (user->vfio_group) {
> > > > > > +        vfio_put_group(user->vfio_group);
> > > > > > +        user->vfio_group = NULL;
> > > > > > +    }
> > > > > > +  
> > > > 
> > > > There should be a below check here (I missed it in this
> > > > patch, sorry..):
> > > > 
> > > >     if (groupfd < 0) {
> > > >         return 0;
> > > >     }
> > > > 
> > > > > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > > > > +    if (group == NULL) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (group->fd != groupfd) {
> > > > > > +        close(groupfd);
> > > > > > +    }
> > > > > > +
> > > > > > +    user->vfio_group = group;
> > > > > > +    fd[0] = -1;
> > > > > > +
> > > > > > +    return 0;
> > > > > > +}  
> > > > > 
> > > > > This all looks very sketchy, we're reaching into vfio internal state
> > > > > and arbitrarily releasing data structures, reusing existing ones, and
> > > > > maybe creating new ones.  We know that a vfio group only allows a
> > > > > single open, right?  
> > > > 
> > > > Yeah, exactly!
> > > > 
> > > > When the vhost-user backend process has opened a VFIO group fd,
> > > > QEMU won't be able to open this group file via open() any more.
> > > > So vhost-user backend process will share the VFIO group fd to
> > > > QEMU via the UNIX socket. And that's why we're introducing a
> > > > new API:
> > > > 
> > > > 	vfio_get_group_from_fd(groupfd, ...);
> > > > 
> > > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> > > > in this case, vfio/common.c can't open this group file via open(),
> > > > and what we have is a VFIO group fd shared by another process via
> > > > UNIX socket). And ...
> > > > 
> > > > > So if you have a groupfd, vfio will never have
> > > > > that same group opened already.  
> > > > 
> > > > ... if the same vhost-user backend process shares the same VFIO
> > > > group fd more than one times via different vhost-user slave channels
> > > > to different vhost-user frontends in the same QEMU process, the
> > > > same VFIOGroup could exist already.
> > > > 
> > > > This case could happen when e.g. there are two vhost accelerator
> > > > devices in the same VFIO group, and they are managed by the same
> > > > vhost-user backend process, and used to accelerate two different
> > > > virtio devices in QEMU. In this case, the same VFIO group fd could
> > > > be sent twice.
> > > > 
> > > > If we need to support this case, more changes in vfio/common.c
> > > > will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> > > > getting and putting a VFIOGroup structure multiple times,
> > > > 2) add a lock to make vfio_get_group*() and vfio_put_group()
> > > > thread-safe.
> > > 
> > > This is the sort of case where it seems like we're just grabbing
> > > internal interfaces and using them from other modules.  VFIOGroups have
> > > a device_list and currently handles multiple puts by testing whether
> > > that device list is empty (ie. we already have a refcnt effectively).
> > > Now we have a group user that has no devices, which is not something
> > > that it seems like we've designed or audited the code for handling.
> > > IOW, while the header file lives in a common location, it's not really
> > > intended to be an API within QEMU and it makes me a bit uncomfortable
> > > to use it as such.
> > > 
> > > > > Is that the goal?  It's not obvious in
> > > > > the code.  I don't really understand what vhost goes on to do with this
> > > > > group,  
> > > > 
> > > > The goal is that, we have a VFIO group opened by an external
> > > > vhost-user backend process, this process will manage the VFIO
> > > > device, but want QEMU to manage the VFIO group, including:
> > > > 
> > > > 1 - program the DMA mappings for this VFIO group based on
> > > >     the front-end device's dma_as in QEMU.
> > > > 
> > > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> > > > 
> > > > Vhost-user in QEMU as the vhost-user frontend just wants
> > > > hw/vfio to do above things after receiving a VFIO group fd
> > > > from the vhost-user backend process.
> > > > 
> > > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> > > > by calling vfio_get_group_from_fd() or put this VFIOGroup by
> > > > calling vfio_put_group() when it's not needed anymore, and
> > > > won't do anything else.
> > > > 
> > > > Vhost-user will only deal with the VFIOGroups allocated by
> > > > itself, and won't influence any other VFIOGroups used by the
> > > > VFIO passthru devices (-device vfio-pci). Because the same
> > > > VFIO group file can only be opened by one process via open().
> > > 
> > > But there's nothing here that guarantees the reverse.  vhost cannot
> > > open the group of an existing vfio-pci device, but vfio-pci can re-use
> > > the group that vhost has already opened.  This is again where it seems
> > > like we're trying to cherry pick from an internal API and leaving some
> > > loose ends dangling.
> > > 
> > > > > but I'm pretty sure the existing state machine in vfio isn't
> > > > > designed for it.  Thanks,  
> > > > 
> > > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> > > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> > > > is going to use VFIOGroup and how we are going to extend hw/vfio)
> > > > acceptable to you? Thanks!
> > > 
> > > I certainly would not want to duplicate managing the group, container,
> > > and MemoryListener, but I think we need a more formal interface with at
> > > least some notion of reference counting beyond the device list or
> > > explicit knowledge of an external user.
> > 
> > Got it! I'll try to address this. Thanks!
> > 
> > > I'm also curious if it really
> > > makes sense that the vhost backend is opening the group rather than
> > > letting QEMU do it.  It seems that vhost wants to deal with the device
> > > and we can never have symmetry in the scenario above, where a group
> > > might contain multiple devices and order matters because of where the
> > > group is opened.  If we still had a notion of a VFIODevice for an
> > > external user, then the device_list refcnt would just work.  Thanks,
> > 
> > Vhost-user backend will but QEMU won't open the VFIO
> > fds, because QEMU just has a vhost-user UNIX socket
> > path that it should connect to or listen on. So QEMU
> > doesn't know which group and device it should open.
> > The vhost accelerator devices are probed and managed
> > in the vhost-user backend process. And the vhost-user
> > backend process will bind the vhost-user instances
> > and vhost accelerator devices based on some sort of
> > user's input.
> > 
> > Best regards,
> > Tiwei Bie
> 
> Is the reason it's done like this because the
> backend is sharing the device with multiple QEMUs?

IMO, in the vhost-user-based approach, it's natural
for QEMU users to just specify a vhost-user UNIX socket
path when launching QEMU. The vhost-user backend behind
this socket path can be implemented purely in software
or accelerated by hardware, and that choice is transparent
to QEMU users.

If we want an approach where QEMU users need to specify
the vhost accelerator device when launching QEMU (similar
to how users specify the vhost-user socket path), we may
not want to do it over vhost-user.

Best regards,
Tiwei Bie

> 
> I generally wonder how are restarts of the backend handled
> with this approach: closing the VFIO device tends to reset
> the whole device.
> 
> -- 
> MST
Tiwei Bie Sept. 12, 2018, 8:04 a.m. UTC | #12
On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
[...]
> 
> I generally wonder how are restarts of the backend handled
> with this approach: closing the VFIO device tends to reset
> the whole device.

Hi Michael,

I missed this comment previously.. This is a good point!
In this RFC, before sending the VFIO group fd to QEMU,
backend needs to close the VFIO device and unset the VFIO
container first. Otherwise, QEMU won't be able to set the
VFIO container for the VFIO group.

Another option is to share the container fd instead of
the group fd to QEMU. In this case, backend won't need
to close any fd. But there is one problem that, it's
hard to unmap the old mappings, especially when QEMU
crashes. Do you have any suggestions? Thanks!

Best regards,
Tiwei Bie
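
(A minimal sketch of that backend-side teardown, assuming the backend
tracks the VFIO fds it opened. device_fd/group_fd are illustrative
names, and the vhost-user framing actually used to send the fd over the
slave channel is omitted.)

    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Sketch: release the device and detach the group from the backend's
     * own container so QEMU can attach the group to its container after
     * receiving the group fd via VHOST_USER_SLAVE_VFIO_GROUP_MSG. */
    static int release_group_for_qemu(int device_fd, int group_fd)
    {
        close(device_fd);                       /* note: tends to reset the device */
        if (ioctl(group_fd, VFIO_GROUP_UNSET_CONTAINER) < 0) {
            return -1;
        }
        /* group_fd can now be sent over the slave channel as SCM_RIGHTS
         * ancillary data and attached to QEMU's container. */
        return 0;
    }
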
Michael S. Tsirkin Sept. 12, 2018, 4:14 p.m. UTC | #13
On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:
> On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> [...]
> > 
> > I generally wonder how are restarts of the backend handled
> > with this approach: closing the VFIO device tends to reset
> > the whole device.
> 
> Hi Michael,
> 
> I missed this comment previously.. This is a good point!
> In this RFC, before sending the VFIO group fd to QEMU,
> backend needs to close the VFIO device and unset the VFIO
> container first. Otherwise, QEMU won't be able to set the
> VFIO container for the VFIO group.
> 
> Another option is to share the container fd instead of
> the group fd to QEMU. In this case, backend won't need
> to close any fd. But there is one problem that, it's
> hard to unmap the old mappings, especially when QEMU
> crashes.

What are these old mappings and who creates them?
If you want to just reset everything the way it was
on open, surely it would be easy to add such a reset ioctl.

> Do you have any suggestions? Thanks!
> 
> Best regards,
> Tiwei Bie

Donnu. Alex, any thoughts? Which approach would you prefer?
Alex Williamson Sept. 12, 2018, 4:34 p.m. UTC | #14
On Wed, 12 Sep 2018 12:14:44 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:
> > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > [...]  
> > > 
> > > I generally wonder how are restarts of the backend handled
> > > with this approach: closing the VFIO device tends to reset
> > > the whole device.  
> > 
> > Hi Michael,
> > 
> > I missed this comment previously.. This is a good point!
> > In this RFC, before sending the VFIO group fd to QEMU,
> > backend needs to close the VFIO device and unset the VFIO
> > container first. Otherwise, QEMU won't be able to set the
> > VFIO container for the VFIO group.
> > 
> > Another option is to share the container fd instead of
> > the group fd to QEMU. In this case, backend won't need
> > to close any fd. But there is one problem that, it's
> > hard to unmap the old mappings, especially when QEMU
> > crashes.  
> 
> What are these old mappings and who creates them?
> If you want to just reset everything the way it was
> on open, surely it would be easy to add such a reset ioctl.
> 
> > Do you have any suggestions? Thanks!
> > 
> > Best regards,
> > Tiwei Bie  
> 
> Donnu. Alex, any thoughts? Which approach would you prefer?

The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
that an unmap does not bisect previous mappings, ie. a previous mapping
cannot be partially unmapped.  Therefore you can already dump the
entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
size = (u64)-1.  Thanks,

Alex
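
(Concretely, the blanket unmap Alex describes is a single ioctl on the
container fd; a sketch, where container_fd is assumed to be the type1
container in question.)

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Drop every mapping in a type1 container in one call.  iova = 0 with
     * size = (u64)-1 covers [0, 2^64 - 2], i.e. everything except the
     * very last byte of the address space (see the discussion below). */
    static int unmap_whole_container(int container_fd)
    {
        struct vfio_iommu_type1_dma_unmap unmap = {
            .argsz = sizeof(unmap),
            .flags = 0,
            .iova  = 0,
            .size  = (uint64_t)-1,
        };
        return ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
    }
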
Michael S. Tsirkin Sept. 12, 2018, 4:44 p.m. UTC | #15
On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:
> On Wed, 12 Sep 2018 12:14:44 -0400
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:
> > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > > [...]  
> > > > 
> > > > I generally wonder how are restarts of the backend handled
> > > > with this approach: closing the VFIO device tends to reset
> > > > the whole device.  
> > > 
> > > Hi Michael,
> > > 
> > > I missed this comment previously.. This is a good point!
> > > In this RFC, before sending the VFIO group fd to QEMU,
> > > backend needs to close the VFIO device and unset the VFIO
> > > container first. Otherwise, QEMU won't be able to set the
> > > VFIO container for the VFIO group.
> > > 
> > > Another option is to share the container fd instead of
> > > the group fd to QEMU. In this case, backend won't need
> > > to close any fd. But there is one problem that, it's
> > > hard to unmap the old mappings, especially when QEMU
> > > crashes.  
> > 
> > What are these old mappings and who creates them?
> > If you want to just reset everything the way it was
> > on open, surely it would be easy to add such a reset ioctl.
> > 
> > > Do you have any suggestions? Thanks!
> > > 
> > > Best regards,
> > > Tiwei Bie  
> > 
> > Donnu. Alex, any thoughts? Which approach would you prefer?
> 
> The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
> that an unmap does not bisect previous mappings, ie. a previous mapping
> cannot be partially unmapped.  Therefore you can already dump the
> entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
> size = (u64)-1.  Thanks,
> 
> Alex

Hmm this would exclude the last byte (address (u64)-1).
VTD does not support such iova values for now but something
to keep in mind e.g. for virtio-iommu with nested virt
which does.
Alex Williamson Sept. 12, 2018, 5:15 p.m. UTC | #16
On Wed, 12 Sep 2018 12:44:15 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:
> > On Wed, 12 Sep 2018 12:14:44 -0400
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:  
> > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > > > [...]    
> > > > > 
> > > > > I generally wonder how are restarts of the backend handled
> > > > > with this approach: closing the VFIO device tends to reset
> > > > > the whole device.    
> > > > 
> > > > Hi Michael,
> > > > 
> > > > I missed this comment previously.. This is a good point!
> > > > In this RFC, before sending the VFIO group fd to QEMU,
> > > > backend needs to close the VFIO device and unset the VFIO
> > > > container first. Otherwise, QEMU won't be able to set the
> > > > VFIO container for the VFIO group.
> > > > 
> > > > Another option is to share the container fd instead of
> > > > the group fd to QEMU. In this case, backend won't need
> > > > to close any fd. But there is one problem that, it's
> > > > hard to unmap the old mappings, especially when QEMU
> > > > crashes.    
> > > 
> > > What are these old mappings and who creates them?
> > > If you want to just reset everything the way it was
> > > on open, surely it would be easy to add such a reset ioctl.
> > >   
> > > > Do you have any suggestions? Thanks!
> > > > 
> > > > Best regards,
> > > > Tiwei Bie    
> > > 
> > > Donnu. Alex, any thoughts? Which approach would you prefer?  
> > 
> > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
> > that an unmap does not bisect previous mappings, ie. a previous mapping
> > cannot be partially unmapped.  Therefore you can already dump the
> > entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
> > size = (u64)-1.  Thanks,
> > 
> > Alex  
> 
> Hmm this would exclude the last byte (address (u64)-1).
> VTD does not support such iova values for now but something
> to keep in mind e.g. for virtio-iommu with nested virt
> which does.

Yes, technically you'd need another ioctl to get the last byte, the
ioctls should have used start and end rather than start and size, but
it's too late to change.  Thanks,

Alex
Michael S. Tsirkin Sept. 12, 2018, 5:29 p.m. UTC | #17
On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote:
> On Wed, 12 Sep 2018 12:44:15 -0400
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:
> > > On Wed, 12 Sep 2018 12:14:44 -0400
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:  
> > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > > > > [...]    
> > > > > > 
> > > > > > I generally wonder how are restarts of the backend handled
> > > > > > with this approach: closing the VFIO device tends to reset
> > > > > > the whole device.    
> > > > > 
> > > > > Hi Michael,
> > > > > 
> > > > > I missed this comment previously.. This is a good point!
> > > > > In this RFC, before sending the VFIO group fd to QEMU,
> > > > > backend needs to close the VFIO device and unset the VFIO
> > > > > container first. Otherwise, QEMU won't be able to set the
> > > > > VFIO container for the VFIO group.
> > > > > 
> > > > > Another option is to share the container fd instead of
> > > > > the group fd to QEMU. In this case, backend won't need
> > > > > to close any fd. But there is one problem that, it's
> > > > > hard to unmap the old mappings, especially when QEMU
> > > > > crashes.    
> > > > 
> > > > What are these old mappings and who creates them?
> > > > If you want to just reset everything the way it was
> > > > on open, surely it would be easy to add such a reset ioctl.
> > > >   
> > > > > Do you have any suggestions? Thanks!
> > > > > 
> > > > > Best regards,
> > > > > Tiwei Bie    
> > > > 
> > > > Donnu. Alex, any thoughts? Which approach would you prefer?  
> > > 
> > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
> > > that an unmap does not bisect previous mappings, ie. a previous mapping
> > > cannot be partially unmapped.  Therefore you can already dump the
> > > entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
> > > size = (u64)-1.  Thanks,
> > > 
> > > Alex  
> > 
> > Hmm this would exclude the last byte (address (u64)-1).
> > VTD does not support such iova values for now but something
> > to keep in mind e.g. for virtio-iommu with nested virt
> > which does.
> 
> Yes, technically you'd need another ioctl to get the last byte,

It will violate the requirement of not unmapping part of a region,
won't it? So it won't work.

> the
> ioctls should have used start and end rather than start and size, but
> it's too late to change.  Thanks,
> 
> Alex

At some point we'll have to fix above issue in some way. Maybe when we
add support for atomic remaps.
Alex Williamson Sept. 12, 2018, 6:09 p.m. UTC | #18
On Wed, 12 Sep 2018 13:29:33 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote:
> > On Wed, 12 Sep 2018 12:44:15 -0400
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:  
> > > > On Wed, 12 Sep 2018 12:14:44 -0400
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > >     
> > > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:    
> > > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > > > > > [...]      
> > > > > > > 
> > > > > > > I generally wonder how are restarts of the backend handled
> > > > > > > with this approach: closing the VFIO device tends to reset
> > > > > > > the whole device.      
> > > > > > 
> > > > > > Hi Michael,
> > > > > > 
> > > > > > I missed this comment previously.. This is a good point!
> > > > > > In this RFC, before sending the VFIO group fd to QEMU,
> > > > > > backend needs to close the VFIO device and unset the VFIO
> > > > > > container first. Otherwise, QEMU won't be able to set the
> > > > > > VFIO container for the VFIO group.
> > > > > > 
> > > > > > Another option is to share the container fd instead of
> > > > > > the group fd to QEMU. In this case, backend won't need
> > > > > > to close any fd. But there is one problem that, it's
> > > > > > hard to unmap the old mappings, especially when QEMU
> > > > > > crashes.      
> > > > > 
> > > > > What are these old mappings and who creates them?
> > > > > If you want to just reset everything the way it was
> > > > > on open, surely it would be easy to add such a reset ioctl.
> > > > >     
> > > > > > Do you have any suggestions? Thanks!
> > > > > > 
> > > > > > Best regards,
> > > > > > Tiwei Bie      
> > > > > 
> > > > > Donnu. Alex, any thoughts? Which approach would you prefer?    
> > > > 
> > > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
> > > > that an unmap does not bisect previous mappings, ie. a previous mapping
> > > > cannot be partially unmapped.  Therefore you can already dump the
> > > > entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
> > > > size = (u64)-1.  Thanks,
> > > > 
> > > > Alex    
> > > 
> > > Hmm this would exclude the last byte (address (u64)-1).
> > > VTD does not support such iova values for now but something
> > > to keep in mind e.g. for virtio-iommu with nested virt
> > > which does.  
> > 
> > Yes, technically you'd need another ioctl to get the last byte,  
> 
> It will violate the requirement of not unmapping part of a region,
> won't it? So it won't work.

Yes, in theory it's more complicated, but in practice it's not because
Intel only supports up to 48-bits of IOVA space (modulo round down by
a page, not a byte).  It would also be deterministic if we could ever
get past the RMRR issues with IGD to implement an IOVA list as every
IOMMU (AFAIK) has some reserved ranges that cannot be mapped and would
bisect the IOVA space somewhere.
 
> > the
> > ioctls should have used start and end rather than start and size, but
> > it's too late to change.  Thanks,
> > 
> > Alex  
> 
> At some point we'll have to fix above issue in some way. Maybe when we
> add support for atomic remaps.

"Patches welcome".  Thanks,

Alex
Tian, Kevin Sept. 13, 2018, 5:26 a.m. UTC | #19
> From: Alex Williamson
> Sent: Thursday, September 13, 2018 2:10 AM
> 
> On Wed, 12 Sep 2018 13:29:33 -0400
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote:
> > > On Wed, 12 Sep 2018 12:44:15 -0400
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >
> > > > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:
> > > > > On Wed, 12 Sep 2018 12:14:44 -0400
> > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > >
> > > > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:
> > > > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin
> wrote:
> > > > > > > [...]
> > > > > > > >
> > > > > > > > I generally wonder how are restarts of the backend handled
> > > > > > > > with this approach: closing the VFIO device tends to reset
> > > > > > > > the whole device.
> > > > > > >
> > > > > > > Hi Michael,
> > > > > > >
> > > > > > > I missed this comment previously.. This is a good point!
> > > > > > > In this RFC, before sending the VFIO group fd to QEMU,
> > > > > > > backend needs to close the VFIO device and unset the VFIO
> > > > > > > container first. Otherwise, QEMU won't be able to set the
> > > > > > > VFIO container for the VFIO group.
> > > > > > >
> > > > > > > Another option is to share the container fd instead of
> > > > > > > the group fd to QEMU. In this case, backend won't need
> > > > > > > to close any fd. But there is one problem that, it's
> > > > > > > hard to unmap the old mappings, especially when QEMU
> > > > > > > crashes.
> > > > > >
> > > > > > What are these old mappings and who creates them?
> > > > > > If you want to just reset everything the way it was
> > > > > > on open, surely it would be easy to add such a reset ioctl.
> > > > > >
> > > > > > > Do you have any suggestions? Thanks!
> > > > > > >
> > > > > > > Best regards,
> > > > > > > Tiwei Bie
> > > > > >
> > > > > > Donnu. Alex, any thoughts? Which approach would you prefer?
> > > > >
> > > > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only
> requires
> > > > > that an unmap does not bisect previous mappings, ie. a previous
> mapping
> > > > > cannot be partially unmapped.  Therefore you can already dump the
> > > > > entire IOVA space for a container with one UNMAP_DMA call, iova =
> 0,
> > > > > size = (u64)-1.  Thanks,
> > > > >
> > > > > Alex
> > > >
> > > > Hmm this would exclude the last byte (address (u64)-1).
> > > > VTD does not support such iova values for now but something
> > > > to keep in mind e.g. for virtio-iommu with nested virt
> > > > which does.
> > >
> > > Yes, technically you'd need another ioctl to get the last byte,
> >
> > It will violate the requirement of not unmapping part of a region,
> > won't it? So it won't work.
> 
> Yes, in theory it's more complicated, but in practice it's not because
> Intel only supports up to 48-bits of IOVA space (modulo round down by
> a page, not a byte).  It would also be deterministic if we could ever
> get past the RMRR issues with IGD to implement an IOVA list as every
> IOMMU (AFAIK) has some reserved ranges that cannot be mapped and
> would
> bisect the IOVA space somewhere.
> 

One correction - Intel VT-d supports 5-level paging now, which gives
a 57-bit IOVA space, though this comment doesn't affect your
discussion... :-)

Thanks
Kevin
diff mbox series

Patch

diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
index f59667f498..a57a8f9451 100644
--- a/docs/interop/vhost-user.txt
+++ b/docs/interop/vhost-user.txt
@@ -397,6 +397,7 @@  Protocol features
 #define VHOST_USER_PROTOCOL_F_CONFIG         9
 #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
+#define VHOST_USER_PROTOCOL_F_VFIO_GROUP     12
 
 Master message types
 --------------------
@@ -815,6 +816,21 @@  Slave message types
       This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
       protocol feature has been successfully negotiated.
 
+ * VHOST_USER_SLAVE_VFIO_GROUP_MSG
+
+      Id: 4
+      Equivalent ioctl: N/A
+      Slave payload: N/A
+      Master payload: N/A
+
+      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
+      could send this request to share its VFIO group fd via ancillary data
+      to master. After receiving this request from slave, master will close
+      the existing VFIO group if any and do the DMA programming based on the
+      virtio device's DMA address space for the new group if the request is
+      sent with a file descriptor.
+
+
 VHOST_USER_PROTOCOL_F_REPLY_ACK:
 -------------------------------
 The original vhost-user specification only demands replies for certain
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index b041343632..db958e24c7 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -52,6 +52,7 @@  enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_CONFIG = 9,
     VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
     VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+    VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12,
     VHOST_USER_PROTOCOL_F_MAX
 };
 
@@ -97,6 +98,7 @@  typedef enum VhostUserSlaveRequest {
     VHOST_USER_SLAVE_IOTLB_MSG = 1,
     VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
     VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+    VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4,
     VHOST_USER_SLAVE_MAX
 }  VhostUserSlaveRequest;
 
@@ -949,6 +951,41 @@  static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
     return 0;
 }
 
+static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
+                                              int *fd)
+{
+    struct vhost_user *u = dev->opaque;
+    VhostUserState *user = u->user;
+    VirtIODevice *vdev = dev->vdev;
+    int groupfd = fd[0];
+    VFIOGroup *group;
+
+    if (!virtio_has_feature(dev->protocol_features,
+                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
+        vdev == NULL) {
+        return -1;
+    }
+
+    if (user->vfio_group) {
+        vfio_put_group(user->vfio_group);
+        user->vfio_group = NULL;
+    }
+
+    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
+    if (group == NULL) {
+        return -1;
+    }
+
+    if (group->fd != groupfd) {
+        close(groupfd);
+    }
+
+    user->vfio_group = group;
+    fd[0] = -1;
+
+    return 0;
+}
+
 static void slave_read(void *opaque)
 {
     struct vhost_dev *dev = opaque;
@@ -1021,6 +1058,9 @@  static void slave_read(void *opaque)
         ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
                                                           fd[0]);
         break;
+    case VHOST_USER_SLAVE_VFIO_GROUP_MSG:
+        ret = vhost_user_slave_handle_vfio_group(dev, fd);
+        break;
     default:
         error_report("Received unexpected msg type.");
         ret = -EINVAL;
diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
index fd660393a0..9e11473274 100644
--- a/include/hw/virtio/vhost-user.h
+++ b/include/hw/virtio/vhost-user.h
@@ -10,6 +10,7 @@ 
 
 #include "chardev/char-fe.h"
 #include "hw/virtio/virtio.h"
+#include "hw/vfio/vfio-common.h"
 
 typedef struct VhostUserHostNotifier {
     MemoryRegion mr;
@@ -20,6 +21,7 @@  typedef struct VhostUserHostNotifier {
 typedef struct VhostUserState {
     CharBackend *chr;
     VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
+    VFIOGroup *vfio_group;
 } VhostUserState;
 
 VhostUserState *vhost_user_init(void);
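
For reference, below is a sketch of how the slave-channel handler from
this patch would look with the missing groupfd < 0 check (mentioned by
Tiwei in the thread) folded in. It reuses the vfio_get_group_from_fd()
and vfio_put_group() helpers introduced earlier in this series and is
not part of the posted patch.

    static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
                                                  int *fd)
    {
        struct vhost_user *u = dev->opaque;
        VhostUserState *user = u->user;
        VirtIODevice *vdev = dev->vdev;
        int groupfd = fd[0];
        VFIOGroup *group;

        if (!virtio_has_feature(dev->protocol_features,
                                VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
            vdev == NULL) {
            return -1;
        }

        /* Drop the group from any previous message before taking a new one. */
        if (user->vfio_group) {
            vfio_put_group(user->vfio_group);
            user->vfio_group = NULL;
        }

        /* A message without a file descriptor only detaches the old group. */
        if (groupfd < 0) {
            return 0;
        }

        group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
        if (group == NULL) {
            return -1;
        }

        /* vfio_get_group_from_fd() may return an already-known group with
         * its own fd; close the duplicate we received in that case. */
        if (group->fd != groupfd) {
            close(groupfd);
        }

        user->vfio_group = group;
        fd[0] = -1;   /* ownership of the fd has been transferred */

        return 0;
    }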