diff mbox series

[RFC,v3,3/3] vhost: Allocate memory for packed vring

Message ID 20240802112138.46831-4-sahilcdq@proton.me
State New
Headers show
Series Add packed virtqueue to shadow virtqueue | expand

Commit Message

Sahil Aug. 2, 2024, 11:21 a.m. UTC
Allocate memory for the packed vq format and support
packed vq in the SVQ "start" and "stop" operations.

Signed-off-by: Sahil Siddiq <sahilcdq@proton.me>
---
Changes v2 -> v3:
* vhost-shadow-virtqueue.c
  (vhost_svq_memory_packed): New function
  (vhost_svq_start):
  - Remove common variables out of if-else branch.
  (vhost_svq_stop):
  - Add support for packed vq.
  (vhost_svq_get_vring_addr): Revert changes
  (vhost_svq_get_vring_addr_packed): Likwise.
* vhost-shadow-virtqueue.h
  - Revert changes made to "vhost_svq_get_vring_addr*"
    functions.
* vhost-vdpa.c: Revert changes.

 hw/virtio/vhost-shadow-virtqueue.c | 56 +++++++++++++++++++++++-------
 hw/virtio/vhost-shadow-virtqueue.h |  4 +++
 2 files changed, 47 insertions(+), 13 deletions(-)

Comments

Eugenio Perez Martin Aug. 7, 2024, 4:22 p.m. UTC | #1
On Fri, Aug 2, 2024 at 1:22 PM Sahil Siddiq <icegambit91@gmail.com> wrote:
>
> Allocate memory for the packed vq format and support
> packed vq in the SVQ "start" and "stop" operations.
>
> Signed-off-by: Sahil Siddiq <sahilcdq@proton.me>
> ---
> Changes v2 -> v3:
> * vhost-shadow-virtqueue.c
>   (vhost_svq_memory_packed): New function
>   (vhost_svq_start):
>   - Remove common variables out of if-else branch.
>   (vhost_svq_stop):
>   - Add support for packed vq.
>   (vhost_svq_get_vring_addr): Revert changes
>   (vhost_svq_get_vring_addr_packed): Likwise.
> * vhost-shadow-virtqueue.h
>   - Revert changes made to "vhost_svq_get_vring_addr*"
>     functions.
> * vhost-vdpa.c: Revert changes.
>
>  hw/virtio/vhost-shadow-virtqueue.c | 56 +++++++++++++++++++++++-------
>  hw/virtio/vhost-shadow-virtqueue.h |  4 +++
>  2 files changed, 47 insertions(+), 13 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index 4c308ee53d..f4285db2b4 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -645,6 +645,8 @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
>
>  /**
>   * Get the shadow vq vring address.
> + * This is used irrespective of whether the
> + * split or packed vq format is used.
>   * @svq: Shadow virtqueue
>   * @addr: Destination to store address
>   */
> @@ -672,6 +674,16 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
>      return ROUND_UP(used_size, qemu_real_host_page_size());
>  }
>
> +size_t vhost_svq_memory_packed(const VhostShadowVirtqueue *svq)
> +{
> +    size_t desc_size = sizeof(struct vring_packed_desc) * svq->num_free;
> +    size_t driver_event_suppression = sizeof(struct vring_packed_desc_event);
> +    size_t device_event_suppression = sizeof(struct vring_packed_desc_event);
> +
> +    return ROUND_UP(desc_size + driver_event_suppression + device_event_suppression,
> +                    qemu_real_host_page_size());
> +}
> +
>  /**
>   * Set a new file descriptor for the guest to kick the SVQ and notify for avail
>   *
> @@ -726,17 +738,30 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>
>      svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
>      svq->num_free = svq->vring.num;
> -    svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> -                           -1, 0);
> -    desc_size = sizeof(vring_desc_t) * svq->vring.num;
> -    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> -    svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> -                           -1, 0);
> -    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
> -    svq->desc_next = g_new0(uint16_t, svq->vring.num);
> -    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> +    svq->is_packed = virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED);
> +
> +    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED)) {
> +        svq->vring_packed.vring.desc = mmap(NULL, vhost_svq_memory_packed(svq),
> +                                            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> +                                            -1, 0);
> +        desc_size = sizeof(struct vring_packed_desc) * svq->vring.num;
> +        svq->vring_packed.vring.driver = (void *)((char *)svq->vring_packed.vring.desc + desc_size);
> +        svq->vring_packed.vring.device = (void *)((char *)svq->vring_packed.vring.driver +
> +                                                  sizeof(struct vring_packed_desc_event));

This is a great start but it will be problematic when you start
mapping the areas to the vdpa device. The driver area should be read
only for the device, but it is placed in the same page as a RW one.

More on this later.

> +    } else {
> +        svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> +                               PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> +                               -1, 0);
> +        desc_size = sizeof(vring_desc_t) * svq->vring.num;
> +        svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> +        svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> +                               PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> +                               -1, 0);
> +    }

I think it will be beneficial to avoid "if (packed)" conditionals on
the exposed functions that give information about the memory maps.
These need to be replicated at
hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings.

However, the current one depends on the driver area to live in the
same page as the descriptor area, so it is not suitable for this.

So what about this action plan:
1) Make the avail ring (or driver area) independent of the descriptor ring.
2) Return the mapping permissions of the descriptor area (not needed
here, but needed for hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings

> +
> +    svq->desc_state = g_new0(SVQDescState, svq->num_free);
> +    svq->desc_next = g_new0(uint16_t, svq->num_free);
> +    for (unsigned i = 0; i < svq->num_free - 1; i++) {
>          svq->desc_next[i] = cpu_to_le16(i + 1);
>      }
>  }
> @@ -776,8 +801,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
>      svq->vq = NULL;
>      g_free(svq->desc_next);
>      g_free(svq->desc_state);
> -    munmap(svq->vring.desc, vhost_svq_driver_area_size(svq));
> -    munmap(svq->vring.used, vhost_svq_device_area_size(svq));
> +
> +    if (svq->is_packed) {
> +        munmap(svq->vring_packed.vring.desc, vhost_svq_memory_packed(svq));
> +    } else {
> +        munmap(svq->vring.desc, vhost_svq_driver_area_size(svq));
> +        munmap(svq->vring.used, vhost_svq_device_area_size(svq));
> +    }
>      event_notifier_set_handler(&svq->hdev_call, NULL);
>  }
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> index ee1a87f523..03b722a186 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.h
> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> @@ -67,6 +67,9 @@ struct vring_packed {
>
>  /* Shadow virtqueue to relay notifications */
>  typedef struct VhostShadowVirtqueue {
> +    /* True if packed virtqueue */
> +    bool is_packed;
> +
>      /* Virtio queue shadowing */
>      VirtQueue *vq;
>
> @@ -150,6 +153,7 @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
>                                struct vhost_vring_addr *addr);
>  size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
>  size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> +size_t vhost_svq_memory_packed(const VhostShadowVirtqueue *svq);
>
>  void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>                       VirtQueue *vq, VhostIOVATree *iova_tree);
> --
> 2.45.2
>
Sahil Aug. 11, 2024, 3:37 p.m. UTC | #2
Hi,

Thank you for your reply.

On Wednesday, August 7, 2024 10:11:52 PM GMT+5:30 you wrote:
> On Fri, Aug 2, 2024 at 1:22 PM Sahil Siddiq <icegambit91@gmail.com> wrote:
> [...]
> > I'll also test these changes out by following the
> > suggestions given in response to v1. I'll have more
> > confidence once I know these changes work.
> 
> Please let me know if you need help with the testing!
> 

Sure, I'll let you know if I run into any issues.

On Wednesday, August 7, 2024 10:10:51 PM GMT+5:30 you wrote:
> On Fri, Aug 2, 2024 at 1:22 PM Sahil Siddiq <icegambit91@gmail.com> wrote:
> [...]
> As a suggestion, we can split it into:
> 1) Refactor in vhost_svq_translate_addr to support out_num+in_num. No
> functional change.
> 2) Refactor vhost_svq_add_split to extract common code into
> vhost_svq_add. No functional change.
> 3) Adding packed code.
> 
> How to split or merge the patches is not a well-defined thing, so I'm
> happy with this patch if you think the reactor is not worth it.

I think your suggestion will make the changeset neater. If feasible,
I'll refactor it after testing my changes and clearing up the page
mapping issue.

Thanks,
Sahil
Sahil Aug. 11, 2024, 5:20 p.m. UTC | #3
Hi,

On Wednesday, August 7, 2024 9:52:10 PM GMT+5:30 Eugenio Perez Martin wrote:
> On Fri, Aug 2, 2024 at 1:22 PM Sahil Siddiq <icegambit91@gmail.com> wrote:
> > [...]
> > @@ -726,17 +738,30 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> >      svq->vring.num = virtio_queue_get_num(vdev,
> >      virtio_get_queue_index(vq));
> >      svq->num_free = svq->vring.num;
> > 
> > -    svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> > -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > -                           -1, 0);
> > -    desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > -    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > -    svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> > -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > -                           -1, 0);
> > -    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
> > -    svq->desc_next = g_new0(uint16_t, svq->vring.num);
> > -    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > +    svq->is_packed = virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED);
> > +
> > +    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED)) {
> > +        svq->vring_packed.vring.desc = mmap(NULL, vhost_svq_memory_packed(svq),
> > +                                          PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > +                                          -1, 0);
> > +        desc_size = sizeof(struct vring_packed_desc) * svq->vring.num;
> > +        svq->vring_packed.vring.driver = (void *)((char *)svq->vring_packed.vring.desc + desc_size);
> > +        svq->vring_packed.vring.device = (void *)((char *)svq->vring_packed.vring.driver +
> > +                                     sizeof(struct vring_packed_desc_event));
>
> This is a great start but it will be problematic when you start
> mapping the areas to the vdpa device. The driver area should be read
> only for the device, but it is placed in the same page as a RW one.
>
> More on this later.
> 
> > +    } else {
> > +        svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> > +                               PROT_READ | PROT_WRITE, MAP_SHARED |MAP_ANONYMOUS,
> > +                               -1, 0);
> > +        desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > +        svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > +        svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> > +                               PROT_READ | PROT_WRITE, MAP_SHARED |MAP_ANONYMOUS,
> > +                               -1, 0);
> > +    }
> 
> I think it will be beneficial to avoid "if (packed)" conditionals on
> the exposed functions that give information about the memory maps.
> These need to be replicated at
> hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings.
> 
> However, the current one depends on the driver area to live in the
> same page as the descriptor area, so it is not suitable for this.

I haven't really understood this.

In split vqs the descriptor, driver and device areas are mapped to RW pages.
In vhost_vdpa.c:vhost_vdpa_svq_map_rings, the regions are mapped with
the appropriate "perm" field that sets the R/W permissions in the DMAMap
object. Is this problematic for the split vq format because the avail ring is
anyway mapped to a RW page in "vhost_svq_start"?

For packed vqs, the "Driver Event Suppression" data structure should be
read-only for the device. Similar to split vqs, this is mapped to a RW page
in "vhost_svq_start" but it is then mapped to a DMAMap object with read-
only perms in "vhost_vdpa_svq_map_rings".

I am a little confused about where the issue lies.

Thanks,
Sahil
Eugenio Perez Martin Aug. 12, 2024, 6:31 a.m. UTC | #4
On Sun, Aug 11, 2024 at 7:20 PM Sahil <icegambit91@gmail.com> wrote:
>
> Hi,
>
> On Wednesday, August 7, 2024 9:52:10 PM GMT+5:30 Eugenio Perez Martin wrote:
> > On Fri, Aug 2, 2024 at 1:22 PM Sahil Siddiq <icegambit91@gmail.com> wrote:
> > > [...]
> > > @@ -726,17 +738,30 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > >      svq->vring.num = virtio_queue_get_num(vdev,
> > >      virtio_get_queue_index(vq));
> > >      svq->num_free = svq->vring.num;
> > >
> > > -    svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> > > -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > -                           -1, 0);
> > > -    desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > > -    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > > -    svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> > > -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > -                           -1, 0);
> > > -    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
> > > -    svq->desc_next = g_new0(uint16_t, svq->vring.num);
> > > -    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > > +    svq->is_packed = virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED);
> > > +
> > > +    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED)) {
> > > +        svq->vring_packed.vring.desc = mmap(NULL, vhost_svq_memory_packed(svq),
> > > +                                          PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > +                                          -1, 0);
> > > +        desc_size = sizeof(struct vring_packed_desc) * svq->vring.num;
> > > +        svq->vring_packed.vring.driver = (void *)((char *)svq->vring_packed.vring.desc + desc_size);
> > > +        svq->vring_packed.vring.device = (void *)((char *)svq->vring_packed.vring.driver +
> > > +                                     sizeof(struct vring_packed_desc_event));
> >
> > This is a great start but it will be problematic when you start
> > mapping the areas to the vdpa device. The driver area should be read
> > only for the device, but it is placed in the same page as a RW one.
> >
> > More on this later.
> >
> > > +    } else {
> > > +        svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> > > +                               PROT_READ | PROT_WRITE, MAP_SHARED |MAP_ANONYMOUS,
> > > +                               -1, 0);
> > > +        desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > > +        svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > > +        svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> > > +                               PROT_READ | PROT_WRITE, MAP_SHARED |MAP_ANONYMOUS,
> > > +                               -1, 0);
> > > +    }
> >
> > I think it will be beneficial to avoid "if (packed)" conditionals on
> > the exposed functions that give information about the memory maps.
> > These need to be replicated at
> > hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings.
> >
> > However, the current one depends on the driver area to live in the
> > same page as the descriptor area, so it is not suitable for this.
>
> I haven't really understood this.
>
> In split vqs the descriptor, driver and device areas are mapped to RW pages.
> In vhost_vdpa.c:vhost_vdpa_svq_map_rings, the regions are mapped with
> the appropriate "perm" field that sets the R/W permissions in the DMAMap
> object. Is this problematic for the split vq format because the avail ring is
> anyway mapped to a RW page in "vhost_svq_start"?
>

Ok so maybe the map word is misleading here. The pages needs to be
allocated for the QEMU process with both PROT_READ | PROT_WRITE, as
QEMU needs to write into it.

They are mapped to the device with vhost_vdpa_dma_map, and the last
bool parameter indicates if the device needs write permissions or not.
You can see how hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_ring checks
the needle permission for this, and the needle permissions are stored
at hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings. This is the
function that needs to check for the maps permissions.

> For packed vqs, the "Driver Event Suppression" data structure should be
> read-only for the device. Similar to split vqs, this is mapped to a RW page
> in "vhost_svq_start" but it is then mapped to a DMAMap object with read-
> only perms in "vhost_vdpa_svq_map_rings".
>
> I am a little confused about where the issue lies.
>
> Thanks,
> Sahil
>
>
Sahil Aug. 12, 2024, 7:32 p.m. UTC | #5
Hi,

On Monday, August 12, 2024 12:01:00 PM GMT+5:30 you wrote:
> On Sun, Aug 11, 2024 at 7:20 PM Sahil <icegambit91@gmail.com> wrote:
> > On Wednesday, August 7, 2024 9:52:10 PM GMT+5:30 Eugenio Perez Martin wrote:
> > > On Fri, Aug 2, 2024 at 1:22 PM Sahil Siddiq <icegambit91@gmail.com> wrote:
> > > > [...]
> > > > @@ -726,17 +738,30 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > > >      svq->vring.num = virtio_queue_get_num(vdev,
> > > >      virtio_get_queue_index(vq));
> > > >      svq->num_free = svq->vring.num;
> > > >
> > > > -    svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> > > > -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > > -                           -1, 0);
> > > > -    desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > > > -    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > > > -    svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> > > > -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > > -                           -1, 0);
> > > > -    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
> > > > -    svq->desc_next = g_new0(uint16_t, svq->vring.num);
> > > > -    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > > > +    svq->is_packed = virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED);
> > > > +
> > > > +    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED)) {
> > > > +        svq->vring_packed.vring.desc = mmap(NULL, vhost_svq_memory_packed(svq),
> > > > +                                          PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > > +                                          -1, 0);
> > > > +        desc_size = sizeof(struct vring_packed_desc) * svq->vring.num;
> > > > +        svq->vring_packed.vring.driver = (void *)((char *)svq->vring_packed.vring.desc + desc_size);
> > > > +        svq->vring_packed.vring.device = (void *)((char *)svq->vring_packed.vring.driver +
> > > > +                                     sizeof(struct vring_packed_desc_event));
> > >
> > > This is a great start but it will be problematic when you start
> > > mapping the areas to the vdpa device. The driver area should be read
> > > only for the device, but it is placed in the same page as a RW one.
> > >
> > > More on this later.
> > >
> > > > +    } else {
> > > > +        svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> > > > +                               PROT_READ | PROT_WRITE, MAP_SHARED |MAP_ANONYMOUS,
> > > > +                               -1, 0);
> > > > +        desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > > > +        svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > > > +        svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> > > > +                               PROT_READ | PROT_WRITE, MAP_SHARED |MAP_ANONYMOUS,
> > > > +                               -1, 0);
> > > > +    }
> > >
> > > I think it will be beneficial to avoid "if (packed)" conditionals on
> > > the exposed functions that give information about the memory maps.
> > > These need to be replicated at
> > > hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings.
> > >
> > > However, the current one depends on the driver area to live in the
> > > same page as the descriptor area, so it is not suitable for this.
> >
> > I haven't really understood this.
> >
> > In split vqs the descriptor, driver and device areas are mapped to RW pages.
> > In vhost_vdpa.c:vhost_vdpa_svq_map_rings, the regions are mapped with
> > the appropriate "perm" field that sets the R/W permissions in the DMAMap
> > object. Is this problematic for the split vq format because the avail ring is
> > anyway mapped to a RW page in "vhost_svq_start"?
> >
> 
> Ok so maybe the map word is misleading here. The pages needs to be
> allocated for the QEMU process with both PROT_READ | PROT_WRITE, as
> QEMU needs to write into it.
> 
> They are mapped to the device with vhost_vdpa_dma_map, and the last
> bool parameter indicates if the device needs write permissions or not.
> You can see how hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_ring checks
> the needle permission for this, and the needle permissions are stored
> at hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings. This is the
> function that needs to check for the maps permissions.
>

I think I have understood what's going on in "vhost_vdpa_svq_map_rings",
"vhost_vdpa_svq_map_ring" and "vhost_vdpa_dma_map". But based on
what I have understood it looks like the driver area is getting mapped to
an iova which is read-only for vhost_vdpa. Please let me know where I am
going wrong.

Consider the following implementation in hw/virtio/vhost_vdpa.c:

> size_t device_size = vhost_svq_device_area_size(svq);
> size_t driver_size = vhost_svq_driver_area_size(svq);

The driver size includes the descriptor area and the driver area. For
packed vq, the driver area is the "driver event suppression" structure
which should be read-only for the device according to the virtio spec
(section 2.8.10) [1].

> size_t avail_offset;
> bool ok;
> 
> vhost_svq_get_vring_addr(svq, &svq_addr);

Over here "svq_addr.desc_user_addr" will point to the descriptor area
while "svq_addr.avail_user_addr" will point to the driver area/driver
event suppression structure.

> driver_region = (DMAMap) {
>     .translated_addr = svq_addr.desc_user_addr,
>     .size = driver_size - 1,
>     .perm = IOMMU_RO,
> };

This region points to the descriptor area and its size encompasses the
driver area as well with RO permission.

> ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);

The above function checks the value of needle->perm and sees that it is RO.
It then calls "vhost_vdpa_dma_map" with the following arguments:

> r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle->iova,
>                                                needle->size + 1,
>                                                (void *)(uintptr_t)needle->translated_addr,
>                                                needle->perm == IOMMU_RO);

Since needle->size includes the driver area as well, the driver area will be
mapped to a RO page in the device's address space, right?

> if (unlikely(!ok)) {
>     error_prepend(errp, "Cannot create vq driver region: ");
>     return false;
> }
> addr->desc_user_addr = driver_region.iova;
> avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
> addr->avail_user_addr = driver_region.iova + avail_offset;

I think "addr->desc_user_addr" and "addr->avail_user_addr" will both be
mapped to a RO page in the device's address space.

> device_region = (DMAMap) {
>     .translated_addr = svq_addr.used_user_addr,
>     .size = device_size - 1,
>     .perm = IOMMU_RW,
> };

The device area/device event suppression structure on the other hand will
be mapped to a RW page.

I also think there are other issues with the current state of the patch. According
to the virtio spec (section 2.8.10) [1], the "device event suppression" structure
needs to be write-only for the device but is mapped to a RW page.

Another concern I have is regarding the driver area size for packed vq. In
"hw/virtio/vhost-shadow-virtqueue.c" of the current patch:
> size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
> {
>     size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
>     size_t avail_size = offsetof(vring_avail_t, ring[svq->vring.num]) +
>                                                               sizeof(uint16_t);
> 
>     return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
> }
> 
> [...]
> 
> size_t vhost_svq_memory_packed(const VhostShadowVirtqueue *svq)
> {
>     size_t desc_size = sizeof(struct vring_packed_desc) * svq->num_free;
>     size_t driver_event_suppression = sizeof(struct vring_packed_desc_event);
>     size_t device_event_suppression = sizeof(struct vring_packed_desc_event);
> 
>     return ROUND_UP(desc_size + driver_event_suppression + device_event_suppression,
>                     qemu_real_host_page_size());
> }

The size returned by "vhost_svq_driver_area_size" might not be the actual driver
size which is given by desc_size + driver_event_suppression, right? Will this have to
be changed too?

Thanks,
Sahil

[1] https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-720008
Eugenio Perez Martin Aug. 13, 2024, 6:53 a.m. UTC | #6
On Mon, Aug 12, 2024 at 9:32 PM Sahil <icegambit91@gmail.com> wrote:
>
> Hi,
>
> On Monday, August 12, 2024 12:01:00 PM GMT+5:30 you wrote:
> > On Sun, Aug 11, 2024 at 7:20 PM Sahil <icegambit91@gmail.com> wrote:
> > > On Wednesday, August 7, 2024 9:52:10 PM GMT+5:30 Eugenio Perez Martin wrote:
> > > > On Fri, Aug 2, 2024 at 1:22 PM Sahil Siddiq <icegambit91@gmail.com> wrote:
> > > > > [...]
> > > > > @@ -726,17 +738,30 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > > > >      svq->vring.num = virtio_queue_get_num(vdev,
> > > > >      virtio_get_queue_index(vq));
> > > > >      svq->num_free = svq->vring.num;
> > > > >
> > > > > -    svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> > > > > -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > > > -                           -1, 0);
> > > > > -    desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > > > > -    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > > > > -    svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> > > > > -                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > > > -                           -1, 0);
> > > > > -    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
> > > > > -    svq->desc_next = g_new0(uint16_t, svq->vring.num);
> > > > > -    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > > > > +    svq->is_packed = virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED);
> > > > > +
> > > > > +    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED)) {
> > > > > +        svq->vring_packed.vring.desc = mmap(NULL, vhost_svq_memory_packed(svq),
> > > > > +                                          PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
> > > > > +                                          -1, 0);
> > > > > +        desc_size = sizeof(struct vring_packed_desc) * svq->vring.num;
> > > > > +        svq->vring_packed.vring.driver = (void *)((char *)svq->vring_packed.vring.desc + desc_size);
> > > > > +        svq->vring_packed.vring.device = (void *)((char *)svq->vring_packed.vring.driver +
> > > > > +                                     sizeof(struct vring_packed_desc_event));
> > > >
> > > > This is a great start but it will be problematic when you start
> > > > mapping the areas to the vdpa device. The driver area should be read
> > > > only for the device, but it is placed in the same page as a RW one.
> > > >
> > > > More on this later.
> > > >
> > > > > +    } else {
> > > > > +        svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
> > > > > +                               PROT_READ | PROT_WRITE, MAP_SHARED |MAP_ANONYMOUS,
> > > > > +                               -1, 0);
> > > > > +        desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > > > > +        svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > > > > +        svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
> > > > > +                               PROT_READ | PROT_WRITE, MAP_SHARED |MAP_ANONYMOUS,
> > > > > +                               -1, 0);
> > > > > +    }
> > > >
> > > > I think it will be beneficial to avoid "if (packed)" conditionals on
> > > > the exposed functions that give information about the memory maps.
> > > > These need to be replicated at
> > > > hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings.
> > > >
> > > > However, the current one depends on the driver area to live in the
> > > > same page as the descriptor area, so it is not suitable for this.
> > >
> > > I haven't really understood this.
> > >
> > > In split vqs the descriptor, driver and device areas are mapped to RW pages.
> > > In vhost_vdpa.c:vhost_vdpa_svq_map_rings, the regions are mapped with
> > > the appropriate "perm" field that sets the R/W permissions in the DMAMap
> > > object. Is this problematic for the split vq format because the avail ring is
> > > anyway mapped to a RW page in "vhost_svq_start"?
> > >
> >
> > Ok so maybe the map word is misleading here. The pages needs to be
> > allocated for the QEMU process with both PROT_READ | PROT_WRITE, as
> > QEMU needs to write into it.
> >
> > They are mapped to the device with vhost_vdpa_dma_map, and the last
> > bool parameter indicates if the device needs write permissions or not.
> > You can see how hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_ring checks
> > the needle permission for this, and the needle permissions are stored
> > at hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_map_rings. This is the
> > function that needs to check for the maps permissions.
> >
>
> I think I have understood what's going on in "vhost_vdpa_svq_map_rings",
> "vhost_vdpa_svq_map_ring" and "vhost_vdpa_dma_map". But based on
> what I have understood it looks like the driver area is getting mapped to
> an iova which is read-only for vhost_vdpa. Please let me know where I am
> going wrong.
>

You're not going wrong there. The device does not need to write into
this area, so we map it read only.

> Consider the following implementation in hw/virtio/vhost_vdpa.c:
>
> > size_t device_size = vhost_svq_device_area_size(svq);
> > size_t driver_size = vhost_svq_driver_area_size(svq);
>
> The driver size includes the descriptor area and the driver area. For
> packed vq, the driver area is the "driver event suppression" structure
> which should be read-only for the device according to the virtio spec
> (section 2.8.10) [1].
>
> > size_t avail_offset;
> > bool ok;
> >
> > vhost_svq_get_vring_addr(svq, &svq_addr);
>
> Over here "svq_addr.desc_user_addr" will point to the descriptor area
> while "svq_addr.avail_user_addr" will point to the driver area/driver
> event suppression structure.
>
> > driver_region = (DMAMap) {
> >     .translated_addr = svq_addr.desc_user_addr,
> >     .size = driver_size - 1,
> >     .perm = IOMMU_RO,
> > };
>
> This region points to the descriptor area and its size encompasses the
> driver area as well with RO permission.
>
> > ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
>
> The above function checks the value of needle->perm and sees that it is RO.
> It then calls "vhost_vdpa_dma_map" with the following arguments:
>
> > r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle->iova,
> >                                                needle->size + 1,
> >                                                (void *)(uintptr_t)needle->translated_addr,
> >                                                needle->perm == IOMMU_RO);
>
> Since needle->size includes the driver area as well, the driver area will be
> mapped to a RO page in the device's address space, right?
>

Yes, the device does not need to write into the descriptor area in the
supported split virtqueue case. So the descriptor area is also mapped
RO at this moment.

This change in the packed virtqueue case, so we need to map it RW.

> > if (unlikely(!ok)) {
> >     error_prepend(errp, "Cannot create vq driver region: ");
> >     return false;
> > }
> > addr->desc_user_addr = driver_region.iova;
> > avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
> > addr->avail_user_addr = driver_region.iova + avail_offset;
>
> I think "addr->desc_user_addr" and "addr->avail_user_addr" will both be
> mapped to a RO page in the device's address space.
>
> > device_region = (DMAMap) {
> >     .translated_addr = svq_addr.used_user_addr,
> >     .size = device_size - 1,
> >     .perm = IOMMU_RW,
> > };
>
> The device area/device event suppression structure on the other hand will
> be mapped to a RW page.
>
> I also think there are other issues with the current state of the patch. According
> to the virtio spec (section 2.8.10) [1], the "device event suppression" structure
> needs to be write-only for the device but is mapped to a RW page.
>

Yes, I'm not sure if all IOMMU supports write-only maps to be honest.

> Another concern I have is regarding the driver area size for packed vq. In
> "hw/virtio/vhost-shadow-virtqueue.c" of the current patch:
> > size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
> > {
> >     size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
> >     size_t avail_size = offsetof(vring_avail_t, ring[svq->vring.num]) +
> >                                                               sizeof(uint16_t);
> >
> >     return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
> > }
> >
> > [...]
> >
> > size_t vhost_svq_memory_packed(const VhostShadowVirtqueue *svq)
> > {
> >     size_t desc_size = sizeof(struct vring_packed_desc) * svq->num_free;
> >     size_t driver_event_suppression = sizeof(struct vring_packed_desc_event);
> >     size_t device_event_suppression = sizeof(struct vring_packed_desc_event);
> >
> >     return ROUND_UP(desc_size + driver_event_suppression + device_event_suppression,
> >                     qemu_real_host_page_size());
> > }
>
> The size returned by "vhost_svq_driver_area_size" might not be the actual driver
> size which is given by desc_size + driver_event_suppression, right? Will this have to
> be changed too?
>

Yes, you're right this needs to be changed too.

> Thanks,
> Sahil
>
> [1] https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-720008
>
>
>
Sahil Aug. 21, 2024, 12:19 p.m. UTC | #7
Hi,

Sorry for the late reply.

On Tuesday, August 13, 2024 12:23:55 PM GMT+5:30 Eugenio Perez Martin wrote:
> [...]
> > I think I have understood what's going on in "vhost_vdpa_svq_map_rings",
> > "vhost_vdpa_svq_map_ring" and "vhost_vdpa_dma_map". But based on
> > what I have understood it looks like the driver area is getting mapped to
> > an iova which is read-only for vhost_vdpa. Please let me know where I am
> > going wrong.
> 
> You're not going wrong there. The device does not need to write into
> this area, so we map it read only.
> 
> > Consider the following implementation in hw/virtio/vhost_vdpa.c:
> > > size_t device_size = vhost_svq_device_area_size(svq);
> > > size_t driver_size = vhost_svq_driver_area_size(svq);
> > 
> > The driver size includes the descriptor area and the driver area. For
> > packed vq, the driver area is the "driver event suppression" structure
> > which should be read-only for the device according to the virtio spec
> > (section 2.8.10) [1].
> > 
> > > size_t avail_offset;
> > > bool ok;
> > > 
> > > vhost_svq_get_vring_addr(svq, &svq_addr);
> > 
> > Over here "svq_addr.desc_user_addr" will point to the descriptor area
> > while "svq_addr.avail_user_addr" will point to the driver area/driver
> > event suppression structure.
> > 
> > > driver_region = (DMAMap) {
> > > 
> > >     .translated_addr = svq_addr.desc_user_addr,
> > >     .size = driver_size - 1,
> > >     .perm = IOMMU_RO,
> > > 
> > > };
> > 
> > This region points to the descriptor area and its size encompasses the
> > driver area as well with RO permission.
> > 
> > > ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
> > 
> > The above function checks the value of needle->perm and sees that it is
> > RO.
> > 
> > It then calls "vhost_vdpa_dma_map" with the following arguments:
> > > r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle->iova,
> > > 
> > >                                                needle->size + 1,
> > >                                                (void
> > >                                                *)(uintptr_t)needle->tra
> > >                                                nslated_addr,
> > >                                                needle->perm ==
> > >                                                IOMMU_RO);
> > 
> > Since needle->size includes the driver area as well, the driver area will
> > be mapped to a RO page in the device's address space, right?
> 
> Yes, the device does not need to write into the descriptor area in the
> supported split virtqueue case. So the descriptor area is also mapped
> RO at this moment.
> 
> This change in the packed virtqueue case, so we need to map it RW.

I understand this now. I'll see how the implementation can be modified to take
this into account. I'll see if making the driver area and descriptor ring helps.

> > > if (unlikely(!ok)) {
> > > 
> > >     error_prepend(errp, "Cannot create vq driver region: ");
> > >     return false;
> > > 
> > > }
> > > addr->desc_user_addr = driver_region.iova;
> > > avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
> > > addr->avail_user_addr = driver_region.iova + avail_offset;
> > 
> > I think "addr->desc_user_addr" and "addr->avail_user_addr" will both be
> > mapped to a RO page in the device's address space.
> > 
> > > device_region = (DMAMap) {
> > > 
> > >     .translated_addr = svq_addr.used_user_addr,
> > >     .size = device_size - 1,
> > >     .perm = IOMMU_RW,
> > > 
> > > };
> > 
> > The device area/device event suppression structure on the other hand will
> > be mapped to a RW page.
> > 
> > I also think there are other issues with the current state of the patch.
> > According to the virtio spec (section 2.8.10) [1], the "device event
> > suppression" structure needs to be write-only for the device but is
> > mapped to a RW page.
> 
> Yes, I'm not sure if all IOMMU supports write-only maps to be honest.

Got it. I think it should be alright to defer this issue until later.

> > Another concern I have is regarding the driver area size for packed vq. In
> > 
> > "hw/virtio/vhost-shadow-virtqueue.c" of the current patch:
> > > size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
> > > {
> > > 
> > >     size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > >     size_t avail_size = offsetof(vring_avail_t, ring[svq->vring.num]) +
> > >     
> > >                                                               sizeof(uin
> > >                                                               t16_t);
> > >     
> > >     return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
> > > 
> > > }
> > > 
> > > [...]
> > > 
> > > size_t vhost_svq_memory_packed(const VhostShadowVirtqueue *svq)
> > > {
> > > 
> > >     size_t desc_size = sizeof(struct vring_packed_desc) * svq->num_free;
> > >     size_t driver_event_suppression = sizeof(struct
> > >     vring_packed_desc_event);
> > >     size_t device_event_suppression = sizeof(struct
> > >     vring_packed_desc_event);
> > >     
> > >     return ROUND_UP(desc_size + driver_event_suppression +
> > >     device_event_suppression,> >     
> > >                     qemu_real_host_page_size());
> > > 
> > > }
> > 
> > The size returned by "vhost_svq_driver_area_size" might not be the actual
> > driver size which is given by desc_size + driver_event_suppression,
> > right? Will this have to be changed too?
> 
> Yes, you're right this needs to be changed too.

Understood. I'll modify this too.

I have been trying to test my changes so far as well. I am not very clear on
a few things.

Q1.
I built QEMU from source with my changes and followed the vdpa_sim +
vhost_vdpa tutorial [1]. The VM seems to be running fine. How do I check
if the packed format is being used instead of the split vq format for shadow
virtqueues? I know the packed format is used when virtio_vdev has got the
VIRTIO_F_RING_PACKED bit enabled. Is there a way of checking that this is
the case?

Q2.
What's the recommended way to see what's going on under the hood? I tried
using the -D option so QEMU's logs are written to a file but the file was empty.
Would using qemu with -monitor stdio or attaching gdb to the QEMU VM be
worthwhile?

Thanks,
Sahil

[1] https://www.redhat.com/en/blog/hands-vdpa-what-do-you-do-when-you-aint-got-hardware-part-1
Eugenio Perez Martin Aug. 27, 2024, 3:30 p.m. UTC | #8
On Wed, Aug 21, 2024 at 2:20 PM Sahil <icegambit91@gmail.com> wrote:
>
> Hi,
>
> Sorry for the late reply.
>
> On Tuesday, August 13, 2024 12:23:55 PM GMT+5:30 Eugenio Perez Martin wrote:
> > [...]
> > > I think I have understood what's going on in "vhost_vdpa_svq_map_rings",
> > > "vhost_vdpa_svq_map_ring" and "vhost_vdpa_dma_map". But based on
> > > what I have understood it looks like the driver area is getting mapped to
> > > an iova which is read-only for vhost_vdpa. Please let me know where I am
> > > going wrong.
> >
> > You're not going wrong there. The device does not need to write into
> > this area, so we map it read only.
> >
> > > Consider the following implementation in hw/virtio/vhost_vdpa.c:
> > > > size_t device_size = vhost_svq_device_area_size(svq);
> > > > size_t driver_size = vhost_svq_driver_area_size(svq);
> > >
> > > The driver size includes the descriptor area and the driver area. For
> > > packed vq, the driver area is the "driver event suppression" structure
> > > which should be read-only for the device according to the virtio spec
> > > (section 2.8.10) [1].
> > >
> > > > size_t avail_offset;
> > > > bool ok;
> > > >
> > > > vhost_svq_get_vring_addr(svq, &svq_addr);
> > >
> > > Over here "svq_addr.desc_user_addr" will point to the descriptor area
> > > while "svq_addr.avail_user_addr" will point to the driver area/driver
> > > event suppression structure.
> > >
> > > > driver_region = (DMAMap) {
> > > >
> > > >     .translated_addr = svq_addr.desc_user_addr,
> > > >     .size = driver_size - 1,
> > > >     .perm = IOMMU_RO,
> > > >
> > > > };
> > >
> > > This region points to the descriptor area and its size encompasses the
> > > driver area as well with RO permission.
> > >
> > > > ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
> > >
> > > The above function checks the value of needle->perm and sees that it is
> > > RO.
> > >
> > > It then calls "vhost_vdpa_dma_map" with the following arguments:
> > > > r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle->iova,
> > > >
> > > >                                                needle->size + 1,
> > > >                                                (void
> > > >                                                *)(uintptr_t)needle->tra
> > > >                                                nslated_addr,
> > > >                                                needle->perm ==
> > > >                                                IOMMU_RO);
> > >
> > > Since needle->size includes the driver area as well, the driver area will
> > > be mapped to a RO page in the device's address space, right?
> >
> > Yes, the device does not need to write into the descriptor area in the
> > supported split virtqueue case. So the descriptor area is also mapped
> > RO at this moment.
> >
> > This change in the packed virtqueue case, so we need to map it RW.
>
> I understand this now. I'll see how the implementation can be modified to take
> this into account. I'll see if making the driver area and descriptor ring helps.
>
> > > > if (unlikely(!ok)) {
> > > >
> > > >     error_prepend(errp, "Cannot create vq driver region: ");
> > > >     return false;
> > > >
> > > > }
> > > > addr->desc_user_addr = driver_region.iova;
> > > > avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
> > > > addr->avail_user_addr = driver_region.iova + avail_offset;
> > >
> > > I think "addr->desc_user_addr" and "addr->avail_user_addr" will both be
> > > mapped to a RO page in the device's address space.
> > >
> > > > device_region = (DMAMap) {
> > > >
> > > >     .translated_addr = svq_addr.used_user_addr,
> > > >     .size = device_size - 1,
> > > >     .perm = IOMMU_RW,
> > > >
> > > > };
> > >
> > > The device area/device event suppression structure on the other hand will
> > > be mapped to a RW page.
> > >
> > > I also think there are other issues with the current state of the patch.
> > > According to the virtio spec (section 2.8.10) [1], the "device event
> > > suppression" structure needs to be write-only for the device but is
> > > mapped to a RW page.
> >
> > Yes, I'm not sure if all IOMMU supports write-only maps to be honest.
>
> Got it. I think it should be alright to defer this issue until later.
>
> > > Another concern I have is regarding the driver area size for packed vq. In
> > >
> > > "hw/virtio/vhost-shadow-virtqueue.c" of the current patch:
> > > > size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
> > > > {
> > > >
> > > >     size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
> > > >     size_t avail_size = offsetof(vring_avail_t, ring[svq->vring.num]) +
> > > >
> > > >                                                               sizeof(uin
> > > >                                                               t16_t);
> > > >
> > > >     return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
> > > >
> > > > }
> > > >
> > > > [...]
> > > >
> > > > size_t vhost_svq_memory_packed(const VhostShadowVirtqueue *svq)
> > > > {
> > > >
> > > >     size_t desc_size = sizeof(struct vring_packed_desc) * svq->num_free;
> > > >     size_t driver_event_suppression = sizeof(struct
> > > >     vring_packed_desc_event);
> > > >     size_t device_event_suppression = sizeof(struct
> > > >     vring_packed_desc_event);
> > > >
> > > >     return ROUND_UP(desc_size + driver_event_suppression +
> > > >     device_event_suppression,> >
> > > >                     qemu_real_host_page_size());
> > > >
> > > > }
> > >
> > > The size returned by "vhost_svq_driver_area_size" might not be the actual
> > > driver size which is given by desc_size + driver_event_suppression,
> > > right? Will this have to be changed too?
> >
> > Yes, you're right this needs to be changed too.
>
> Understood. I'll modify this too.
>
> I have been trying to test my changes so far as well. I am not very clear on
> a few things.
>
> Q1.
> I built QEMU from source with my changes and followed the vdpa_sim +
> vhost_vdpa tutorial [1]. The VM seems to be running fine. How do I check
> if the packed format is being used instead of the split vq format for shadow
> virtqueues? I know the packed format is used when virtio_vdev has got the
> VIRTIO_F_RING_PACKED bit enabled. Is there a way of checking that this is
> the case?
>

You can see the features that the driver acked from the guest by
checking sysfs. Once you know the PCI BFN from lspci:
# lspci -nn|grep '\[1af4:1041\]'
01:00.0 Ethernet controller [0200]: Red Hat, Inc. Virtio 1.0 network
device [1af4:1041] (rev 01)
# cut -c 35 /sys/devices/pci0000:00/0000:00:02.0/0000:01:00.0/virtio0/features
0

Also, you can check from QEMU by simply tracing if your functions are
being called.

> Q2.
> What's the recommended way to see what's going on under the hood? I tried
> using the -D option so QEMU's logs are written to a file but the file was empty.
> Would using qemu with -monitor stdio or attaching gdb to the QEMU VM be
> worthwhile?
>

You need to add --trace options with the regex you want to get to
enable any output. For example, --trace 'vhost_vdpa_*' print all the
trace_vhost_vdpa_* functions.

If you want to speed things up, you can just replace the interesting
trace_... functions with fprintf(stderr, ...). We can add the trace
ones afterwards.
Sahil Aug. 30, 2024, 10:20 a.m. UTC | #9
Hi,

On Tuesday, August 27, 2024 9:00:36 PM GMT+5:30 Eugenio Perez Martin wrote:
> On Wed, Aug 21, 2024 at 2:20 PM Sahil <icegambit91@gmail.com> wrote:
> > [...]
> > I have been trying to test my changes so far as well. I am not very clear
> > on a few things.
> > 
> > Q1.
> > I built QEMU from source with my changes and followed the vdpa_sim +
> > vhost_vdpa tutorial [1]. The VM seems to be running fine. How do I check
> > if the packed format is being used instead of the split vq format for
> > shadow virtqueues? I know the packed format is used when virtio_vdev has
> > got the VIRTIO_F_RING_PACKED bit enabled. Is there a way of checking that
> > this is the case?
> 
> You can see the features that the driver acked from the guest by
> checking sysfs. Once you know the PCI BFN from lspci:
> # lspci -nn|grep '\[1af4:1041\]'
> 01:00.0 Ethernet controller [0200]: Red Hat, Inc. Virtio 1.0 network
> device [1af4:1041] (rev 01)
> # cut -c 35
> /sys/devices/pci0000:00/0000:00:02.0/0000:01:00.0/virtio0/features 0
> 
> Also, you can check from QEMU by simply tracing if your functions are
> being called.
> 
> > Q2.
> > What's the recommended way to see what's going on under the hood? I tried
> > using the -D option so QEMU's logs are written to a file but the file was
> > empty. Would using qemu with -monitor stdio or attaching gdb to the QEMU
> > VM be worthwhile?
> 
> You need to add --trace options with the regex you want to get to
> enable any output. For example, --trace 'vhost_vdpa_*' print all the
> trace_vhost_vdpa_* functions.
> 
> If you want to speed things up, you can just replace the interesting
> trace_... functions with fprintf(stderr, ...). We can add the trace
> ones afterwards.

Understood. I am able to trace the functions that are being called with
fprintf. I'll stick with fprintf for now.

I realized that packed vqs are not being used in the test environment. I
see that in "hw/virtio/vhost-shadow-virtqueue.c", svq->is_packed is set
to 0 and that calls vhost_svq_add_split(). I am not sure how one enables
the packed feature bit. I don't know if this is an environment issue.

I built qemu from the latest source with my changes on top of it. I followed
this article [1] to set up the environment.

On the host machine:

$ uname -a
Linux fedora 6.10.5-100.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Aug 14 15:49:25 UTC 2024 x86_64 GNU/Linux

$ ./qemu/build/qemu-system-x86_64 --version
QEMU emulator version 9.0.91

$ vdpa -V
vdpa utility, iproute2-6.4.0

All the relevant vdpa modules have been loaded in accordance with [1].

$ lsmod | grep -iE "(vdpa|virtio)"
vdpa_sim_net	12288  0
vdpa_sim		24576  1 vdpa_sim_net
vringh		32768  2 vdpa_sim,vdpa_sim_net
vhost_vdpa		32768  2
vhost		65536  1 vhost_vdpa
vhost_iotlb		16384  4 vdpa_sim,vringh,vhost_vdpa,vhost
vdpa		36864  3 vdpa_sim,vhost_vdpa,vdpa_sim_net

$ ls -l /sys/bus/vdpa/devices/vdpa0/driver
lrwxrwxrwx. 1 root root 0 Aug 30 11:25 /sys/bus/vdpa/devices/vdpa0/driver -> ../../bus/vdpa/drivers/vhost_vdpa

In the output of the following command, I see ANY_LAYOUT is supported.
According to virtio_config.h [2] in the linux kernel, this represents the
layout of descriptors. This refers to split and packed vqs, right?

$ vdpa mgmtdev show
vdpasim_net: 
  supported_classes net
  max_supported_vqs 3
  dev_features MTU MAC STATUS CTRL_VQ CTRL_MAC_ADDR ANY_LAYOUT VERSION_1 ACCESS_PLATFORM

$ vdpa dev show -jp
{
    "dev": {
        "vdpa0": {
            "type": "network",
            "mgmtdev": "vdpasim_net",
            "vendor_id": 0,
            "max_vqs": 3,
            "max_vq_size": 256
        }
    }
}

I started the VM by running:

$ sudo ./qemu/build/qemu-system-x86_64 \
-enable-kvm \
-drive file=//home/ig91/fedora_qemu_test_vm/L1.qcow2,media=disk,if=virtio \
-net nic,model=virtio \
-net user,hostfwd=tcp::2226-:22 \
-netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0 \
-device virtio-net-pci,netdev=vhost-vdpa0,bus=pci.0,addr=0x7,disable-legacy=on,disable-modern=off,page-per-vq=on,event_idx=off,packed=on \
-nographic \
-m 2G \
-smp 2 \
-cpu host \
2>&1 | tee vm.log

I added the packed=on option to -device virtio-net-pci.

In the VM:

# uname -a
Linux fedora 6.8.5-201.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Apr 11 18:25:26 UTC 2024 x86_64 GNU/Linux

# lspci -nn | grep -i -A15 "\[1af4:1041\]"
00:07.0 Ethernet controller [0200]: Red Hat, Inc. Virtio 1.0 network device [1af4:1041] (rev 01)

# cut -c 35 /sys/devices/pci0000:00/0000:00:07.0/virtio1/features
0

The packed vq feature bit hasn't been set. Am I missing something here?

Thanks,
Sahil

[1] https://www.redhat.com/en/blog/hands-vdpa-what-do-you-do-when-you-aint-got-hardware-part-1
[2] https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_config.h#L63
Eugenio Perez Martin Aug. 30, 2024, 10:48 a.m. UTC | #10
On Fri, Aug 30, 2024 at 12:20 PM Sahil <icegambit91@gmail.com> wrote:
>
> Hi,
>
> On Tuesday, August 27, 2024 9:00:36 PM GMT+5:30 Eugenio Perez Martin wrote:
> > On Wed, Aug 21, 2024 at 2:20 PM Sahil <icegambit91@gmail.com> wrote:
> > > [...]
> > > I have been trying to test my changes so far as well. I am not very clear
> > > on a few things.
> > >
> > > Q1.
> > > I built QEMU from source with my changes and followed the vdpa_sim +
> > > vhost_vdpa tutorial [1]. The VM seems to be running fine. How do I check
> > > if the packed format is being used instead of the split vq format for
> > > shadow virtqueues? I know the packed format is used when virtio_vdev has
> > > got the VIRTIO_F_RING_PACKED bit enabled. Is there a way of checking that
> > > this is the case?
> >
> > You can see the features that the driver acked from the guest by
> > checking sysfs. Once you know the PCI BFN from lspci:
> > # lspci -nn|grep '\[1af4:1041\]'
> > 01:00.0 Ethernet controller [0200]: Red Hat, Inc. Virtio 1.0 network
> > device [1af4:1041] (rev 01)
> > # cut -c 35
> > /sys/devices/pci0000:00/0000:00:02.0/0000:01:00.0/virtio0/features 0
> >
> > Also, you can check from QEMU by simply tracing if your functions are
> > being called.
> >
> > > Q2.
> > > What's the recommended way to see what's going on under the hood? I tried
> > > using the -D option so QEMU's logs are written to a file but the file was
> > > empty. Would using qemu with -monitor stdio or attaching gdb to the QEMU
> > > VM be worthwhile?
> >
> > You need to add --trace options with the regex you want to get to
> > enable any output. For example, --trace 'vhost_vdpa_*' print all the
> > trace_vhost_vdpa_* functions.
> >
> > If you want to speed things up, you can just replace the interesting
> > trace_... functions with fprintf(stderr, ...). We can add the trace
> > ones afterwards.
>
> Understood. I am able to trace the functions that are being called with
> fprintf. I'll stick with fprintf for now.
>
> I realized that packed vqs are not being used in the test environment. I
> see that in "hw/virtio/vhost-shadow-virtqueue.c", svq->is_packed is set
> to 0 and that calls vhost_svq_add_split(). I am not sure how one enables
> the packed feature bit. I don't know if this is an environment issue.
>
> I built qemu from the latest source with my changes on top of it. I followed
> this article [1] to set up the environment.
>
> On the host machine:
>
> $ uname -a
> Linux fedora 6.10.5-100.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Aug 14 15:49:25 UTC 2024 x86_64 GNU/Linux
>
> $ ./qemu/build/qemu-system-x86_64 --version
> QEMU emulator version 9.0.91
>
> $ vdpa -V
> vdpa utility, iproute2-6.4.0
>
> All the relevant vdpa modules have been loaded in accordance with [1].
>
> $ lsmod | grep -iE "(vdpa|virtio)"
> vdpa_sim_net    12288  0
> vdpa_sim                24576  1 vdpa_sim_net
> vringh          32768  2 vdpa_sim,vdpa_sim_net
> vhost_vdpa              32768  2
> vhost           65536  1 vhost_vdpa
> vhost_iotlb             16384  4 vdpa_sim,vringh,vhost_vdpa,vhost
> vdpa            36864  3 vdpa_sim,vhost_vdpa,vdpa_sim_net
>
> $ ls -l /sys/bus/vdpa/devices/vdpa0/driver
> lrwxrwxrwx. 1 root root 0 Aug 30 11:25 /sys/bus/vdpa/devices/vdpa0/driver -> ../../bus/vdpa/drivers/vhost_vdpa
>
> In the output of the following command, I see ANY_LAYOUT is supported.
> According to virtio_config.h [2] in the linux kernel, this represents the
> layout of descriptors. This refers to split and packed vqs, right?
>
> $ vdpa mgmtdev show
> vdpasim_net:
>   supported_classes net
>   max_supported_vqs 3
>   dev_features MTU MAC STATUS CTRL_VQ CTRL_MAC_ADDR ANY_LAYOUT VERSION_1 ACCESS_PLATFORM
>
> $ vdpa dev show -jp
> {
>     "dev": {
>         "vdpa0": {
>             "type": "network",
>             "mgmtdev": "vdpasim_net",
>             "vendor_id": 0,
>             "max_vqs": 3,
>             "max_vq_size": 256
>         }
>     }
> }
>
> I started the VM by running:
>
> $ sudo ./qemu/build/qemu-system-x86_64 \
> -enable-kvm \
> -drive file=//home/ig91/fedora_qemu_test_vm/L1.qcow2,media=disk,if=virtio \
> -net nic,model=virtio \
> -net user,hostfwd=tcp::2226-:22 \
> -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0 \
> -device virtio-net-pci,netdev=vhost-vdpa0,bus=pci.0,addr=0x7,disable-legacy=on,disable-modern=off,page-per-vq=on,event_idx=off,packed=on \
> -nographic \
> -m 2G \
> -smp 2 \
> -cpu host \
> 2>&1 | tee vm.log
>
> I added the packed=on option to -device virtio-net-pci.
>
> In the VM:
>
> # uname -a
> Linux fedora 6.8.5-201.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Apr 11 18:25:26 UTC 2024 x86_64 GNU/Linux
>
> # lspci -nn | grep -i -A15 "\[1af4:1041\]"
> 00:07.0 Ethernet controller [0200]: Red Hat, Inc. Virtio 1.0 network device [1af4:1041] (rev 01)
>
> # cut -c 35 /sys/devices/pci0000:00/0000:00:07.0/virtio1/features
> 0
>
> The packed vq feature bit hasn't been set. Am I missing something here?
>

vdpa_sim does not support packed vq at the moment. You need to build
the use case #3 of the second part of that blog [1]. It's good that
you build the vdpa_sim earlier as it is a simpler setup.

If you have problems with the vp_vdpa environment please let me know
so we can find alternative setups.

Thanks!

[1] https://www.redhat.com/en/blog/hands-vdpa-what-do-you-do-when-you-aint-got-hardware-part-2

> Thanks,
> Sahil
>
> [1] https://www.redhat.com/en/blog/hands-vdpa-what-do-you-do-when-you-aint-got-hardware-part-1
> [2] https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_config.h#L63
>
>
>
Sahil Sept. 8, 2024, 7:46 p.m. UTC | #11
Hi,

On Friday, August 30, 2024 4:18:31 PM GMT+5:30 Eugenio Perez Martin wrote:
> On Fri, Aug 30, 2024 at 12:20 PM Sahil <icegambit91@gmail.com> wrote:
> > Hi,
> > 
> > On Tuesday, August 27, 2024 9:00:36 PM GMT+5:30 Eugenio Perez Martin wrote:
> > > On Wed, Aug 21, 2024 at 2:20 PM Sahil <icegambit91@gmail.com> wrote:
> > > > [...]
> > > > I have been trying to test my changes so far as well. I am not very
> > > > clear
> > > > on a few things.
> > > > 
> > > > Q1.
> > > > I built QEMU from source with my changes and followed the vdpa_sim +
> > > > vhost_vdpa tutorial [1]. The VM seems to be running fine. How do I
> > > > check
> > > > if the packed format is being used instead of the split vq format for
> > > > shadow virtqueues? I know the packed format is used when virtio_vdev
> > > > has
> > > > got the VIRTIO_F_RING_PACKED bit enabled. Is there a way of checking
> > > > that
> > > > this is the case?
> > > 
> > > You can see the features that the driver acked from the guest by
> > > checking sysfs. Once you know the PCI BFN from lspci:
> > > # lspci -nn|grep '\[1af4:1041\]'
> > > 01:00.0 Ethernet controller [0200]: Red Hat, Inc. Virtio 1.0 network
> > > device [1af4:1041] (rev 01)
> > > # cut -c 35
> > > /sys/devices/pci0000:00/0000:00:02.0/0000:01:00.0/virtio0/features 0
> > > 
> > > Also, you can check from QEMU by simply tracing if your functions are
> > > being called.
> > > 
> > > > Q2.
> > > > What's the recommended way to see what's going on under the hood? I
> > > > tried
> > > > using the -D option so QEMU's logs are written to a file but the file
> > > > was
> > > > empty. Would using qemu with -monitor stdio or attaching gdb to the
> > > > QEMU
> > > > VM be worthwhile?
> > > 
> > > You need to add --trace options with the regex you want to get to
> > > enable any output. For example, --trace 'vhost_vdpa_*' print all the
> > > trace_vhost_vdpa_* functions.
> > > 
> > > If you want to speed things up, you can just replace the interesting
> > > trace_... functions with fprintf(stderr, ...). We can add the trace
> > > ones afterwards.
> > 
> > Understood. I am able to trace the functions that are being called with
> > fprintf. I'll stick with fprintf for now.
> > 
> > I realized that packed vqs are not being used in the test environment. I
> > see that in "hw/virtio/vhost-shadow-virtqueue.c", svq->is_packed is set
> > to 0 and that calls vhost_svq_add_split(). I am not sure how one enables
> > the packed feature bit. I don't know if this is an environment issue.
> > 
> > I built qemu from the latest source with my changes on top of it. I
> > followed this article [1] to set up the environment.
> > 
> > On the host machine:
> > 
> > $ uname -a
> > Linux fedora 6.10.5-100.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Aug 14
> > 15:49:25 UTC 2024 x86_64 GNU/Linux
> > 
> > $ ./qemu/build/qemu-system-x86_64 --version
> > QEMU emulator version 9.0.91
> > 
> > $ vdpa -V
> > vdpa utility, iproute2-6.4.0
> > 
> > All the relevant vdpa modules have been loaded in accordance with [1].
> > 
> > $ lsmod | grep -iE "(vdpa|virtio)"
> > vdpa_sim_net    12288  0
> > vdpa_sim                24576  1 vdpa_sim_net
> > vringh          32768  2 vdpa_sim,vdpa_sim_net
> > vhost_vdpa              32768  2
> > vhost           65536  1 vhost_vdpa
> > vhost_iotlb             16384  4 vdpa_sim,vringh,vhost_vdpa,vhost
> > vdpa            36864  3 vdpa_sim,vhost_vdpa,vdpa_sim_net
> > 
> > $ ls -l /sys/bus/vdpa/devices/vdpa0/driver
> > lrwxrwxrwx. 1 root root 0 Aug 30 11:25 /sys/bus/vdpa/devices/vdpa0/driver
> > -> ../../bus/vdpa/drivers/vhost_vdpa
> > 
> > In the output of the following command, I see ANY_LAYOUT is supported.
> > According to virtio_config.h [2] in the linux kernel, this represents the
> > layout of descriptors. This refers to split and packed vqs, right?
> > 
> > $ vdpa mgmtdev show
> > 
> > vdpasim_net:
> >   supported_classes net
> >   max_supported_vqs 3
> >   dev_features MTU MAC STATUS CTRL_VQ CTRL_MAC_ADDR ANY_LAYOUT VERSION_1
> >   ACCESS_PLATFORM> 
> > $ vdpa dev show -jp
> > {
> > 
> >     "dev": {
> >     
> >         "vdpa0": {
> >         
> >             "type": "network",
> >             "mgmtdev": "vdpasim_net",
> >             "vendor_id": 0,
> >             "max_vqs": 3,
> >             "max_vq_size": 256
> >         
> >         }
> >     
> >     }
> > 
> > }
> > 
> > I started the VM by running:
> > 
> > $ sudo ./qemu/build/qemu-system-x86_64 \
> > -enable-kvm \
> > -drive file=//home/ig91/fedora_qemu_test_vm/L1.qcow2,media=disk,if=virtio
> > \
> > -net nic,model=virtio \
> > -net user,hostfwd=tcp::2226-:22 \
> > -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0 \
> > -device
> > virtio-net-pci,netdev=vhost-vdpa0,bus=pci.0,addr=0x7,disable-legacy=on,di
> > sable-modern=off,page-per-vq=on,event_idx=off,packed=on \ -nographic \
> > -m 2G \
> > -smp 2 \
> > -cpu host \
> > 2>&1 | tee vm.log
> > 
> > I added the packed=on option to -device virtio-net-pci.
> > 
> > In the VM:
> > 
> > # uname -a
> > Linux fedora 6.8.5-201.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Apr 11
> > 18:25:26 UTC 2024 x86_64 GNU/Linux
> > 
> > # lspci -nn | grep -i -A15 "\[1af4:1041\]"
> > 00:07.0 Ethernet controller [0200]: Red Hat, Inc. Virtio 1.0 network
> > device [1af4:1041] (rev 01)
> > 
> > # cut -c 35 /sys/devices/pci0000:00/0000:00:07.0/virtio1/features
> > 0
> > 
> > The packed vq feature bit hasn't been set. Am I missing something here?
> 
> vdpa_sim does not support packed vq at the moment. You need to build
> the use case #3 of the second part of that blog [1]. It's good that
> you build the vdpa_sim earlier as it is a simpler setup.
> 
> If you have problems with the vp_vdpa environment please let me know
> so we can find alternative setups.

Thank you for the clarification. I tried setting up the vp_vdpa
environment (scenario 3) but I ended up running into a problem
in the L1 VM.

I verified that nesting is enabled in KVM (L0):

$ grep -oE "(vmx|svm)" /proc/cpuinfo | sort | uniq
vmx

$ cat /sys/module/kvm_intel/parameters/nested
Y

There are no issues when booting L1. I start the VM by running:

$ sudo ./qemu/build/qemu-system-x86_64 \
-enable-kvm \
-drive file=//home/ig91/fedora_qemu_test_vm/L1.qcow2,media=disk,if=virtio \
-net nic,model=virtio \
-net user,hostfwd=tcp::2222-:22 \
-device intel-iommu,snoop-control=on \
-device virtio-net-pci,netdev=net0,disable-legacy=on,disable-modern=off,iommu_platform=on,event_idx=off,packed=on,bus=pcie.0,addr=0x4 \
-netdev tap,id=net0,script=no,downscript=no \
-nographic \
-m 2G \
-smp 2 \
-M q35 \
-cpu host \
2>&1 | tee vm.log

Kernel version in L1:

# uname -a
Linux fedora 6.8.5-201.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Apr 11 18:25:26 UTC 2024 x86_64 GNU/Linux

The following variables are set in the kernel's config as
described in the blog [1]:

CONFIG_VIRTIO_VDPA=m
CONFIG_VDPA=m
CONFIG_VP_VDPA=m
CONFIG_VHOST_VDPA=m

The vDPA tool also satisfies the version criterion.

# vdpa -V
vdpa utility, iproute2-6.10.0

I built QEMU from source with my changes on top of it.

# ./qemu/build/qemu-system-x86_64 --version
QEMU emulator version 9.0.91

The relevant vdpa modules are loaded successfully as
explained in the blog.

# lsmod | grep -i vdpa
vp_vdpa      	20480  0
vhost_vdpa	32768  0
vhost            	65536  1 vhost_vdpa
vhost_iotlb    16384  2 vhost_vdpa,vhost
vdpa               36864  2 vp_vdpa,vhost_vdpa
irqbypass   	12288  2 vhost_vdpa,kvm

# lspci | grep -i virtio
00:03.0 SCSI storage controller: Red Hat, Inc. Virtio block device
00:04.0 Ethernet controller: Red Hat, Inc. Virtio 1.0 network device (rev 01)

# lspci -n |grep 00:04.0
00:04.0 0200: 1af4:1041 (rev 01)

I then unbind the virtio-pci device from the virtio-pci
driver and bind it to the vp_vdpa driver.

# echo 0000:00:04.0 > /sys/bus/pci/drivers/virtio-pci/unbind
# echo 1af4 1041 > /sys/bus/pci/drivers/vp-vdpa/new_id

I then create the vDPA device without any issues.

# vdpa mgmtdev show
pci/0000:00:04.0: 
  supported_classes net
  max_supported_vqs 3
  dev_features CSUM GUEST_CSUM CTRL_GUEST_OFFLOADS MAC GUEST_TSO4 GUEST_TSO6 GUEST_ECN GUEST_UFO HOST_TSO4 HOST_TSO6 HOST_ECN HOST_UFO MRG_RXBUF STATUS CTRL_VQ CTRL_RX CTRL_VLAN CTRL_RX_EXTRA GUEST_ANNOUNCE CTRL_MAC_ADDR RING_INDIRECT_DE6

# vdpa dev add name vdpa0 mgmtdev pci/0000:00:04.0
# vdpa dev show -jp
{
    "dev": {
        "vdpa0": {
            "type": "network",
            "mgmtdev": "pci/0000:00:04.0",
            "vendor_id": 6900,
            "max_vqs": 3,
            "max_vq_size": 256
        }
    }
}

# ls -l /sys/bus/vdpa/devices/vdpa0/driver
lrwxrwxrwx. 1 root root 0 Sep  8 18:58 /sys/bus/vdpa/devices/vdpa0/driver -> ../../../../bus/vdpa/drivers/vhost_vdpa

# ls -l /dev/ |grep vdpa
crw-------. 1 root root    239,   0 Sep  8 18:58 vhost-vdpa-0

# driverctl -b vdpa list-devices
vdpa0 vhost_vdpa

I have a qcow2 image L2.qcow in L1. QEMU throws an error
when trying to launch L2.

# sudo ./qemu/build/qemu-system-x86_64 \
-enable-kvm \
-drive file=//root/L2.qcow2,media=disk,if=virtio \
-netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0 \
-device virtio-net-pci,netdev=vhost-vdpa0,bus=pcie.0,addr=0x7,disable-legacy=on,disable-modern=off,event_idx=off,packed=on \
-nographic \
-m 2G \
-smp 2 \
-M q35 \
-cpu host \
2>&1 | tee vm.log

qemu-system-x86_64: -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0: Could not open '/dev/vhost-vdpa-0': Unknown error 524

I get the same error when trying to launch L2 with qemu-kvm
which I installed using "dnf install".

# qemu-kvm --version
QEMU emulator version 8.1.3 (qemu-8.1.3-5.fc39)

The minimum version of QEMU required is v7.0.0-rc4.

According to "include/linux/errno.h" [2], errno 524 is
ENOTSUPP (operation is not supported). I am not sure
where I am going wrong.

However, I managed to set up scenario 4 successfully
and I see that packed vq is enabled in this case.

# cut -c 35 /sys/devices/pci0000:00/0000:00:04.0/virtio1/features
1

For the time being, shall I simply continue testing with
scenario 4?

> Thanks!
> 
> [1]
> https://www.redhat.com/en/blog/hands-vdpa-what-do-you-do-when-you-aint-got-
> hardware-part-2
> > Thanks,
> > Sahil
> > 
> > [1]
> > https://www.redhat.com/en/blog/hands-vdpa-what-do-you-do-when-you-aint-go
> > t-hardware-part-1 [2]
> > https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_c
> > onfig.h#L63

Thanks,
Sahil

[1] https://www.redhat.com/en/blog/hands-vdpa-what-do-you-do-when-you-aint-got-hardware-part-2
[2] https://github.com/torvalds/linux/blob/master/include/linux/errno.h#L27
diff mbox series

Patch

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 4c308ee53d..f4285db2b4 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -645,6 +645,8 @@  void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
 
 /**
  * Get the shadow vq vring address.
+ * This is used irrespective of whether the
+ * split or packed vq format is used.
  * @svq: Shadow virtqueue
  * @addr: Destination to store address
  */
@@ -672,6 +674,16 @@  size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
     return ROUND_UP(used_size, qemu_real_host_page_size());
 }
 
+size_t vhost_svq_memory_packed(const VhostShadowVirtqueue *svq)
+{
+    size_t desc_size = sizeof(struct vring_packed_desc) * svq->num_free;
+    size_t driver_event_suppression = sizeof(struct vring_packed_desc_event);
+    size_t device_event_suppression = sizeof(struct vring_packed_desc_event);
+
+    return ROUND_UP(desc_size + driver_event_suppression + device_event_suppression,
+                    qemu_real_host_page_size());
+}
+
 /**
  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
  *
@@ -726,17 +738,30 @@  void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
 
     svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
     svq->num_free = svq->vring.num;
-    svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
-                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
-                           -1, 0);
-    desc_size = sizeof(vring_desc_t) * svq->vring.num;
-    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
-    svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
-                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
-                           -1, 0);
-    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
-    svq->desc_next = g_new0(uint16_t, svq->vring.num);
-    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
+    svq->is_packed = virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED);
+
+    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED)) {
+        svq->vring_packed.vring.desc = mmap(NULL, vhost_svq_memory_packed(svq),
+                                            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+                                            -1, 0);
+        desc_size = sizeof(struct vring_packed_desc) * svq->vring.num;
+        svq->vring_packed.vring.driver = (void *)((char *)svq->vring_packed.vring.desc + desc_size);
+        svq->vring_packed.vring.device = (void *)((char *)svq->vring_packed.vring.driver +
+                                                  sizeof(struct vring_packed_desc_event));
+    } else {
+        svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
+                               PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+                               -1, 0);
+        desc_size = sizeof(vring_desc_t) * svq->vring.num;
+        svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
+        svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
+                               PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+                               -1, 0);
+    }
+
+    svq->desc_state = g_new0(SVQDescState, svq->num_free);
+    svq->desc_next = g_new0(uint16_t, svq->num_free);
+    for (unsigned i = 0; i < svq->num_free - 1; i++) {
         svq->desc_next[i] = cpu_to_le16(i + 1);
     }
 }
@@ -776,8 +801,13 @@  void vhost_svq_stop(VhostShadowVirtqueue *svq)
     svq->vq = NULL;
     g_free(svq->desc_next);
     g_free(svq->desc_state);
-    munmap(svq->vring.desc, vhost_svq_driver_area_size(svq));
-    munmap(svq->vring.used, vhost_svq_device_area_size(svq));
+
+    if (svq->is_packed) {
+        munmap(svq->vring_packed.vring.desc, vhost_svq_memory_packed(svq));
+    } else {
+        munmap(svq->vring.desc, vhost_svq_driver_area_size(svq));
+        munmap(svq->vring.used, vhost_svq_device_area_size(svq));
+    }
     event_notifier_set_handler(&svq->hdev_call, NULL);
 }
 
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index ee1a87f523..03b722a186 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -67,6 +67,9 @@  struct vring_packed {
 
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
+    /* True if packed virtqueue */
+    bool is_packed;
+
     /* Virtio queue shadowing */
     VirtQueue *vq;
 
@@ -150,6 +153,7 @@  void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                               struct vhost_vring_addr *addr);
 size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
 size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
+size_t vhost_svq_memory_packed(const VhostShadowVirtqueue *svq);
 
 void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                      VirtQueue *vq, VhostIOVATree *iova_tree);