diff mbox series

[v5,4/5] vfio-user: Message-based DMA support

Message ID 20230920080622.3600226-5-mnissler@rivosinc.com
State New
Headers show
Series Support message-based DMA in vfio-user server | expand

Commit Message

Mattias Nissler Sept. 20, 2023, 8:06 a.m. UTC
Wire up support for DMA for the case where the vfio-user client does not
provide mmap()-able file descriptors, but DMA requests must be performed
via the VFIO-user protocol. This installs an indirect memory region,
which already works for pci_dma_{read,write}, and pci_dma_map works
thanks to the existing DMA bounce buffering support.

Note that while simple scenarios work with this patch, there's a known
race condition in libvfio-user that will mess up the communication
channel. See https://github.com/nutanix/libvfio-user/issues/279 for
details as well as a proposed fix.

Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
---
 hw/remote/trace-events    |  2 +
 hw/remote/vfio-user-obj.c | 84 +++++++++++++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 7 deletions(-)

Comments

Jag Raman Oct. 4, 2023, 2:54 p.m. UTC | #1
> On Sep 20, 2023, at 4:06 AM, Mattias Nissler <mnissler@rivosinc.com> wrote:
> 
> Wire up support for DMA for the case where the vfio-user client does not
> provide mmap()-able file descriptors, but DMA requests must be performed
> via the VFIO-user protocol. This installs an indirect memory region,
> which already works for pci_dma_{read,write}, and pci_dma_map works
> thanks to the existing DMA bounce buffering support.
> 
> Note that while simple scenarios work with this patch, there's a known
> race condition in libvfio-user that will mess up the communication
> channel. See https://github.com/nutanix/libvfio-user/issues/279 for
> details as well as a proposed fix.
> 
> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> ---
> hw/remote/trace-events    |  2 +
> hw/remote/vfio-user-obj.c | 84 +++++++++++++++++++++++++++++++++++----
> 2 files changed, 79 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/remote/trace-events b/hw/remote/trace-events
> index 0d1b7d56a5..358a68fb34 100644
> --- a/hw/remote/trace-events
> +++ b/hw/remote/trace-events
> @@ -9,6 +9,8 @@ vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
> vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
> vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
> vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
> +vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu bytes"
> +vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64", %zu bytes"
> vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
> vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
> vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index 8b10c32a3c..6a561f7969 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -300,6 +300,63 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
>     return count;
> }
> 
> +static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
> +                                unsigned size, MemTxAttrs attrs)
> +{
> +    MemoryRegion *region = opaque;
> +    vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
> +    uint8_t buf[sizeof(uint64_t)];
> +
> +    trace_vfu_dma_read(region->addr + addr, size);
> +
> +    g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
> +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> +    if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
> +        vfu_sgl_read(vfu_ctx, sg, 1, buf) != 0) {
> +        return MEMTX_ERROR;
> +    }
> +
> +    *val = ldn_he_p(buf, size);
> +
> +    return MEMTX_OK;
> +}
> +
> +static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
> +                                 unsigned size, MemTxAttrs attrs)
> +{
> +    MemoryRegion *region = opaque;
> +    vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
> +    uint8_t buf[sizeof(uint64_t)];
> +
> +    trace_vfu_dma_write(region->addr + addr, size);
> +
> +    stn_he_p(buf, size, val);
> +
> +    g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
> +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> +    if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
> +        vfu_sgl_write(vfu_ctx, sg, 1, buf) != 0) {
> +        return MEMTX_ERROR;
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps vfu_dma_ops = {
> +    .read_with_attrs = vfu_dma_read,
> +    .write_with_attrs = vfu_dma_write,
> +    .endianness = DEVICE_HOST_ENDIAN,
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    },
> +};
> +
> static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
> {
>     VfuObject *o = vfu_get_private(vfu_ctx);
> @@ -308,17 +365,30 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>     g_autofree char *name = NULL;
>     struct iovec *iov = &info->iova;
> 
> -    if (!info->vaddr) {
> -        return;
> -    }
> -
>     name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
> -                           (uint64_t)info->vaddr);
> +                           (uint64_t)iov->iov_base);
> 
>     subregion = g_new0(MemoryRegion, 1);
> 
> -    memory_region_init_ram_ptr(subregion, NULL, name,
> -                               iov->iov_len, info->vaddr);
> +    if (info->vaddr) {
> +        memory_region_init_ram_ptr(subregion, OBJECT(o), name,
> +                                   iov->iov_len, info->vaddr);
> +    } else {
> +        /*
> +         * Note that I/O regions' MemoryRegionOps handle accesses of at most 8
> +         * bytes at a time, and larger accesses are broken down. However,
> +         * many/most DMA accesses are larger than 8 bytes and VFIO-user can
> +         * handle large DMA accesses just fine, thus this size restriction
> +         * unnecessarily hurts performance, in particular given that each
> +         * access causes a round trip on the VFIO-user socket.
> +         *
> +         * TODO: Investigate how to plumb larger accesses through memory
> +         * regions, possibly by amending MemoryRegionOps or by creating a new
> +         * memory region type.
> +         */
> +        memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops, subregion,
> +                              name, iov->iov_len);

Hi Mattias,

We should update dma_unregister() to ensure we remove this subregion.
dma_unregister() presently removes the RAM region, but not this one.

--
Jag

> +    }
> 
>     dma_as = pci_device_iommu_address_space(o->pci_dev);
> 
> -- 
> 2.34.1
>
Mattias Nissler Oct. 6, 2023, 7:56 a.m. UTC | #2
On Wed, Oct 4, 2023 at 4:54 PM Jag Raman <jag.raman@oracle.com> wrote:

>
>
> > On Sep 20, 2023, at 4:06 AM, Mattias Nissler <mnissler@rivosinc.com>
> wrote:
> >
> > Wire up support for DMA for the case where the vfio-user client does not
> > provide mmap()-able file descriptors, but DMA requests must be performed
> > via the VFIO-user protocol. This installs an indirect memory region,
> > which already works for pci_dma_{read,write}, and pci_dma_map works
> > thanks to the existing DMA bounce buffering support.
> >
> > Note that while simple scenarios work with this patch, there's a known
> > race condition in libvfio-user that will mess up the communication
> > channel. See https://github.com/nutanix/libvfio-user/issues/279 for
> > details as well as a proposed fix.
> >
> > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> > ---
> > hw/remote/trace-events    |  2 +
> > hw/remote/vfio-user-obj.c | 84 +++++++++++++++++++++++++++++++++++----
> > 2 files changed, 79 insertions(+), 7 deletions(-)
> >
> > diff --git a/hw/remote/trace-events b/hw/remote/trace-events
> > index 0d1b7d56a5..358a68fb34 100644
> > --- a/hw/remote/trace-events
> > +++ b/hw/remote/trace-events
> > @@ -9,6 +9,8 @@ vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg:
> 0x%x -> 0x%x"
> > vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
> > vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA
> 0x%"PRIx64", %zu bytes"
> > vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
> > +vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu
> bytes"
> > +vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64",
> %zu bytes"
> > vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr
> 0x%"PRIx64" size 0x%"PRIx64""
> > vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR
> address 0x%"PRIx64""
> > vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR
> address 0x%"PRIx64""
> > diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> > index 8b10c32a3c..6a561f7969 100644
> > --- a/hw/remote/vfio-user-obj.c
> > +++ b/hw/remote/vfio-user-obj.c
> > @@ -300,6 +300,63 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t
> *vfu_ctx, char * const buf,
> >     return count;
> > }
> >
> > +static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t
> *val,
> > +                                unsigned size, MemTxAttrs attrs)
> > +{
> > +    MemoryRegion *region = opaque;
> > +    vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
> > +    uint8_t buf[sizeof(uint64_t)];
> > +
> > +    trace_vfu_dma_read(region->addr + addr, size);
> > +
> > +    g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
> > +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> > +    if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0
> ||
> > +        vfu_sgl_read(vfu_ctx, sg, 1, buf) != 0) {
> > +        return MEMTX_ERROR;
> > +    }
> > +
> > +    *val = ldn_he_p(buf, size);
> > +
> > +    return MEMTX_OK;
> > +}
> > +
> > +static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t
> val,
> > +                                 unsigned size, MemTxAttrs attrs)
> > +{
> > +    MemoryRegion *region = opaque;
> > +    vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
> > +    uint8_t buf[sizeof(uint64_t)];
> > +
> > +    trace_vfu_dma_write(region->addr + addr, size);
> > +
> > +    stn_he_p(buf, size, val);
> > +
> > +    g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
> > +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> > +    if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0
> ||
> > +        vfu_sgl_write(vfu_ctx, sg, 1, buf) != 0) {
> > +        return MEMTX_ERROR;
> > +    }
> > +
> > +    return MEMTX_OK;
> > +}
> > +
> > +static const MemoryRegionOps vfu_dma_ops = {
> > +    .read_with_attrs = vfu_dma_read,
> > +    .write_with_attrs = vfu_dma_write,
> > +    .endianness = DEVICE_HOST_ENDIAN,
> > +    .valid = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 8,
> > +        .unaligned = true,
> > +    },
> > +    .impl = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 8,
> > +    },
> > +};
> > +
> > static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
> > {
> >     VfuObject *o = vfu_get_private(vfu_ctx);
> > @@ -308,17 +365,30 @@ static void dma_register(vfu_ctx_t *vfu_ctx,
> vfu_dma_info_t *info)
> >     g_autofree char *name = NULL;
> >     struct iovec *iov = &info->iova;
> >
> > -    if (!info->vaddr) {
> > -        return;
> > -    }
> > -
> >     name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
> > -                           (uint64_t)info->vaddr);
> > +                           (uint64_t)iov->iov_base);
> >
> >     subregion = g_new0(MemoryRegion, 1);
> >
> > -    memory_region_init_ram_ptr(subregion, NULL, name,
> > -                               iov->iov_len, info->vaddr);
> > +    if (info->vaddr) {
> > +        memory_region_init_ram_ptr(subregion, OBJECT(o), name,
> > +                                   iov->iov_len, info->vaddr);
> > +    } else {
> > +        /*
> > +         * Note that I/O regions' MemoryRegionOps handle accesses of at
> most 8
> > +         * bytes at a time, and larger accesses are broken down.
> However,
> > +         * many/most DMA accesses are larger than 8 bytes and VFIO-user
> can
> > +         * handle large DMA accesses just fine, thus this size
> restriction
> > +         * unnecessarily hurts performance, in particular given that
> each
> > +         * access causes a round trip on the VFIO-user socket.
> > +         *
> > +         * TODO: Investigate how to plumb larger accesses through memory
> > +         * regions, possibly by amending MemoryRegionOps or by creating
> a new
> > +         * memory region type.
> > +         */
> > +        memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops,
> subregion,
> > +                              name, iov->iov_len);
>
> Hi Mattias,
>
> We should update dma_unregister() to ensure we remove this subregion.
> dma_unregister() presently removes the RAM region, but not this one.
>

Oops. Fixing.


>
> --
> Jag
>
> > +    }
> >
> >     dma_as = pci_device_iommu_address_space(o->pci_dev);
> >
> > --
> > 2.34.1
> >
>
>
diff mbox series

Patch

diff --git a/hw/remote/trace-events b/hw/remote/trace-events
index 0d1b7d56a5..358a68fb34 100644
--- a/hw/remote/trace-events
+++ b/hw/remote/trace-events
@@ -9,6 +9,8 @@  vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
 vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
 vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
 vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
+vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu bytes"
+vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64", %zu bytes"
 vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
 vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
 vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 8b10c32a3c..6a561f7969 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -300,6 +300,63 @@  static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
     return count;
 }
 
+static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
+                                unsigned size, MemTxAttrs attrs)
+{
+    MemoryRegion *region = opaque;
+    vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
+    uint8_t buf[sizeof(uint64_t)];
+
+    trace_vfu_dma_read(region->addr + addr, size);
+
+    g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
+    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
+    if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
+        vfu_sgl_read(vfu_ctx, sg, 1, buf) != 0) {
+        return MEMTX_ERROR;
+    }
+
+    *val = ldn_he_p(buf, size);
+
+    return MEMTX_OK;
+}
+
+static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
+                                 unsigned size, MemTxAttrs attrs)
+{
+    MemoryRegion *region = opaque;
+    vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
+    uint8_t buf[sizeof(uint64_t)];
+
+    trace_vfu_dma_write(region->addr + addr, size);
+
+    stn_he_p(buf, size, val);
+
+    g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
+    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
+    if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
+        vfu_sgl_write(vfu_ctx, sg, 1, buf) != 0) {
+        return MEMTX_ERROR;
+    }
+
+    return MEMTX_OK;
+}
+
+static const MemoryRegionOps vfu_dma_ops = {
+    .read_with_attrs = vfu_dma_read,
+    .write_with_attrs = vfu_dma_write,
+    .endianness = DEVICE_HOST_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = true,
+    },
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    },
+};
+
 static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
 {
     VfuObject *o = vfu_get_private(vfu_ctx);
@@ -308,17 +365,30 @@  static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
     g_autofree char *name = NULL;
     struct iovec *iov = &info->iova;
 
-    if (!info->vaddr) {
-        return;
-    }
-
     name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
-                           (uint64_t)info->vaddr);
+                           (uint64_t)iov->iov_base);
 
     subregion = g_new0(MemoryRegion, 1);
 
-    memory_region_init_ram_ptr(subregion, NULL, name,
-                               iov->iov_len, info->vaddr);
+    if (info->vaddr) {
+        memory_region_init_ram_ptr(subregion, OBJECT(o), name,
+                                   iov->iov_len, info->vaddr);
+    } else {
+        /*
+         * Note that I/O regions' MemoryRegionOps handle accesses of at most 8
+         * bytes at a time, and larger accesses are broken down. However,
+         * many/most DMA accesses are larger than 8 bytes and VFIO-user can
+         * handle large DMA accesses just fine, thus this size restriction
+         * unnecessarily hurts performance, in particular given that each
+         * access causes a round trip on the VFIO-user socket.
+         *
+         * TODO: Investigate how to plumb larger accesses through memory
+         * regions, possibly by amending MemoryRegionOps or by creating a new
+         * memory region type.
+         */
+        memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops, subregion,
+                              name, iov->iov_len);
+    }
 
     dma_as = pci_device_iommu_address_space(o->pci_dev);