
[v26,13/17] vfio: create mapped iova list when vIOMMU is enabled

Message ID 1600817059-26721-14-git-send-email-kwankhede@nvidia.com
State New
Series Add migration support for VFIO devices

Commit Message

Kirti Wankhede Sept. 22, 2020, 11:24 p.m. UTC
Create mapped iova list when vIOMMU is enabled. For each mapped iova
save translated address. Add node to list on MAP and remove node from
list on UNMAP.
This list is used to track dirty pages during migration.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
---
 hw/vfio/common.c              | 58 ++++++++++++++++++++++++++++++++++++++-----
 include/hw/vfio/vfio-common.h |  8 ++++++
 2 files changed, 60 insertions(+), 6 deletions(-)
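
For context, the list built by this patch is only the producer side; the
dirty bitmap sync added elsewhere in this series is the consumer. As a
rough illustration only (the helper name and signature below are
assumptions, not code from the series), a sync path could walk the list
like this:

/*
 * Illustrative sketch, not part of this patch: walk the per-vIOMMU iova
 * list during dirty bitmap sync and query the kernel for each mapped
 * range.  vfio_get_dirty_bitmap() is assumed to wrap the
 * VFIO_IOMMU_DIRTY_PAGES ioctl introduced earlier in this series.
 */
static int vfio_iommu_sync_dirty_bitmap(VFIOContainer *container,
                                        VFIOGuestIOMMU *giommu)
{
    VFIOIovaRange *range;
    int ret = 0;

    QLIST_FOREACH(range, &giommu->iova_list, next) {
        ret = vfio_get_dirty_bitmap(container, range->iova, range->size,
                                    range->ram_addr);
        if (ret) {
            break;
        }
    }

    return ret;
}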

Comments

Alex Williamson Sept. 25, 2020, 10:23 p.m. UTC | #1
On Wed, 23 Sep 2020 04:54:15 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Create mapped iova list when vIOMMU is enabled. For each mapped iova
> save translated address. Add node to list on MAP and remove node from
> list on UNMAP.
> This list is used to track dirty pages during migration.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> ---
>  hw/vfio/common.c              | 58 ++++++++++++++++++++++++++++++++++++++-----
>  include/hw/vfio/vfio-common.h |  8 ++++++
>  2 files changed, 60 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index d4959c036dd1..dc56cded2d95 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -407,8 +407,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>  }
>  
>  /* Called with rcu_read_lock held.  */
> -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                           bool *read_only)
> +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> +                               ram_addr_t *ram_addr, bool *read_only)
>  {
>      MemoryRegion *mr;
>      hwaddr xlat;
> @@ -439,8 +439,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
>          return false;
>      }
>  
> -    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> -    *read_only = !writable || mr->readonly;
> +    if (vaddr) {
> +        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> +    }
> +
> +    if (ram_addr) {
> +        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
> +    }
> +
> +    if (read_only) {
> +        *read_only = !writable || mr->readonly;
> +    }
>  
>      return true;
>  }
> @@ -450,7 +459,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>      VFIOContainer *container = giommu->container;
>      hwaddr iova = iotlb->iova + giommu->iommu_offset;
> -    bool read_only;
>      void *vaddr;
>      int ret;
>  
> @@ -466,7 +474,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      rcu_read_lock();
>  
>      if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> -        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
> +        ram_addr_t ram_addr;
> +        bool read_only;
> +
> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, &ram_addr, &read_only)) {
>              goto out;
>          }
>          /*
> @@ -484,8 +495,28 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>                           "0x%"HWADDR_PRIx", %p) = %d (%m)",
>                           container, iova,
>                           iotlb->addr_mask + 1, vaddr, ret);
> +        } else {
> +            VFIOIovaRange *iova_range;
> +
> +            iova_range = g_malloc0(sizeof(*iova_range));
> +            iova_range->iova = iova;
> +            iova_range->size = iotlb->addr_mask + 1;
> +            iova_range->ram_addr = ram_addr;
> +
> +            QLIST_INSERT_HEAD(&giommu->iova_list, iova_range, next);
>          }
>      } else {
> +        VFIOIovaRange *iova_range, *tmp;
> +
> +        QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
> +            if (iova_range->iova >= iova &&
> +                iova_range->iova + iova_range->size <= iova +
> +                                                       iotlb->addr_mask + 1) {
> +                QLIST_REMOVE(iova_range, next);
> +                g_free(iova_range);
> +            }
> +        }
> +


This is some pretty serious overhead... can't we trigger a replay when
migration is enabled to build this information then?  We're looking at
potentially thousands of entries, so a list is probably also not a good
choice.  I don't think it's acceptable to incur this even when not
migrating (ie. the vast majority of the time).  Thanks,

Alex

>          ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
>          if (ret) {
>              error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> @@ -642,6 +673,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
>              g_free(giommu);
>              goto fail;
>          }
> +        QLIST_INIT(&giommu->iova_list);
>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>          memory_region_iommu_replay(giommu->iommu, &giommu->n);
>  
> @@ -740,6 +772,13 @@ static void vfio_listener_region_del(MemoryListener *listener,
>          QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
>              if (MEMORY_REGION(giommu->iommu) == section->mr &&
>                  giommu->n.start == section->offset_within_region) {
> +                VFIOIovaRange *iova_range, *tmp;
> +
> +                QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
> +                    QLIST_REMOVE(iova_range, next);
> +                    g_free(iova_range);
> +                }
> +
>                  memory_region_unregister_iommu_notifier(section->mr,
>                                                          &giommu->n);
>                  QLIST_REMOVE(giommu, giommu_next);
> @@ -1541,6 +1580,13 @@ static void vfio_disconnect_container(VFIOGroup *group)
>          QLIST_REMOVE(container, next);
>  
>          QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
> +            VFIOIovaRange *iova_range, *itmp;
> +
> +            QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, itmp) {
> +                QLIST_REMOVE(iova_range, next);
> +                g_free(iova_range);
> +            }
> +
>              memory_region_unregister_iommu_notifier(
>                      MEMORY_REGION(giommu->iommu), &giommu->n);
>              QLIST_REMOVE(giommu, giommu_next);
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 0a1651eda2d0..aa7524fe2cc5 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -89,11 +89,19 @@ typedef struct VFIOContainer {
>      QLIST_ENTRY(VFIOContainer) next;
>  } VFIOContainer;
>  
> +typedef struct VFIOIovaRange {
> +    hwaddr iova;
> +    size_t size;
> +    ram_addr_t ram_addr;
> +    QLIST_ENTRY(VFIOIovaRange) next;
> +} VFIOIovaRange;
> +
>  typedef struct VFIOGuestIOMMU {
>      VFIOContainer *container;
>      IOMMUMemoryRegion *iommu;
>      hwaddr iommu_offset;
>      IOMMUNotifier n;
> +    QLIST_HEAD(, VFIOIovaRange) iova_list;
>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
>  } VFIOGuestIOMMU;
>
Kirti Wankhede Oct. 19, 2020, 6:01 a.m. UTC | #2
On 9/26/2020 3:53 AM, Alex Williamson wrote:
> On Wed, 23 Sep 2020 04:54:15 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> Create mapped iova list when vIOMMU is enabled. For each mapped iova
>> save translated address. Add node to list on MAP and remove node from
>> list on UNMAP.
>> This list is used to track dirty pages during migration.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> ---
>>   hw/vfio/common.c              | 58 ++++++++++++++++++++++++++++++++++++++-----
>>   include/hw/vfio/vfio-common.h |  8 ++++++
>>   2 files changed, 60 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index d4959c036dd1..dc56cded2d95 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -407,8 +407,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>   }
>>   
>>   /* Called with rcu_read_lock held.  */
>> -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
>> -                           bool *read_only)
>> +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
>> +                               ram_addr_t *ram_addr, bool *read_only)
>>   {
>>       MemoryRegion *mr;
>>       hwaddr xlat;
>> @@ -439,8 +439,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
>>           return false;
>>       }
>>   
>> -    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
>> -    *read_only = !writable || mr->readonly;
>> +    if (vaddr) {
>> +        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
>> +    }
>> +
>> +    if (ram_addr) {
>> +        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
>> +    }
>> +
>> +    if (read_only) {
>> +        *read_only = !writable || mr->readonly;
>> +    }
>>   
>>       return true;
>>   }
>> @@ -450,7 +459,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>>       VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>       VFIOContainer *container = giommu->container;
>>       hwaddr iova = iotlb->iova + giommu->iommu_offset;
>> -    bool read_only;
>>       void *vaddr;
>>       int ret;
>>   
>> @@ -466,7 +474,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>>       rcu_read_lock();
>>   
>>       if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
>> -        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
>> +        ram_addr_t ram_addr;
>> +        bool read_only;
>> +
>> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, &ram_addr, &read_only)) {
>>               goto out;
>>           }
>>           /*
>> @@ -484,8 +495,28 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>>                            "0x%"HWADDR_PRIx", %p) = %d (%m)",
>>                            container, iova,
>>                            iotlb->addr_mask + 1, vaddr, ret);
>> +        } else {
>> +            VFIOIovaRange *iova_range;
>> +
>> +            iova_range = g_malloc0(sizeof(*iova_range));
>> +            iova_range->iova = iova;
>> +            iova_range->size = iotlb->addr_mask + 1;
>> +            iova_range->ram_addr = ram_addr;
>> +
>> +            QLIST_INSERT_HEAD(&giommu->iova_list, iova_range, next);
>>           }
>>       } else {
>> +        VFIOIovaRange *iova_range, *tmp;
>> +
>> +        QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
>> +            if (iova_range->iova >= iova &&
>> +                iova_range->iova + iova_range->size <= iova +
>> +                                                       iotlb->addr_mask + 1) {
>> +                QLIST_REMOVE(iova_range, next);
>> +                g_free(iova_range);
>> +            }
>> +        }
>> +
> 
> 
> This is some pretty serious overhead... can't we trigger a replay when
> migration is enabled to build this information then? 

Are you suggesting calling memory_region_iommu_replay() before
vfio_sync_dirty_bitmap(), which would call vfio_iommu_map_notify() where
the iova mapping list is maintained? Then, in the notifier, check whether
migration_is_running() and container->dirty_pages_supported == true, and
only then create the iova mapping tree? In that case, how would we know
that this is triggered by
vfio_sync_dirty_bitmap()
  -> memory_region_iommu_replay()
and that we don't have to call vfio_dma_map()?

> We're looking at
> potentially thousands of entries, so a list is probably also not a good
> choice. 

Changing it to tree.

Thanks,
Kirti

> I don't think it's acceptable to incur this even when not
> migrating (ie. the vast majority of the time).  Thanks,
> 
> Alex
> 
>>           ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
>>           if (ret) {
>>               error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>> @@ -642,6 +673,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>               g_free(giommu);
>>               goto fail;
>>           }
>> +        QLIST_INIT(&giommu->iova_list);
>>           QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>>           memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>   
>> @@ -740,6 +772,13 @@ static void vfio_listener_region_del(MemoryListener *listener,
>>           QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
>>               if (MEMORY_REGION(giommu->iommu) == section->mr &&
>>                   giommu->n.start == section->offset_within_region) {
>> +                VFIOIovaRange *iova_range, *tmp;
>> +
>> +                QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
>> +                    QLIST_REMOVE(iova_range, next);
>> +                    g_free(iova_range);
>> +                }
>> +
>>                   memory_region_unregister_iommu_notifier(section->mr,
>>                                                           &giommu->n);
>>                   QLIST_REMOVE(giommu, giommu_next);
>> @@ -1541,6 +1580,13 @@ static void vfio_disconnect_container(VFIOGroup *group)
>>           QLIST_REMOVE(container, next);
>>   
>>           QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
>> +            VFIOIovaRange *iova_range, *itmp;
>> +
>> +            QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, itmp) {
>> +                QLIST_REMOVE(iova_range, next);
>> +                g_free(iova_range);
>> +            }
>> +
>>               memory_region_unregister_iommu_notifier(
>>                       MEMORY_REGION(giommu->iommu), &giommu->n);
>>               QLIST_REMOVE(giommu, giommu_next);
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 0a1651eda2d0..aa7524fe2cc5 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -89,11 +89,19 @@ typedef struct VFIOContainer {
>>       QLIST_ENTRY(VFIOContainer) next;
>>   } VFIOContainer;
>>   
>> +typedef struct VFIOIovaRange {
>> +    hwaddr iova;
>> +    size_t size;
>> +    ram_addr_t ram_addr;
>> +    QLIST_ENTRY(VFIOIovaRange) next;
>> +} VFIOIovaRange;
>> +
>>   typedef struct VFIOGuestIOMMU {
>>       VFIOContainer *container;
>>       IOMMUMemoryRegion *iommu;
>>       hwaddr iommu_offset;
>>       IOMMUNotifier n;
>> +    QLIST_HEAD(, VFIOIovaRange) iova_list;
>>       QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
>>   } VFIOGuestIOMMU;
>>   
>
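
One possible shape for the "changing it to tree" rework mentioned above,
sketched with GLib's GTree keyed by iova. This is illustrative only: the
iova_tree field and helper names are assumptions, not code from the
series, and QEMU's existing util/iova-tree.c (IOVATree) would be another
candidate.

/*
 * Illustrative only: replace the per-giommu QLIST with a GTree keyed by
 * iova so MAP/UNMAP updates are O(log n) rather than O(n).
 */
static gint vfio_iova_cmp(gconstpointer a, gconstpointer b, gpointer data)
{
    hwaddr ia = *(const hwaddr *)a;
    hwaddr ib = *(const hwaddr *)b;

    return ia < ib ? -1 : (ia > ib ? 1 : 0);
}

static void vfio_giommu_init_iova_tree(VFIOGuestIOMMU *giommu)
{
    /* values (VFIOIovaRange *) are freed automatically on removal */
    giommu->iova_tree = g_tree_new_full(vfio_iova_cmp, NULL, NULL, g_free);
}

static void vfio_giommu_add_range(VFIOGuestIOMMU *giommu, hwaddr iova,
                                  size_t size, ram_addr_t ram_addr)
{
    VFIOIovaRange *range = g_new0(VFIOIovaRange, 1);

    range->iova = iova;
    range->size = size;
    range->ram_addr = ram_addr;
    g_tree_insert(giommu->iova_tree, &range->iova, range);
}

static void vfio_giommu_del_range(VFIOGuestIOMMU *giommu, hwaddr iova)
{
    /*
     * Exact-key removal; an UNMAP spanning several entries would need a
     * bounded walk (e.g. g_tree_foreach()) instead, mirroring the
     * QLIST_FOREACH_SAFE loop in the patch.
     */
    g_tree_remove(giommu->iova_tree, &iova);
}
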
Alex Williamson Oct. 19, 2020, 5:24 p.m. UTC | #3
On Mon, 19 Oct 2020 11:31:03 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 9/26/2020 3:53 AM, Alex Williamson wrote:
> > On Wed, 23 Sep 2020 04:54:15 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> Create mapped iova list when vIOMMU is enabled. For each mapped iova
> >> save translated address. Add node to list on MAP and remove node from
> >> list on UNMAP.
> >> This list is used to track dirty pages during migration.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> ---
> >>   hw/vfio/common.c              | 58 ++++++++++++++++++++++++++++++++++++++-----
> >>   include/hw/vfio/vfio-common.h |  8 ++++++
> >>   2 files changed, 60 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >> index d4959c036dd1..dc56cded2d95 100644
> >> --- a/hw/vfio/common.c
> >> +++ b/hw/vfio/common.c
> >> @@ -407,8 +407,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> >>   }
> >>   
> >>   /* Called with rcu_read_lock held.  */
> >> -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
> >> -                           bool *read_only)
> >> +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> >> +                               ram_addr_t *ram_addr, bool *read_only)
> >>   {
> >>       MemoryRegion *mr;
> >>       hwaddr xlat;
> >> @@ -439,8 +439,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
> >>           return false;
> >>       }
> >>   
> >> -    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> >> -    *read_only = !writable || mr->readonly;
> >> +    if (vaddr) {
> >> +        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> >> +    }
> >> +
> >> +    if (ram_addr) {
> >> +        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
> >> +    }
> >> +
> >> +    if (read_only) {
> >> +        *read_only = !writable || mr->readonly;
> >> +    }
> >>   
> >>       return true;
> >>   }
> >> @@ -450,7 +459,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> >>       VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
> >>       VFIOContainer *container = giommu->container;
> >>       hwaddr iova = iotlb->iova + giommu->iommu_offset;
> >> -    bool read_only;
> >>       void *vaddr;
> >>       int ret;
> >>   
> >> @@ -466,7 +474,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> >>       rcu_read_lock();
> >>   
> >>       if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> >> -        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
> >> +        ram_addr_t ram_addr;
> >> +        bool read_only;
> >> +
> >> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, &ram_addr, &read_only)) {
> >>               goto out;
> >>           }
> >>           /*
> >> @@ -484,8 +495,28 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> >>                            "0x%"HWADDR_PRIx", %p) = %d (%m)",
> >>                            container, iova,
> >>                            iotlb->addr_mask + 1, vaddr, ret);
> >> +        } else {
> >> +            VFIOIovaRange *iova_range;
> >> +
> >> +            iova_range = g_malloc0(sizeof(*iova_range));
> >> +            iova_range->iova = iova;
> >> +            iova_range->size = iotlb->addr_mask + 1;
> >> +            iova_range->ram_addr = ram_addr;
> >> +
> >> +            QLIST_INSERT_HEAD(&giommu->iova_list, iova_range, next);
> >>           }
> >>       } else {
> >> +        VFIOIovaRange *iova_range, *tmp;
> >> +
> >> +        QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
> >> +            if (iova_range->iova >= iova &&
> >> +                iova_range->iova + iova_range->size <= iova +
> >> +                                                       iotlb->addr_mask + 1) {
> >> +                QLIST_REMOVE(iova_range, next);
> >> +                g_free(iova_range);
> >> +            }
> >> +        }
> >> +  
> > 
> > 
> > This is some pretty serious overhead... can't we trigger a replay when
> > migration is enabled to build this information then?   
> 
> Are you suggesting to call memory_region_iommu_replay() before 
> vfio_sync_dirty_bitmap(), which would call vfio_iommu_map_notify() where 
> iova list of mapping is maintained? Then in the notifier check if 
> migration_is_running() and container->dirty_pages_supported == true, 
> then only create iova mapping tree? In this case how would we know that 
> this is triggered by
> vfio_sync_dirty_bitmap()
>   -> memory_region_iommu_replay()  
> and we don't have to call vfio_dma_map()?

memory_region_iommu_replay() calls a notifier of our choice, so we
could create a notifier specifically for creating this tree when dirty
logging is enabled.  Thanks,

Alex

> > We're looking at
> > potentially thousands of entries, so a list is probably also not a good
> > choice.   
> 
> Changing it to tree.
> 
> Thanks,
> Kirti
> 
> > I don't think it's acceptable to incur this even when not
> > migrating (ie. the vast majority of the time).  Thanks,
> > 
> > Alex
> >   
> >>           ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
> >>           if (ret) {
> >>               error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> >> @@ -642,6 +673,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
> >>               g_free(giommu);
> >>               goto fail;
> >>           }
> >> +        QLIST_INIT(&giommu->iova_list);
> >>           QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> >>           memory_region_iommu_replay(giommu->iommu, &giommu->n);
> >>   
> >> @@ -740,6 +772,13 @@ static void vfio_listener_region_del(MemoryListener *listener,
> >>           QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
> >>               if (MEMORY_REGION(giommu->iommu) == section->mr &&
> >>                   giommu->n.start == section->offset_within_region) {
> >> +                VFIOIovaRange *iova_range, *tmp;
> >> +
> >> +                QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
> >> +                    QLIST_REMOVE(iova_range, next);
> >> +                    g_free(iova_range);
> >> +                }
> >> +
> >>                   memory_region_unregister_iommu_notifier(section->mr,
> >>                                                           &giommu->n);
> >>                   QLIST_REMOVE(giommu, giommu_next);
> >> @@ -1541,6 +1580,13 @@ static void vfio_disconnect_container(VFIOGroup *group)
> >>           QLIST_REMOVE(container, next);
> >>   
> >>           QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
> >> +            VFIOIovaRange *iova_range, *itmp;
> >> +
> >> +            QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, itmp) {
> >> +                QLIST_REMOVE(iova_range, next);
> >> +                g_free(iova_range);
> >> +            }
> >> +
> >>               memory_region_unregister_iommu_notifier(
> >>                       MEMORY_REGION(giommu->iommu), &giommu->n);
> >>               QLIST_REMOVE(giommu, giommu_next);
> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >> index 0a1651eda2d0..aa7524fe2cc5 100644
> >> --- a/include/hw/vfio/vfio-common.h
> >> +++ b/include/hw/vfio/vfio-common.h
> >> @@ -89,11 +89,19 @@ typedef struct VFIOContainer {
> >>       QLIST_ENTRY(VFIOContainer) next;
> >>   } VFIOContainer;
> >>   
> >> +typedef struct VFIOIovaRange {
> >> +    hwaddr iova;
> >> +    size_t size;
> >> +    ram_addr_t ram_addr;
> >> +    QLIST_ENTRY(VFIOIovaRange) next;
> >> +} VFIOIovaRange;
> >> +
> >>   typedef struct VFIOGuestIOMMU {
> >>       VFIOContainer *container;
> >>       IOMMUMemoryRegion *iommu;
> >>       hwaddr iommu_offset;
> >>       IOMMUNotifier n;
> >> +    QLIST_HEAD(, VFIOIovaRange) iova_list;
> >>       QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> >>   } VFIOGuestIOMMU;
> >>     
> >   
>
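
To make the "notifier of our choice" idea above concrete, a rough sketch
of a dedicated build-only notifier (the dirty_n field and the
vfio_giommu_add_range() helper from the tree sketch earlier are
assumptions, not code from the series). It never calls vfio_dma_map(),
which is what distinguishes a replay done for dirty tracking from a
normal mapping notification.

/*
 * Illustrative sketch: a notifier used only to (re)build the iova tree
 * when dirty page tracking starts.  It records MAP events and never
 * calls vfio_dma_map(), so vfio_iommu_map_notify() needs no extra flag
 * to tell the two paths apart.
 */
static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, dirty_n);
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    ram_addr_t ram_addr;

    if ((iotlb->perm & IOMMU_RW) == IOMMU_NONE) {
        return;
    }

    rcu_read_lock();
    if (vfio_get_xlat_addr(iotlb, NULL, &ram_addr, NULL)) {
        vfio_giommu_add_range(giommu, iova, iotlb->addr_mask + 1, ram_addr);
    }
    rcu_read_unlock();
}
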
Kirti Wankhede Oct. 19, 2020, 7:15 p.m. UTC | #4
On 10/19/2020 10:54 PM, Alex Williamson wrote:
> On Mon, 19 Oct 2020 11:31:03 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 9/26/2020 3:53 AM, Alex Williamson wrote:
>>> On Wed, 23 Sep 2020 04:54:15 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>    
>>>> Create mapped iova list when vIOMMU is enabled. For each mapped iova
>>>> save translated address. Add node to list on MAP and remove node from
>>>> list on UNMAP.
>>>> This list is used to track dirty pages during migration.
>>>>
>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>> ---
>>>>    hw/vfio/common.c              | 58 ++++++++++++++++++++++++++++++++++++++-----
>>>>    include/hw/vfio/vfio-common.h |  8 ++++++
>>>>    2 files changed, 60 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index d4959c036dd1..dc56cded2d95 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -407,8 +407,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>>>    }
>>>>    
>>>>    /* Called with rcu_read_lock held.  */
>>>> -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
>>>> -                           bool *read_only)
>>>> +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
>>>> +                               ram_addr_t *ram_addr, bool *read_only)
>>>>    {
>>>>        MemoryRegion *mr;
>>>>        hwaddr xlat;
>>>> @@ -439,8 +439,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
>>>>            return false;
>>>>        }
>>>>    
>>>> -    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
>>>> -    *read_only = !writable || mr->readonly;
>>>> +    if (vaddr) {
>>>> +        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
>>>> +    }
>>>> +
>>>> +    if (ram_addr) {
>>>> +        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
>>>> +    }
>>>> +
>>>> +    if (read_only) {
>>>> +        *read_only = !writable || mr->readonly;
>>>> +    }
>>>>    
>>>>        return true;
>>>>    }
>>>> @@ -450,7 +459,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>>>>        VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>        VFIOContainer *container = giommu->container;
>>>>        hwaddr iova = iotlb->iova + giommu->iommu_offset;
>>>> -    bool read_only;
>>>>        void *vaddr;
>>>>        int ret;
>>>>    
>>>> @@ -466,7 +474,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>>>>        rcu_read_lock();
>>>>    
>>>>        if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
>>>> -        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
>>>> +        ram_addr_t ram_addr;
>>>> +        bool read_only;
>>>> +
>>>> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, &ram_addr, &read_only)) {
>>>>                goto out;
>>>>            }
>>>>            /*
>>>> @@ -484,8 +495,28 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>>>>                             "0x%"HWADDR_PRIx", %p) = %d (%m)",
>>>>                             container, iova,
>>>>                             iotlb->addr_mask + 1, vaddr, ret);
>>>> +        } else {
>>>> +            VFIOIovaRange *iova_range;
>>>> +
>>>> +            iova_range = g_malloc0(sizeof(*iova_range));
>>>> +            iova_range->iova = iova;
>>>> +            iova_range->size = iotlb->addr_mask + 1;
>>>> +            iova_range->ram_addr = ram_addr;
>>>> +
>>>> +            QLIST_INSERT_HEAD(&giommu->iova_list, iova_range, next);
>>>>            }
>>>>        } else {
>>>> +        VFIOIovaRange *iova_range, *tmp;
>>>> +
>>>> +        QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
>>>> +            if (iova_range->iova >= iova &&
>>>> +                iova_range->iova + iova_range->size <= iova +
>>>> +                                                       iotlb->addr_mask + 1) {
>>>> +                QLIST_REMOVE(iova_range, next);
>>>> +                g_free(iova_range);
>>>> +            }
>>>> +        }
>>>> +
>>>
>>>
>>> This is some pretty serious overhead... can't we trigger a replay when
>>> migration is enabled to build this information then?
>>
>> Are you suggesting to call memory_region_iommu_replay() before
>> vfio_sync_dirty_bitmap(), which would call vfio_iommu_map_notify() where
>> iova list of mapping is maintained? Then in the notifier check if
>> migration_is_running() and container->dirty_pages_supported == true,
>> then only create iova mapping tree? In this case how would we know that
>> this is triggered by
>> vfio_sync_dirty_bitmap()
>>    -> memory_region_iommu_replay()
>> and we don't have to call vfio_dma_map()?
> 
> memory_region_iommu_replay() calls a notifier of our choice, so we
> could create a notifier specifically for creating this tree when dirty
> logging is enabled.  Thanks,
> 

This would also mean changes in intel_iommu.c so that it walks through
the iova_tree and calls the notifier for each entry. What about other
platforms? Would we have to handle such cases for AMD, ARM, PPC, etc.?
I don't see a replay callback for AMD; that would result in a walk at
the minimum IOMMU-supported page size granularity, which is similar to
what I tried to implement 2-3 versions back.
Does that mean such a change would improve performance for the Intel
IOMMU but worsen it for AMD/PPC?

I'm changing the list to a tree as a first level of improvement in this
patch.

Can we do the change you suggested above later, as the next level of
improvement?

Thanks,
Kirti
Alex Williamson Oct. 19, 2020, 8:07 p.m. UTC | #5
On Tue, 20 Oct 2020 00:45:28 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 10/19/2020 10:54 PM, Alex Williamson wrote:
> > On Mon, 19 Oct 2020 11:31:03 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 9/26/2020 3:53 AM, Alex Williamson wrote:  
> >>> On Wed, 23 Sep 2020 04:54:15 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>      
> >>>> Create mapped iova list when vIOMMU is enabled. For each mapped iova
> >>>> save translated address. Add node to list on MAP and remove node from
> >>>> list on UNMAP.
> >>>> This list is used to track dirty pages during migration.
> >>>>
> >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>> ---
> >>>>    hw/vfio/common.c              | 58 ++++++++++++++++++++++++++++++++++++++-----
> >>>>    include/hw/vfio/vfio-common.h |  8 ++++++
> >>>>    2 files changed, 60 insertions(+), 6 deletions(-)
> >>>>
> >>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >>>> index d4959c036dd1..dc56cded2d95 100644
> >>>> --- a/hw/vfio/common.c
> >>>> +++ b/hw/vfio/common.c
> >>>> @@ -407,8 +407,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> >>>>    }
> >>>>    
> >>>>    /* Called with rcu_read_lock held.  */
> >>>> -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
> >>>> -                           bool *read_only)
> >>>> +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> >>>> +                               ram_addr_t *ram_addr, bool *read_only)
> >>>>    {
> >>>>        MemoryRegion *mr;
> >>>>        hwaddr xlat;
> >>>> @@ -439,8 +439,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
> >>>>            return false;
> >>>>        }
> >>>>    
> >>>> -    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> >>>> -    *read_only = !writable || mr->readonly;
> >>>> +    if (vaddr) {
> >>>> +        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> >>>> +    }
> >>>> +
> >>>> +    if (ram_addr) {
> >>>> +        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
> >>>> +    }
> >>>> +
> >>>> +    if (read_only) {
> >>>> +        *read_only = !writable || mr->readonly;
> >>>> +    }
> >>>>    
> >>>>        return true;
> >>>>    }
> >>>> @@ -450,7 +459,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> >>>>        VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
> >>>>        VFIOContainer *container = giommu->container;
> >>>>        hwaddr iova = iotlb->iova + giommu->iommu_offset;
> >>>> -    bool read_only;
> >>>>        void *vaddr;
> >>>>        int ret;
> >>>>    
> >>>> @@ -466,7 +474,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> >>>>        rcu_read_lock();
> >>>>    
> >>>>        if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> >>>> -        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
> >>>> +        ram_addr_t ram_addr;
> >>>> +        bool read_only;
> >>>> +
> >>>> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, &ram_addr, &read_only)) {
> >>>>                goto out;
> >>>>            }
> >>>>            /*
> >>>> @@ -484,8 +495,28 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> >>>>                             "0x%"HWADDR_PRIx", %p) = %d (%m)",
> >>>>                             container, iova,
> >>>>                             iotlb->addr_mask + 1, vaddr, ret);
> >>>> +        } else {
> >>>> +            VFIOIovaRange *iova_range;
> >>>> +
> >>>> +            iova_range = g_malloc0(sizeof(*iova_range));
> >>>> +            iova_range->iova = iova;
> >>>> +            iova_range->size = iotlb->addr_mask + 1;
> >>>> +            iova_range->ram_addr = ram_addr;
> >>>> +
> >>>> +            QLIST_INSERT_HEAD(&giommu->iova_list, iova_range, next);
> >>>>            }
> >>>>        } else {
> >>>> +        VFIOIovaRange *iova_range, *tmp;
> >>>> +
> >>>> +        QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
> >>>> +            if (iova_range->iova >= iova &&
> >>>> +                iova_range->iova + iova_range->size <= iova +
> >>>> +                                                       iotlb->addr_mask + 1) {
> >>>> +                QLIST_REMOVE(iova_range, next);
> >>>> +                g_free(iova_range);
> >>>> +            }
> >>>> +        }
> >>>> +  
> >>>
> >>>
> >>> This is some pretty serious overhead... can't we trigger a replay when
> >>> migration is enabled to build this information then?  
> >>
> >> Are you suggesting to call memory_region_iommu_replay() before
> >> vfio_sync_dirty_bitmap(), which would call vfio_iommu_map_notify() where
> >> iova list of mapping is maintained? Then in the notifier check if
> >> migration_is_running() and container->dirty_pages_supported == true,
> >> then only create iova mapping tree? In this case how would we know that
> >> this is triggered by
> >> vfio_sync_dirty_bitmap()  
> >>    -> memory_region_iommu_replay()  
> >> and we don't have to call vfio_dma_map()?  
> > 
> > memory_region_iommu_replay() calls a notifier of our choice, so we
> > could create a notifier specifically for creating this tree when dirty
> > logging is enabled.  Thanks,
> >   
> 
> This would also mean changes in intel_iommu.c such that it would walk 
> through the iova_tree and call notifier for each entry in iova_tree.

I think we already have that in vtd_iommu_replay(); an
IOMMUMemoryRegionClass.replay callback is rather a requirement of any
vIOMMU intending to support vfio, AIUI.
 
> What about other platforms? We will have to handle such cases for
> AMD, ARM, PPC etc...?

There's already a requirement for a working replay callback to work in
any reasonable way with vfio, this is just an additional use case of a
callback we already need and use.

> I don't see replay callback for AMD, that would result in minimum
> IOMMU supported page size granularity walk - which is similar to that
> I tried to implement 2-3 versions back.

Patch 1/3:
https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg00545.html
Patch 5/10:
https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg02196.html

> Does that mean doing such change would improve performance for Intel 
> IOMMU but worsen for AMD/PPC?

We're not adding a new requirement, we already call replay, PPC doesn't
use type1.  What exactly regresses if we introduce another replay user?

> I'm changing list to tree as first level of improvement in this patch.
> 
> Can we do the change you suggested above later as next level of
> improvement?

AIUI above, we're allocating an object and adding it to a list (soon to
be tree) for every vIOMMU mapping, on the off chance that migration
might be used, regardless of devices even supporting migration.  I can
only see that as a runtime performance and size regression.  Thanks,

Alex
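
For completeness, a sketch of how such a notifier might be driven only
when dirty logging starts, so nothing extra is allocated on the hot MAP
path in the common, non-migrating case. Again illustrative only: the
dirty_n field, vfio_iommu_map_dirty_notify() from the sketch above, and
the exact hook point are assumptions.

/*
 * Illustrative wiring only: rebuild the iova tree on demand when dirty
 * page tracking is enabled, e.g. from the listener's log_start hook or
 * vfio_set_dirty_page_tracking(), and free it again when tracking stops.
 */
static void vfio_giommu_build_iova_tree(VFIOGuestIOMMU *giommu)
{
    /* Reuse the range of the already-registered mapping notifier. */
    iommu_notifier_init(&giommu->dirty_n, vfio_iommu_map_dirty_notify,
                        IOMMU_NOTIFIER_MAP,
                        giommu->n.start, giommu->n.end,
                        giommu->n.iommu_idx);

    /*
     * memory_region_iommu_replay() walks the current mappings and calls
     * the notifier passed in; it does not need to remain registered for
     * this one-shot rebuild.
     */
    memory_region_iommu_replay(giommu->iommu, &giommu->dirty_n);
}
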

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index d4959c036dd1..dc56cded2d95 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -407,8 +407,8 @@  static bool vfio_listener_skipped_section(MemoryRegionSection *section)
 }
 
 /* Called with rcu_read_lock held.  */
-static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
-                           bool *read_only)
+static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
+                               ram_addr_t *ram_addr, bool *read_only)
 {
     MemoryRegion *mr;
     hwaddr xlat;
@@ -439,8 +439,17 @@  static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
         return false;
     }
 
-    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
-    *read_only = !writable || mr->readonly;
+    if (vaddr) {
+        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
+    }
+
+    if (ram_addr) {
+        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
+    }
+
+    if (read_only) {
+        *read_only = !writable || mr->readonly;
+    }
 
     return true;
 }
@@ -450,7 +459,6 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
     VFIOContainer *container = giommu->container;
     hwaddr iova = iotlb->iova + giommu->iommu_offset;
-    bool read_only;
     void *vaddr;
     int ret;
 
@@ -466,7 +474,10 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     rcu_read_lock();
 
     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
-        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
+        ram_addr_t ram_addr;
+        bool read_only;
+
+        if (!vfio_get_xlat_addr(iotlb, &vaddr, &ram_addr, &read_only)) {
             goto out;
         }
         /*
@@ -484,8 +495,28 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
                          "0x%"HWADDR_PRIx", %p) = %d (%m)",
                          container, iova,
                          iotlb->addr_mask + 1, vaddr, ret);
+        } else {
+            VFIOIovaRange *iova_range;
+
+            iova_range = g_malloc0(sizeof(*iova_range));
+            iova_range->iova = iova;
+            iova_range->size = iotlb->addr_mask + 1;
+            iova_range->ram_addr = ram_addr;
+
+            QLIST_INSERT_HEAD(&giommu->iova_list, iova_range, next);
         }
     } else {
+        VFIOIovaRange *iova_range, *tmp;
+
+        QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
+            if (iova_range->iova >= iova &&
+                iova_range->iova + iova_range->size <= iova +
+                                                       iotlb->addr_mask + 1) {
+                QLIST_REMOVE(iova_range, next);
+                g_free(iova_range);
+            }
+        }
+
         ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
@@ -642,6 +673,7 @@  static void vfio_listener_region_add(MemoryListener *listener,
             g_free(giommu);
             goto fail;
         }
+        QLIST_INIT(&giommu->iova_list);
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
         memory_region_iommu_replay(giommu->iommu, &giommu->n);
 
@@ -740,6 +772,13 @@  static void vfio_listener_region_del(MemoryListener *listener,
         QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
             if (MEMORY_REGION(giommu->iommu) == section->mr &&
                 giommu->n.start == section->offset_within_region) {
+                VFIOIovaRange *iova_range, *tmp;
+
+                QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
+                    QLIST_REMOVE(iova_range, next);
+                    g_free(iova_range);
+                }
+
                 memory_region_unregister_iommu_notifier(section->mr,
                                                         &giommu->n);
                 QLIST_REMOVE(giommu, giommu_next);
@@ -1541,6 +1580,13 @@  static void vfio_disconnect_container(VFIOGroup *group)
         QLIST_REMOVE(container, next);
 
         QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
+            VFIOIovaRange *iova_range, *itmp;
+
+            QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, itmp) {
+                QLIST_REMOVE(iova_range, next);
+                g_free(iova_range);
+            }
+
             memory_region_unregister_iommu_notifier(
                     MEMORY_REGION(giommu->iommu), &giommu->n);
             QLIST_REMOVE(giommu, giommu_next);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 0a1651eda2d0..aa7524fe2cc5 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -89,11 +89,19 @@  typedef struct VFIOContainer {
     QLIST_ENTRY(VFIOContainer) next;
 } VFIOContainer;
 
+typedef struct VFIOIovaRange {
+    hwaddr iova;
+    size_t size;
+    ram_addr_t ram_addr;
+    QLIST_ENTRY(VFIOIovaRange) next;
+} VFIOIovaRange;
+
 typedef struct VFIOGuestIOMMU {
     VFIOContainer *container;
     IOMMUMemoryRegion *iommu;
     hwaddr iommu_offset;
     IOMMUNotifier n;
+    QLIST_HEAD(, VFIOIovaRange) iova_list;
     QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
 } VFIOGuestIOMMU;