diff mbox series

[QEMU,v25,14/17] vfio: Add vfio_listener_log_sync to mark dirty pages

Message ID 1592684486-18511-15-git-send-email-kwankhede@nvidia.com
State New
Headers show
Series Add migration support for VFIO devices | expand

Commit Message

Kirti Wankhede June 20, 2020, 8:21 p.m. UTC
vfio_listener_log_sync gets list of dirty pages from container using
VFIO_IOMMU_GET_DIRTY_BITMAP ioctl and mark those pages dirty when all
devices are stopped and saving state.
Return early for the RAM block section of mapped MMIO region.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 hw/vfio/common.c     | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events |   1 +
 2 files changed, 131 insertions(+)

Comments

Alex Williamson June 24, 2020, 6:55 p.m. UTC | #1
On Sun, 21 Jun 2020 01:51:23 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> vfio_listener_log_sync gets list of dirty pages from container using
> VFIO_IOMMU_GET_DIRTY_BITMAP ioctl and mark those pages dirty when all
> devices are stopped and saving state.
> Return early for the RAM block section of mapped MMIO region.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  hw/vfio/common.c     | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events |   1 +
>  2 files changed, 131 insertions(+)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 6921a78e9ba5..0518cf228ed5 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -29,6 +29,7 @@
>  #include "hw/vfio/vfio.h"
>  #include "exec/address-spaces.h"
>  #include "exec/memory.h"
> +#include "exec/ram_addr.h"
>  #include "hw/hw.h"
>  #include "qemu/error-report.h"
>  #include "qemu/main-loop.h"
> @@ -38,6 +39,7 @@
>  #include "sysemu/reset.h"
>  #include "trace.h"
>  #include "qapi/error.h"
> +#include "migration/migration.h"
>  
>  VFIOGroupList vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
> @@ -288,6 +290,28 @@ const MemoryRegionOps vfio_region_ops = {
>  };
>  
>  /*
> + * Device state interfaces
> + */
> +
> +static bool vfio_devices_are_stopped_and_saving(void)
> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev;
> +
> +    QLIST_FOREACH(group, &vfio_group_list, next) {

Should this be passed the container in order to iterate
container->group_list?

> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
> +                !(vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
> +                continue;
> +            } else {
> +                return false;
> +            }
> +        }
> +    }
> +    return true;
> +}
> +
> +/*
>   * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>   */
>  static int vfio_dma_unmap(VFIOContainer *container,
> @@ -852,9 +876,115 @@ static void vfio_listener_region_del(MemoryListener *listener,
>      }
>  }
>  
> +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
> +                                 uint64_t size, ram_addr_t ram_addr)
> +{
> +    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
> +    struct vfio_iommu_type1_dirty_bitmap_get *range;
> +    uint64_t pages;
> +    int ret;
> +
> +    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
> +
> +    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
> +    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> +    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
> +    range->iova = iova;
> +    range->size = size;
> +
> +    /*
> +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
> +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
> +     * TARGET_PAGE_SIZE.
> +     */
> +    range->bitmap.pgsize = TARGET_PAGE_SIZE;
> +
> +    pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
> +    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
> +                                         BITS_PER_BYTE;
> +    range->bitmap.data = g_try_malloc0(range->bitmap.size);
> +    if (!range->bitmap.data) {
> +        ret = -ENOMEM;
> +        goto err_out;
> +    }
> +
> +    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
> +    if (ret) {
> +        error_report("Failed to get dirty bitmap for iova: 0x%llx "
> +                "size: 0x%llx err: %d",
> +                range->iova, range->size, errno);
> +        goto err_out;
> +    }
> +
> +    cpu_physical_memory_set_dirty_lebitmap((uint64_t *)range->bitmap.data,
> +                                            ram_addr, pages);
> +
> +    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
> +                                range->bitmap.size, ram_addr);
> +err_out:
> +    g_free(range->bitmap.data);
> +    g_free(dbitmap);
> +
> +    return ret;
> +}
> +
> +static int vfio_sync_dirty_bitmap(MemoryListener *listener,
> +                                 MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
> +    VFIOGuestIOMMU *giommu = NULL;
> +    ram_addr_t ram_addr;
> +    uint64_t iova, size;
> +    int ret = 0;
> +
> +    if (memory_region_is_iommu(section->mr)) {
> +
> +        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
> +            if (MEMORY_REGION(giommu->iommu) == section->mr &&
> +                giommu->n.start == section->offset_within_region) {
> +                VFIOIovaRange *iova_range;
> +
> +                QLIST_FOREACH(iova_range, &giommu->iova_list, next) {
> +                    ret = vfio_get_dirty_bitmap(container, iova_range->iova,
> +                                        iova_range->size, iova_range->ram_addr);
> +                    if (ret) {
> +                        break;
> +                    }
> +                }
> +                break;
> +            }
> +        }
> +
> +    } else {
> +        iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +        size = int128_get64(section->size);
> +
> +        ram_addr = memory_region_get_ram_addr(section->mr) +
> +                   section->offset_within_region + iova -
> +                   TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +
> +        ret = vfio_get_dirty_bitmap(container, iova, size, ram_addr);
> +    }
> +
> +    return ret;
> +}
> +
> +static void vfio_listerner_log_sync(MemoryListener *listener,
> +        MemoryRegionSection *section)
> +{
> +    if (vfio_listener_skipped_section(section)) {
> +        return;
> +    }
> +
> +    if (vfio_devices_are_stopped_and_saving()) {
> +        vfio_sync_dirty_bitmap(listener, section);
> +    }


How do we decide that this is the best policy for all devices?  For
example if a device does not support page pinning or some future means
of marking dirtied pages, this is clearly the right choice, but when
these are supported, aren't we deferring all dirty logging info until
the final stage?  Thanks,

Alex

> +}
> +
>  static const MemoryListener vfio_memory_listener = {
>      .region_add = vfio_listener_region_add,
>      .region_del = vfio_listener_region_del,
> +    .log_sync = vfio_listerner_log_sync,
>  };
>  
>  static void vfio_listener_release(VFIOContainer *container)
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 4a4bd3ba9a2a..c61ae4f3ead8 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -160,3 +160,4 @@ vfio_save_complete_precopy(const char *name) " (%s)"
>  vfio_load_device_config_state(const char *name) " (%s)"
>  vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
>  vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
> +vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
Kirti Wankhede June 25, 2020, 2:43 p.m. UTC | #2
On 6/25/2020 12:25 AM, Alex Williamson wrote:
> On Sun, 21 Jun 2020 01:51:23 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> vfio_listener_log_sync gets list of dirty pages from container using
>> VFIO_IOMMU_GET_DIRTY_BITMAP ioctl and mark those pages dirty when all
>> devices are stopped and saving state.
>> Return early for the RAM block section of mapped MMIO region.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   hw/vfio/common.c     | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   hw/vfio/trace-events |   1 +
>>   2 files changed, 131 insertions(+)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 6921a78e9ba5..0518cf228ed5 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -29,6 +29,7 @@
>>   #include "hw/vfio/vfio.h"
>>   #include "exec/address-spaces.h"
>>   #include "exec/memory.h"
>> +#include "exec/ram_addr.h"
>>   #include "hw/hw.h"
>>   #include "qemu/error-report.h"
>>   #include "qemu/main-loop.h"
>> @@ -38,6 +39,7 @@
>>   #include "sysemu/reset.h"
>>   #include "trace.h"
>>   #include "qapi/error.h"
>> +#include "migration/migration.h"
>>   
>>   VFIOGroupList vfio_group_list =
>>       QLIST_HEAD_INITIALIZER(vfio_group_list);
>> @@ -288,6 +290,28 @@ const MemoryRegionOps vfio_region_ops = {
>>   };
>>   
>>   /*
>> + * Device state interfaces
>> + */
>> +
>> +static bool vfio_devices_are_stopped_and_saving(void)
>> +{
>> +    VFIOGroup *group;
>> +    VFIODevice *vbasedev;
>> +
>> +    QLIST_FOREACH(group, &vfio_group_list, next) {
> 
> Should this be passed the container in order to iterate
> container->group_list?
> 
>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>> +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
>> +                !(vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
>> +                continue;
>> +            } else {
>> +                return false;
>> +            }
>> +        }
>> +    }
>> +    return true;
>> +}
>> +
>> +/*
>>    * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>>    */
>>   static int vfio_dma_unmap(VFIOContainer *container,
>> @@ -852,9 +876,115 @@ static void vfio_listener_region_del(MemoryListener *listener,
>>       }
>>   }
>>   
>> +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
>> +                                 uint64_t size, ram_addr_t ram_addr)
>> +{
>> +    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
>> +    struct vfio_iommu_type1_dirty_bitmap_get *range;
>> +    uint64_t pages;
>> +    int ret;
>> +
>> +    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
>> +
>> +    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
>> +    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>> +    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
>> +    range->iova = iova;
>> +    range->size = size;
>> +
>> +    /*
>> +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
>> +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
>> +     * TARGET_PAGE_SIZE.
>> +     */
>> +    range->bitmap.pgsize = TARGET_PAGE_SIZE;
>> +
>> +    pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
>> +    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
>> +                                         BITS_PER_BYTE;
>> +    range->bitmap.data = g_try_malloc0(range->bitmap.size);
>> +    if (!range->bitmap.data) {
>> +        ret = -ENOMEM;
>> +        goto err_out;
>> +    }
>> +
>> +    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
>> +    if (ret) {
>> +        error_report("Failed to get dirty bitmap for iova: 0x%llx "
>> +                "size: 0x%llx err: %d",
>> +                range->iova, range->size, errno);
>> +        goto err_out;
>> +    }
>> +
>> +    cpu_physical_memory_set_dirty_lebitmap((uint64_t *)range->bitmap.data,
>> +                                            ram_addr, pages);
>> +
>> +    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
>> +                                range->bitmap.size, ram_addr);
>> +err_out:
>> +    g_free(range->bitmap.data);
>> +    g_free(dbitmap);
>> +
>> +    return ret;
>> +}
>> +
>> +static int vfio_sync_dirty_bitmap(MemoryListener *listener,
>> +                                 MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>> +    VFIOGuestIOMMU *giommu = NULL;
>> +    ram_addr_t ram_addr;
>> +    uint64_t iova, size;
>> +    int ret = 0;
>> +
>> +    if (memory_region_is_iommu(section->mr)) {
>> +
>> +        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
>> +            if (MEMORY_REGION(giommu->iommu) == section->mr &&
>> +                giommu->n.start == section->offset_within_region) {
>> +                VFIOIovaRange *iova_range;
>> +
>> +                QLIST_FOREACH(iova_range, &giommu->iova_list, next) {
>> +                    ret = vfio_get_dirty_bitmap(container, iova_range->iova,
>> +                                        iova_range->size, iova_range->ram_addr);
>> +                    if (ret) {
>> +                        break;
>> +                    }
>> +                }
>> +                break;
>> +            }
>> +        }
>> +
>> +    } else {
>> +        iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> +        size = int128_get64(section->size);
>> +
>> +        ram_addr = memory_region_get_ram_addr(section->mr) +
>> +                   section->offset_within_region + iova -
>> +                   TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> +
>> +        ret = vfio_get_dirty_bitmap(container, iova, size, ram_addr);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static void vfio_listerner_log_sync(MemoryListener *listener,
>> +        MemoryRegionSection *section)
>> +{
>> +    if (vfio_listener_skipped_section(section)) {
>> +        return;
>> +    }
>> +
>> +    if (vfio_devices_are_stopped_and_saving()) {
>> +        vfio_sync_dirty_bitmap(listener, section);
>> +    }
> 
> 
> How do we decide that this is the best policy for all devices?  For
> example if a device does not support page pinning or some future means
> of marking dirtied pages, this is clearly the right choice, but when
> these are supported, aren't we deferring all dirty logging info until
> the final stage?  Thanks,
> 

Yes, for now we are deferring all dirty logging to stop-and-copy phase. 
In future, whenever hardware support for dirty page tracking get added, 
we will have flag added in migration capability in VFIO_IOMMU_GET_INFO 
capability list. Based on that we can decide to get dirty pages in 
earlier phase of migration.

Thanks,
Kirti
Alex Williamson June 25, 2020, 5:57 p.m. UTC | #3
On Thu, 25 Jun 2020 20:13:39 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 6/25/2020 12:25 AM, Alex Williamson wrote:
> > On Sun, 21 Jun 2020 01:51:23 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> vfio_listener_log_sync gets list of dirty pages from container using
> >> VFIO_IOMMU_GET_DIRTY_BITMAP ioctl and mark those pages dirty when all
> >> devices are stopped and saving state.
> >> Return early for the RAM block section of mapped MMIO region.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   hw/vfio/common.c     | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >>   hw/vfio/trace-events |   1 +
> >>   2 files changed, 131 insertions(+)
> >>
> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >> index 6921a78e9ba5..0518cf228ed5 100644
> >> --- a/hw/vfio/common.c
> >> +++ b/hw/vfio/common.c
> >> @@ -29,6 +29,7 @@
> >>   #include "hw/vfio/vfio.h"
> >>   #include "exec/address-spaces.h"
> >>   #include "exec/memory.h"
> >> +#include "exec/ram_addr.h"
> >>   #include "hw/hw.h"
> >>   #include "qemu/error-report.h"
> >>   #include "qemu/main-loop.h"
> >> @@ -38,6 +39,7 @@
> >>   #include "sysemu/reset.h"
> >>   #include "trace.h"
> >>   #include "qapi/error.h"
> >> +#include "migration/migration.h"
> >>   
> >>   VFIOGroupList vfio_group_list =
> >>       QLIST_HEAD_INITIALIZER(vfio_group_list);
> >> @@ -288,6 +290,28 @@ const MemoryRegionOps vfio_region_ops = {
> >>   };
> >>   
> >>   /*
> >> + * Device state interfaces
> >> + */
> >> +
> >> +static bool vfio_devices_are_stopped_and_saving(void)
> >> +{
> >> +    VFIOGroup *group;
> >> +    VFIODevice *vbasedev;
> >> +
> >> +    QLIST_FOREACH(group, &vfio_group_list, next) {  
> > 
> > Should this be passed the container in order to iterate
> > container->group_list?
> >   
> >> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> >> +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
> >> +                !(vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
> >> +                continue;
> >> +            } else {
> >> +                return false;
> >> +            }
> >> +        }
> >> +    }
> >> +    return true;
> >> +}
> >> +
> >> +/*
> >>    * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> >>    */
> >>   static int vfio_dma_unmap(VFIOContainer *container,
> >> @@ -852,9 +876,115 @@ static void vfio_listener_region_del(MemoryListener *listener,
> >>       }
> >>   }
> >>   
> >> +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
> >> +                                 uint64_t size, ram_addr_t ram_addr)
> >> +{
> >> +    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
> >> +    struct vfio_iommu_type1_dirty_bitmap_get *range;
> >> +    uint64_t pages;
> >> +    int ret;
> >> +
> >> +    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
> >> +
> >> +    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
> >> +    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >> +    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
> >> +    range->iova = iova;
> >> +    range->size = size;
> >> +
> >> +    /*
> >> +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
> >> +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
> >> +     * TARGET_PAGE_SIZE.
> >> +     */
> >> +    range->bitmap.pgsize = TARGET_PAGE_SIZE;
> >> +
> >> +    pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
> >> +    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
> >> +                                         BITS_PER_BYTE;
> >> +    range->bitmap.data = g_try_malloc0(range->bitmap.size);
> >> +    if (!range->bitmap.data) {
> >> +        ret = -ENOMEM;
> >> +        goto err_out;
> >> +    }
> >> +
> >> +    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
> >> +    if (ret) {
> >> +        error_report("Failed to get dirty bitmap for iova: 0x%llx "
> >> +                "size: 0x%llx err: %d",
> >> +                range->iova, range->size, errno);
> >> +        goto err_out;
> >> +    }
> >> +
> >> +    cpu_physical_memory_set_dirty_lebitmap((uint64_t *)range->bitmap.data,
> >> +                                            ram_addr, pages);
> >> +
> >> +    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
> >> +                                range->bitmap.size, ram_addr);
> >> +err_out:
> >> +    g_free(range->bitmap.data);
> >> +    g_free(dbitmap);
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static int vfio_sync_dirty_bitmap(MemoryListener *listener,
> >> +                                 MemoryRegionSection *section)
> >> +{
> >> +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
> >> +    VFIOGuestIOMMU *giommu = NULL;
> >> +    ram_addr_t ram_addr;
> >> +    uint64_t iova, size;
> >> +    int ret = 0;
> >> +
> >> +    if (memory_region_is_iommu(section->mr)) {
> >> +
> >> +        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
> >> +            if (MEMORY_REGION(giommu->iommu) == section->mr &&
> >> +                giommu->n.start == section->offset_within_region) {
> >> +                VFIOIovaRange *iova_range;
> >> +
> >> +                QLIST_FOREACH(iova_range, &giommu->iova_list, next) {
> >> +                    ret = vfio_get_dirty_bitmap(container, iova_range->iova,
> >> +                                        iova_range->size, iova_range->ram_addr);
> >> +                    if (ret) {
> >> +                        break;
> >> +                    }
> >> +                }
> >> +                break;
> >> +            }
> >> +        }
> >> +
> >> +    } else {
> >> +        iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> >> +        size = int128_get64(section->size);
> >> +
> >> +        ram_addr = memory_region_get_ram_addr(section->mr) +
> >> +                   section->offset_within_region + iova -
> >> +                   TARGET_PAGE_ALIGN(section->offset_within_address_space);
> >> +
> >> +        ret = vfio_get_dirty_bitmap(container, iova, size, ram_addr);
> >> +    }
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static void vfio_listerner_log_sync(MemoryListener *listener,
> >> +        MemoryRegionSection *section)
> >> +{
> >> +    if (vfio_listener_skipped_section(section)) {
> >> +        return;
> >> +    }
> >> +
> >> +    if (vfio_devices_are_stopped_and_saving()) {
> >> +        vfio_sync_dirty_bitmap(listener, section);
> >> +    }  
> > 
> > 
> > How do we decide that this is the best policy for all devices?  For
> > example if a device does not support page pinning or some future means
> > of marking dirtied pages, this is clearly the right choice, but when
> > these are supported, aren't we deferring all dirty logging info until
> > the final stage?  Thanks,
> >   
> 
> Yes, for now we are deferring all dirty logging to stop-and-copy phase. 
> In future, whenever hardware support for dirty page tracking get added, 
> we will have flag added in migration capability in VFIO_IOMMU_GET_INFO 
> capability list. Based on that we can decide to get dirty pages in 
> earlier phase of migration.

So the flag that you're expecting to add would indicate that the IOMMU
is reporting actual page dirtying, not just assuming pinned pages are
dirty, as we have now.  It's too bad we can't collect the
previously-but-not-currently-pinned pages during the iterative phase.
Thanks,

Alex
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6921a78e9ba5..0518cf228ed5 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -29,6 +29,7 @@ 
 #include "hw/vfio/vfio.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
+#include "exec/ram_addr.h"
 #include "hw/hw.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
@@ -38,6 +39,7 @@ 
 #include "sysemu/reset.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "migration/migration.h"
 
 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -288,6 +290,28 @@  const MemoryRegionOps vfio_region_ops = {
 };
 
 /*
+ * Device state interfaces
+ */
+
+static bool vfio_devices_are_stopped_and_saving(void)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
+                !(vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
+                continue;
+            } else {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+/*
  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  */
 static int vfio_dma_unmap(VFIOContainer *container,
@@ -852,9 +876,115 @@  static void vfio_listener_region_del(MemoryListener *listener,
     }
 }
 
+static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+                                 uint64_t size, ram_addr_t ram_addr)
+{
+    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
+    struct vfio_iommu_type1_dirty_bitmap_get *range;
+    uint64_t pages;
+    int ret;
+
+    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
+
+    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
+    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
+    range->iova = iova;
+    range->size = size;
+
+    /*
+     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
+     * TARGET_PAGE_SIZE.
+     */
+    range->bitmap.pgsize = TARGET_PAGE_SIZE;
+
+    pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
+    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
+                                         BITS_PER_BYTE;
+    range->bitmap.data = g_try_malloc0(range->bitmap.size);
+    if (!range->bitmap.data) {
+        ret = -ENOMEM;
+        goto err_out;
+    }
+
+    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
+    if (ret) {
+        error_report("Failed to get dirty bitmap for iova: 0x%llx "
+                "size: 0x%llx err: %d",
+                range->iova, range->size, errno);
+        goto err_out;
+    }
+
+    cpu_physical_memory_set_dirty_lebitmap((uint64_t *)range->bitmap.data,
+                                            ram_addr, pages);
+
+    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
+                                range->bitmap.size, ram_addr);
+err_out:
+    g_free(range->bitmap.data);
+    g_free(dbitmap);
+
+    return ret;
+}
+
+static int vfio_sync_dirty_bitmap(MemoryListener *listener,
+                                 MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+    VFIOGuestIOMMU *giommu = NULL;
+    ram_addr_t ram_addr;
+    uint64_t iova, size;
+    int ret = 0;
+
+    if (memory_region_is_iommu(section->mr)) {
+
+        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
+            if (MEMORY_REGION(giommu->iommu) == section->mr &&
+                giommu->n.start == section->offset_within_region) {
+                VFIOIovaRange *iova_range;
+
+                QLIST_FOREACH(iova_range, &giommu->iova_list, next) {
+                    ret = vfio_get_dirty_bitmap(container, iova_range->iova,
+                                        iova_range->size, iova_range->ram_addr);
+                    if (ret) {
+                        break;
+                    }
+                }
+                break;
+            }
+        }
+
+    } else {
+        iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+        size = int128_get64(section->size);
+
+        ram_addr = memory_region_get_ram_addr(section->mr) +
+                   section->offset_within_region + iova -
+                   TARGET_PAGE_ALIGN(section->offset_within_address_space);
+
+        ret = vfio_get_dirty_bitmap(container, iova, size, ram_addr);
+    }
+
+    return ret;
+}
+
+static void vfio_listerner_log_sync(MemoryListener *listener,
+        MemoryRegionSection *section)
+{
+    if (vfio_listener_skipped_section(section)) {
+        return;
+    }
+
+    if (vfio_devices_are_stopped_and_saving()) {
+        vfio_sync_dirty_bitmap(listener, section);
+    }
+}
+
 static const MemoryListener vfio_memory_listener = {
     .region_add = vfio_listener_region_add,
     .region_del = vfio_listener_region_del,
+    .log_sync = vfio_listerner_log_sync,
 };
 
 static void vfio_listener_release(VFIOContainer *container)
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 4a4bd3ba9a2a..c61ae4f3ead8 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -160,3 +160,4 @@  vfio_save_complete_precopy(const char *name) " (%s)"
 vfio_load_device_config_state(const char *name) " (%s)"
 vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
+vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64