diff mbox series

[5/6] vfio/migration: Block VFIO migration with postcopy migration

Message ID 20230828151842.11303-6-avihaih@nvidia.com
State New
Headers show
Series vfio/migration: Block VFIO migration with postcopy and background snapshot | expand

Commit Message

Avihai Horon Aug. 28, 2023, 3:18 p.m. UTC
VFIO migration is not compatible with postcopy migration. A VFIO device
in the destination can't handle page faults for pages that have not been
sent yet.

Doing such migration will cause the VM to crash in the destination:

qemu-system-x86_64: VFIO_MAP_DMA failed: Bad address
qemu-system-x86_64: vfio_dma_map(0x55a28c7659d0, 0xc0000, 0xb000, 0x7f1b11a00000) = -14 (Bad address)
qemu: hardware error: vfio: DMA mapping failed, unable to continue

To prevent this and to be explicit about supported features, block VFIO
migration with postcopy migration: Fail setting postcopy capability if a
VFIO device is present, and add a migration blocker if a VFIO device is
added when postcopy capability is on.

Reported-by: Yanghang Liu <yanghliu@redhat.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 include/hw/vfio/vfio-common.h |  2 ++
 migration/migration.h         |  2 ++
 hw/vfio/common.c              | 43 +++++++++++++++++++++++++++++++++++
 hw/vfio/migration.c           |  6 +++++
 migration/options.c           | 19 ++++++++++++++++
 migration/target.c            | 19 ++++++++++++++++
 6 files changed, 91 insertions(+)

Comments

Cédric Le Goater Aug. 29, 2023, 1:24 p.m. UTC | #1
On 8/28/23 17:18, Avihai Horon wrote:
> VFIO migration is not compatible with postcopy migration. A VFIO device
> in the destination can't handle page faults for pages that have not been
> sent yet.
> 
> Doing such migration will cause the VM to crash in the destination:
> 
> qemu-system-x86_64: VFIO_MAP_DMA failed: Bad address
> qemu-system-x86_64: vfio_dma_map(0x55a28c7659d0, 0xc0000, 0xb000, 0x7f1b11a00000) = -14 (Bad address)
> qemu: hardware error: vfio: DMA mapping failed, unable to continue
> 
> To prevent this and to be explicit about supported features, block VFIO
> migration with postcopy migration: Fail setting postcopy capability if a
> VFIO device is present, and add a migration blocker if a VFIO device is
> added when postcopy capability is on.
> 
> Reported-by: Yanghang Liu <yanghliu@redhat.com>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
>   include/hw/vfio/vfio-common.h |  2 ++
>   migration/migration.h         |  2 ++
>   hw/vfio/common.c              | 43 +++++++++++++++++++++++++++++++++++
>   hw/vfio/migration.c           |  6 +++++
>   migration/options.c           | 19 ++++++++++++++++
>   migration/target.c            | 19 ++++++++++++++++
>   6 files changed, 91 insertions(+)
> 
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index e9b8954595..c0b58f2bb7 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -227,6 +227,8 @@ extern VFIOGroupList vfio_group_list;
>   bool vfio_mig_active(void);
>   int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp);
>   void vfio_unblock_multiple_devices_migration(void);
> +int vfio_block_postcopy_migration(VFIODevice *vbasedev, Error **errp);
> +void vfio_unblock_postcopy_migration(void);
>   bool vfio_viommu_preset(VFIODevice *vbasedev);
>   int64_t vfio_mig_bytes_transferred(void);
>   void vfio_reset_bytes_transferred(void);
> diff --git a/migration/migration.h b/migration/migration.h
> index c5695de214..21a6423408 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -514,6 +514,8 @@ void migration_cancel(const Error *error);
>   
>   void migration_populate_vfio_info(MigrationInfo *info);
>   void migration_reset_vfio_bytes_transferred(void);
> +bool migration_vfio_mig_active(void);
> +void migration_vfio_unblock_postcopy_migration(void);
>   void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
>   
>   #endif
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 373f6e5932..7461194b2b 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -40,6 +40,7 @@
>   #include "trace.h"
>   #include "qapi/error.h"
>   #include "migration/migration.h"
> +#include "migration/options.h"
>   #include "migration/misc.h"
>   #include "migration/blocker.h"
>   #include "migration/qemu-file.h"
> @@ -343,6 +344,7 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
>                                    uint64_t size, ram_addr_t ram_addr);
>   
>   static Error *multiple_devices_migration_blocker;
> +static Error *postcopy_migration_blocker;
>   
>   static unsigned int vfio_migratable_devices_num(void)
>   {
> @@ -427,6 +429,47 @@ void vfio_unblock_multiple_devices_migration(void)
>       multiple_devices_migration_blocker = NULL;
>   }
>   
> +int vfio_block_postcopy_migration(VFIODevice *vbasedev, Error **errp)
> +{
> +    int ret;
> +
> +    if (!migrate_postcopy_ram()) {
> +        return 0;
> +    }
> +
> +    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
> +        error_setg(errp,
> +                   "VFIO migration is not compatible with postcopy migration");
> +        return -EINVAL;
> +    }
> +
> +    if (postcopy_migration_blocker) {
> +        return 0;
> +    }
> +
> +    error_setg(&postcopy_migration_blocker,
> +               "VFIO migration is not compatible with postcopy migration");
> +    ret = migrate_add_blocker(postcopy_migration_blocker, errp);
> +    if (ret < 0) {
> +        error_free(postcopy_migration_blocker);
> +        postcopy_migration_blocker = NULL;
> +    }
> +
> +    return ret;
> +}
> +
> +void vfio_unblock_postcopy_migration(void)
> +{
> +    if (!postcopy_migration_blocker ||
> +        (vfio_migratable_devices_num() && migrate_postcopy_ram())) {
> +        return;
> +    }
> +
> +    migrate_del_blocker(postcopy_migration_blocker);
> +    error_free(postcopy_migration_blocker);
> +    postcopy_migration_blocker = NULL;
> +}
> +
>   bool vfio_mig_active(void)
>   {
>       return vfio_migratable_devices_num();
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 71855468fe..76406e9ae9 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -856,6 +856,7 @@ static void vfio_migration_deinit(VFIODevice *vbasedev)
>       unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
>       vfio_migration_free(vbasedev);
>       vfio_unblock_multiple_devices_migration();
> +    vfio_unblock_postcopy_migration();
>   }
>   
>   static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
> @@ -939,6 +940,11 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
>           goto out_deinit;
>       }
>   
> +    ret = vfio_block_postcopy_migration(vbasedev, errp);
> +    if (ret) {
> +        goto out_deinit;
> +    }
> +
>       if (vfio_viommu_preset(vbasedev)) {
>           error_setg(&err, "%s: Migration is currently not supported "
>                      "with vIOMMU enabled", vbasedev->name);
> diff --git a/migration/options.c b/migration/options.c
> index 1d1e1321b0..e201053563 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>               error_setg(errp, "Postcopy is not yet compatible with multifd");
>               return false;
>           }
> +
> +        if (migration_vfio_mig_active()) {
> +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
> +            return false;
> +        }
>       }
>   
>       if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
> @@ -612,6 +617,16 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>       return true;
>   }
>   
> +/*
> + * Devices might have added migration blockers based on migration capabilities
> + * values when those devices were added. Remove such blockers according to new
> + * changes in migration capabilities.
> + */
> +static void migration_caps_remove_blockers(void)
> +{
> +    migration_vfio_unblock_postcopy_migration();
> +}
> +
>   bool migrate_cap_set(int cap, bool value, Error **errp)
>   {
>       MigrationState *s = migrate_get_current();
> @@ -629,6 +644,8 @@ bool migrate_cap_set(int cap, bool value, Error **errp)
>           return false;
>       }
>       s->capabilities[cap] = value;
> +    migration_caps_remove_blockers();
> +
>       return true;
>   }
>   
> @@ -678,6 +695,8 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
>       for (cap = params; cap; cap = cap->next) {
>           s->capabilities[cap->value->capability] = cap->value->state;
>       }
> +
> +    migration_caps_remove_blockers();
>   }
>   
>   /* parameters */
> diff --git a/migration/target.c b/migration/target.c
> index a6ffa9a5ce..690ecb4dd5 100644
> --- a/migration/target.c
> +++ b/migration/target.c
> @@ -27,6 +27,16 @@ void migration_reset_vfio_bytes_transferred(void)
>   {
>       vfio_reset_bytes_transferred();
>   }
> +
> +bool migration_vfio_mig_active(void)
> +{
> +    return vfio_mig_active();
> +}
> +
> +void migration_vfio_unblock_postcopy_migration(void)
> +{
> +    vfio_unblock_postcopy_migration();
> +}
>   #else
>   void migration_populate_vfio_info(MigrationInfo *info)
>   {
> @@ -35,4 +45,13 @@ void migration_populate_vfio_info(MigrationInfo *info)
>   void migration_reset_vfio_bytes_transferred(void)
>   {
>   }
> +
> +bool migration_vfio_mig_active(void)
> +{
> +    return false;
> +}
> +
> +void migration_vfio_unblock_postcopy_migration()


Missing 'void' above.

Thanks,

C.


> +{
> +}
>   #endif
Peter Xu Aug. 29, 2023, 2:53 p.m. UTC | #2
On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
> diff --git a/migration/options.c b/migration/options.c
> index 1d1e1321b0..e201053563 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>              error_setg(errp, "Postcopy is not yet compatible with multifd");
>              return false;
>          }
> +
> +        if (migration_vfio_mig_active()) {
> +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
> +            return false;
> +        }

Hmm.. this will add yet another vfio hard-coded line into migration/..

What will happen if the vfio device is hot plugged after enabling
postcopy-ram here?

Is it possible to do it in a generic way?

I was thinking the only unified place to do such check is when migration
starts, as long as we switch to SETUP all caps are locked and doesn't allow
any change until it finishes or fails.

So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
to fail the whole migration early?  For example, maybe we should have an
Error** passed in, then if it fails it calls migrate_set_error, so
reflected in query-migrate later too.

Thanks,

>      }
>  
>      if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
> @@ -612,6 +617,16 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>      return true;
>  }
Avihai Horon Aug. 29, 2023, 3:52 p.m. UTC | #3
On 29/08/2023 16:24, Cédric Le Goater wrote:
> External email: Use caution opening links or attachments
>
>
> On 8/28/23 17:18, Avihai Horon wrote:
>> VFIO migration is not compatible with postcopy migration. A VFIO device
>> in the destination can't handle page faults for pages that have not been
>> sent yet.
>>
>> Doing such migration will cause the VM to crash in the destination:
>>
>> qemu-system-x86_64: VFIO_MAP_DMA failed: Bad address
>> qemu-system-x86_64: vfio_dma_map(0x55a28c7659d0, 0xc0000, 0xb000, 
>> 0x7f1b11a00000) = -14 (Bad address)
>> qemu: hardware error: vfio: DMA mapping failed, unable to continue
>>
>> To prevent this and to be explicit about supported features, block VFIO
>> migration with postcopy migration: Fail setting postcopy capability if a
>> VFIO device is present, and add a migration blocker if a VFIO device is
>> added when postcopy capability is on.
>>
>> Reported-by: Yanghang Liu <yanghliu@redhat.com>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> ---
>>   include/hw/vfio/vfio-common.h |  2 ++
>>   migration/migration.h         |  2 ++
>>   hw/vfio/common.c              | 43 +++++++++++++++++++++++++++++++++++
>>   hw/vfio/migration.c           |  6 +++++
>>   migration/options.c           | 19 ++++++++++++++++
>>   migration/target.c            | 19 ++++++++++++++++
>>   6 files changed, 91 insertions(+)
>>
>> diff --git a/include/hw/vfio/vfio-common.h 
>> b/include/hw/vfio/vfio-common.h
>> index e9b8954595..c0b58f2bb7 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -227,6 +227,8 @@ extern VFIOGroupList vfio_group_list;
>>   bool vfio_mig_active(void);
>>   int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, 
>> Error **errp);
>>   void vfio_unblock_multiple_devices_migration(void);
>> +int vfio_block_postcopy_migration(VFIODevice *vbasedev, Error **errp);
>> +void vfio_unblock_postcopy_migration(void);
>>   bool vfio_viommu_preset(VFIODevice *vbasedev);
>>   int64_t vfio_mig_bytes_transferred(void);
>>   void vfio_reset_bytes_transferred(void);
>> diff --git a/migration/migration.h b/migration/migration.h
>> index c5695de214..21a6423408 100644
>> --- a/migration/migration.h
>> +++ b/migration/migration.h
>> @@ -514,6 +514,8 @@ void migration_cancel(const Error *error);
>>
>>   void migration_populate_vfio_info(MigrationInfo *info);
>>   void migration_reset_vfio_bytes_transferred(void);
>> +bool migration_vfio_mig_active(void);
>> +void migration_vfio_unblock_postcopy_migration(void);
>>   void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
>>
>>   #endif
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 373f6e5932..7461194b2b 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -40,6 +40,7 @@
>>   #include "trace.h"
>>   #include "qapi/error.h"
>>   #include "migration/migration.h"
>> +#include "migration/options.h"
>>   #include "migration/misc.h"
>>   #include "migration/blocker.h"
>>   #include "migration/qemu-file.h"
>> @@ -343,6 +344,7 @@ static int vfio_get_dirty_bitmap(VFIOContainer 
>> *container, uint64_t iova,
>>                                    uint64_t size, ram_addr_t ram_addr);
>>
>>   static Error *multiple_devices_migration_blocker;
>> +static Error *postcopy_migration_blocker;
>>
>>   static unsigned int vfio_migratable_devices_num(void)
>>   {
>> @@ -427,6 +429,47 @@ void vfio_unblock_multiple_devices_migration(void)
>>       multiple_devices_migration_blocker = NULL;
>>   }
>>
>> +int vfio_block_postcopy_migration(VFIODevice *vbasedev, Error **errp)
>> +{
>> +    int ret;
>> +
>> +    if (!migrate_postcopy_ram()) {
>> +        return 0;
>> +    }
>> +
>> +    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
>> +        error_setg(errp,
>> +                   "VFIO migration is not compatible with postcopy 
>> migration");
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (postcopy_migration_blocker) {
>> +        return 0;
>> +    }
>> +
>> +    error_setg(&postcopy_migration_blocker,
>> +               "VFIO migration is not compatible with postcopy 
>> migration");
>> +    ret = migrate_add_blocker(postcopy_migration_blocker, errp);
>> +    if (ret < 0) {
>> +        error_free(postcopy_migration_blocker);
>> +        postcopy_migration_blocker = NULL;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +void vfio_unblock_postcopy_migration(void)
>> +{
>> +    if (!postcopy_migration_blocker ||
>> +        (vfio_migratable_devices_num() && migrate_postcopy_ram())) {
>> +        return;
>> +    }
>> +
>> +    migrate_del_blocker(postcopy_migration_blocker);
>> +    error_free(postcopy_migration_blocker);
>> +    postcopy_migration_blocker = NULL;
>> +}
>> +
>>   bool vfio_mig_active(void)
>>   {
>>       return vfio_migratable_devices_num();
>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>> index 71855468fe..76406e9ae9 100644
>> --- a/hw/vfio/migration.c
>> +++ b/hw/vfio/migration.c
>> @@ -856,6 +856,7 @@ static void vfio_migration_deinit(VFIODevice 
>> *vbasedev)
>>       unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
>>       vfio_migration_free(vbasedev);
>>       vfio_unblock_multiple_devices_migration();
>> +    vfio_unblock_postcopy_migration();
>>   }
>>
>>   static int vfio_block_migration(VFIODevice *vbasedev, Error *err, 
>> Error **errp)
>> @@ -939,6 +940,11 @@ bool vfio_migration_realize(VFIODevice 
>> *vbasedev, Error **errp)
>>           goto out_deinit;
>>       }
>>
>> +    ret = vfio_block_postcopy_migration(vbasedev, errp);
>> +    if (ret) {
>> +        goto out_deinit;
>> +    }
>> +
>>       if (vfio_viommu_preset(vbasedev)) {
>>           error_setg(&err, "%s: Migration is currently not supported "
>>                      "with vIOMMU enabled", vbasedev->name);
>> diff --git a/migration/options.c b/migration/options.c
>> index 1d1e1321b0..e201053563 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c
>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool 
>> *new_caps, Error **errp)
>>               error_setg(errp, "Postcopy is not yet compatible with 
>> multifd");
>>               return false;
>>           }
>> +
>> +        if (migration_vfio_mig_active()) {
>> +            error_setg(errp, "Postcopy is not compatible with VFIO 
>> migration");
>> +            return false;
>> +        }
>>       }
>>
>>       if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
>> @@ -612,6 +617,16 @@ bool migrate_caps_check(bool *old_caps, bool 
>> *new_caps, Error **errp)
>>       return true;
>>   }
>>
>> +/*
>> + * Devices might have added migration blockers based on migration 
>> capabilities
>> + * values when those devices were added. Remove such blockers 
>> according to new
>> + * changes in migration capabilities.
>> + */
>> +static void migration_caps_remove_blockers(void)
>> +{
>> +    migration_vfio_unblock_postcopy_migration();
>> +}
>> +
>>   bool migrate_cap_set(int cap, bool value, Error **errp)
>>   {
>>       MigrationState *s = migrate_get_current();
>> @@ -629,6 +644,8 @@ bool migrate_cap_set(int cap, bool value, Error 
>> **errp)
>>           return false;
>>       }
>>       s->capabilities[cap] = value;
>> +    migration_caps_remove_blockers();
>> +
>>       return true;
>>   }
>>
>> @@ -678,6 +695,8 @@ void 
>> qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
>>       for (cap = params; cap; cap = cap->next) {
>>           s->capabilities[cap->value->capability] = cap->value->state;
>>       }
>> +
>> +    migration_caps_remove_blockers();
>>   }
>>
>>   /* parameters */
>> diff --git a/migration/target.c b/migration/target.c
>> index a6ffa9a5ce..690ecb4dd5 100644
>> --- a/migration/target.c
>> +++ b/migration/target.c
>> @@ -27,6 +27,16 @@ void migration_reset_vfio_bytes_transferred(void)
>>   {
>>       vfio_reset_bytes_transferred();
>>   }
>> +
>> +bool migration_vfio_mig_active(void)
>> +{
>> +    return vfio_mig_active();
>> +}
>> +
>> +void migration_vfio_unblock_postcopy_migration(void)
>> +{
>> +    vfio_unblock_postcopy_migration();
>> +}
>>   #else
>>   void migration_populate_vfio_info(MigrationInfo *info)
>>   {
>> @@ -35,4 +45,13 @@ void migration_populate_vfio_info(MigrationInfo 
>> *info)
>>   void migration_reset_vfio_bytes_transferred(void)
>>   {
>>   }
>> +
>> +bool migration_vfio_mig_active(void)
>> +{
>> +    return false;
>> +}
>> +
>> +void migration_vfio_unblock_postcopy_migration()
>
>
> Missing 'void' above.
>
Oh, right. Next time I will also test compilation without VFIO config.

Thanks.
Avihai Horon Aug. 29, 2023, 4:20 p.m. UTC | #4
On 29/08/2023 17:53, Peter Xu wrote:
> External email: Use caution opening links or attachments
>
>
> On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
>> diff --git a/migration/options.c b/migration/options.c
>> index 1d1e1321b0..e201053563 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c
>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>>               error_setg(errp, "Postcopy is not yet compatible with multifd");
>>               return false;
>>           }
>> +
>> +        if (migration_vfio_mig_active()) {
>> +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
>> +            return false;
>> +        }
> Hmm.. this will add yet another vfio hard-coded line into migration/..
>
> What will happen if the vfio device is hot plugged after enabling
> postcopy-ram here?

In that case a migration blocker will be added.

>
> Is it possible to do it in a generic way?

What comes to my mind is to let devices register a handler for a "caps 
change" notification and allow them to object.
But maybe that's a bit of an overkill.

>
> I was thinking the only unified place to do such check is when migration
> starts, as long as we switch to SETUP all caps are locked and doesn't allow
> any change until it finishes or fails.
>
> So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
> to fail the whole migration early?  For example, maybe we should have an
> Error** passed in, then if it fails it calls migrate_set_error, so
> reflected in query-migrate later too.

Yes, I think this could work and it will simplify things because we 
could also drop the VFIO migration blockers code.
The downside is that the user will know migration is blocked only when 
he tries to migrate, and migrate_caps_check() will not block setting 
postcopy when a VFIO device is already attached.
I don't have a strong opinion here, so if it's fine by you and everyone 
else, I could change that to what you suggested.

Thanks.
Peter Xu Aug. 29, 2023, 6:27 p.m. UTC | #5
On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
> 
> On 29/08/2023 17:53, Peter Xu wrote:
> > External email: Use caution opening links or attachments
> > 
> > 
> > On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
> > > diff --git a/migration/options.c b/migration/options.c
> > > index 1d1e1321b0..e201053563 100644
> > > --- a/migration/options.c
> > > +++ b/migration/options.c
> > > @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
> > >               error_setg(errp, "Postcopy is not yet compatible with multifd");
> > >               return false;
> > >           }
> > > +
> > > +        if (migration_vfio_mig_active()) {
> > > +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
> > > +            return false;
> > > +        }
> > Hmm.. this will add yet another vfio hard-coded line into migration/..
> > 
> > What will happen if the vfio device is hot plugged after enabling
> > postcopy-ram here?
> 
> In that case a migration blocker will be added.
> 
> > 
> > Is it possible to do it in a generic way?
> 
> What comes to my mind is to let devices register a handler for a "caps
> change" notification and allow them to object.
> But maybe that's a bit of an overkill.

This one also sounds better than hard-codes to me.

> 
> > 
> > I was thinking the only unified place to do such check is when migration
> > starts, as long as we switch to SETUP all caps are locked and doesn't allow
> > any change until it finishes or fails.
> > 
> > So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
> > to fail the whole migration early?  For example, maybe we should have an
> > Error** passed in, then if it fails it calls migrate_set_error, so
> > reflected in query-migrate later too.
> 
> Yes, I think this could work and it will simplify things because we could
> also drop the VFIO migration blockers code.
> The downside is that the user will know migration is blocked only when he
> tries to migrate, and migrate_caps_check() will not block setting postcopy
> when a VFIO device is already attached.
> I don't have a strong opinion here, so if it's fine by you and everyone
> else, I could change that to what you suggested.

Failing later would be fine in this case to me; my expectation is VFIO
users should be advanced already anyway (as the whole solution is still
pretty involved comparing to a generic VM migration) and shouldn't try to
trigger that at all in real life.  IOW I'd expect this check will be there
just for sanity, rather than being relied on to let people be aware of it
by the error message.

Meanwhile the blocker + caps check is slightly complicated to me to guard
both sides.  So I'd vote for failing at the QMP command.  But we can wait
and see whether there's other votes.

Thanks,
Avihai Horon Aug. 30, 2023, 7:01 a.m. UTC | #6
On 29/08/2023 21:27, Peter Xu wrote:
> External email: Use caution opening links or attachments
>
>
> On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
>> On 29/08/2023 17:53, Peter Xu wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
>>>> diff --git a/migration/options.c b/migration/options.c
>>>> index 1d1e1321b0..e201053563 100644
>>>> --- a/migration/options.c
>>>> +++ b/migration/options.c
>>>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>>>>                error_setg(errp, "Postcopy is not yet compatible with multifd");
>>>>                return false;
>>>>            }
>>>> +
>>>> +        if (migration_vfio_mig_active()) {
>>>> +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
>>>> +            return false;
>>>> +        }
>>> Hmm.. this will add yet another vfio hard-coded line into migration/..
>>>
>>> What will happen if the vfio device is hot plugged after enabling
>>> postcopy-ram here?
>> In that case a migration blocker will be added.
>>
>>> Is it possible to do it in a generic way?
>> What comes to my mind is to let devices register a handler for a "caps
>> change" notification and allow them to object.
>> But maybe that's a bit of an overkill.
> This one also sounds better than hard-codes to me.
>
>>> I was thinking the only unified place to do such check is when migration
>>> starts, as long as we switch to SETUP all caps are locked and doesn't allow
>>> any change until it finishes or fails.
>>>
>>> So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
>>> to fail the whole migration early?  For example, maybe we should have an
>>> Error** passed in, then if it fails it calls migrate_set_error, so
>>> reflected in query-migrate later too.
>> Yes, I think this could work and it will simplify things because we could
>> also drop the VFIO migration blockers code.
>> The downside is that the user will know migration is blocked only when he
>> tries to migrate, and migrate_caps_check() will not block setting postcopy
>> when a VFIO device is already attached.
>> I don't have a strong opinion here, so if it's fine by you and everyone
>> else, I could change that to what you suggested.
> Failing later would be fine in this case to me; my expectation is VFIO
> users should be advanced already anyway (as the whole solution is still
> pretty involved comparing to a generic VM migration) and shouldn't try to
> trigger that at all in real life.  IOW I'd expect this check will be there
> just for sanity, rather than being relied on to let people be aware of it
> by the error message.

Yes, I agree with you.

>
> Meanwhile the blocker + caps check is slightly complicated to me to guard
> both sides.  So I'd vote for failing at the QMP command.  But we can wait
> and see whether there's other votes.

Sure.
So I will do the checking in vfio_save_setup(), unless someone else has 
a better idea.

Thanks.
Cédric Le Goater Aug. 30, 2023, 8:37 a.m. UTC | #7
On 8/30/23 09:01, Avihai Horon wrote:
> 
> On 29/08/2023 21:27, Peter Xu wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
>>> On 29/08/2023 17:53, Peter Xu wrote:
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
>>>>> diff --git a/migration/options.c b/migration/options.c
>>>>> index 1d1e1321b0..e201053563 100644
>>>>> --- a/migration/options.c
>>>>> +++ b/migration/options.c
>>>>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>>>>>                error_setg(errp, "Postcopy is not yet compatible with multifd");
>>>>>                return false;
>>>>>            }
>>>>> +
>>>>> +        if (migration_vfio_mig_active()) {
>>>>> +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
>>>>> +            return false;
>>>>> +        }
>>>> Hmm.. this will add yet another vfio hard-coded line into migration/..
>>>>
>>>> What will happen if the vfio device is hot plugged after enabling
>>>> postcopy-ram here?
>>> In that case a migration blocker will be added.
>>>
>>>> Is it possible to do it in a generic way?
>>> What comes to my mind is to let devices register a handler for a "caps
>>> change" notification and allow them to object.
>>> But maybe that's a bit of an overkill.
>> This one also sounds better than hard-codes to me.
>>
>>>> I was thinking the only unified place to do such check is when migration
>>>> starts, as long as we switch to SETUP all caps are locked and doesn't allow
>>>> any change until it finishes or fails.
>>>>
>>>> So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
>>>> to fail the whole migration early?  For example, maybe we should have an
>>>> Error** passed in, then if it fails it calls migrate_set_error, so
>>>> reflected in query-migrate later too.
>>> Yes, I think this could work and it will simplify things because we could
>>> also drop the VFIO migration blockers code.
>>> The downside is that the user will know migration is blocked only when he
>>> tries to migrate, and migrate_caps_check() will not block setting postcopy
>>> when a VFIO device is already attached.
>>> I don't have a strong opinion here, so if it's fine by you and everyone
>>> else, I could change that to what you suggested.
>> Failing later would be fine in this case to me; my expectation is VFIO
>> users should be advanced already anyway (as the whole solution is still
>> pretty involved comparing to a generic VM migration) and shouldn't try to
>> trigger that at all in real life.  IOW I'd expect this check will be there
>> just for sanity, rather than being relied on to let people be aware of it
>> by the error message.
> 
> Yes, I agree with you.
> 
>>
>> Meanwhile the blocker + caps check is slightly complicated to me to guard
>> both sides.  So I'd vote for failing at the QMP command.  But we can wait
>> and see whether there's other votes.
> 
> Sure.
> So I will do the checking in vfio_save_setup(), unless someone else has a better idea.

Just to recap for my understanding,

vfio_save_setup() would test migrate_postcopy_ram() and update a new
'Error *err' parameter of the .save_setup() op which would be taken
into account in qemu_savevm_state_setup(). Is that correct ?


Thanks,

C.
Avihai Horon Aug. 30, 2023, 9:21 a.m. UTC | #8
On 30/08/2023 11:37, Cédric Le Goater wrote:
> External email: Use caution opening links or attachments
>
>
> On 8/30/23 09:01, Avihai Horon wrote:
>>
>> On 29/08/2023 21:27, Peter Xu wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
>>>> On 29/08/2023 17:53, Peter Xu wrote:
>>>>> External email: Use caution opening links or attachments
>>>>>
>>>>>
>>>>> On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
>>>>>> diff --git a/migration/options.c b/migration/options.c
>>>>>> index 1d1e1321b0..e201053563 100644
>>>>>> --- a/migration/options.c
>>>>>> +++ b/migration/options.c
>>>>>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool 
>>>>>> *new_caps, Error **errp)
>>>>>>                error_setg(errp, "Postcopy is not yet compatible 
>>>>>> with multifd");
>>>>>>                return false;
>>>>>>            }
>>>>>> +
>>>>>> +        if (migration_vfio_mig_active()) {
>>>>>> +            error_setg(errp, "Postcopy is not compatible with 
>>>>>> VFIO migration");
>>>>>> +            return false;
>>>>>> +        }
>>>>> Hmm.. this will add yet another vfio hard-coded line into 
>>>>> migration/..
>>>>>
>>>>> What will happen if the vfio device is hot plugged after enabling
>>>>> postcopy-ram here?
>>>> In that case a migration blocker will be added.
>>>>
>>>>> Is it possible to do it in a generic way?
>>>> What comes to my mind is to let devices register a handler for a "caps
>>>> change" notification and allow them to object.
>>>> But maybe that's a bit of an overkill.
>>> This one also sounds better than hard-codes to me.
>>>
>>>>> I was thinking the only unified place to do such check is when 
>>>>> migration
>>>>> starts, as long as we switch to SETUP all caps are locked and 
>>>>> doesn't allow
>>>>> any change until it finishes or fails.
>>>>>
>>>>> So, can we do this check inside vfio_save_setup(), allow 
>>>>> vfio_save_setup()
>>>>> to fail the whole migration early?  For example, maybe we should 
>>>>> have an
>>>>> Error** passed in, then if it fails it calls migrate_set_error, so
>>>>> reflected in query-migrate later too.
>>>> Yes, I think this could work and it will simplify things because we 
>>>> could
>>>> also drop the VFIO migration blockers code.
>>>> The downside is that the user will know migration is blocked only 
>>>> when he
>>>> tries to migrate, and migrate_caps_check() will not block setting 
>>>> postcopy
>>>> when a VFIO device is already attached.
>>>> I don't have a strong opinion here, so if it's fine by you and 
>>>> everyone
>>>> else, I could change that to what you suggested.
>>> Failing later would be fine in this case to me; my expectation is VFIO
>>> users should be advanced already anyway (as the whole solution is still
>>> pretty involved comparing to a generic VM migration) and shouldn't 
>>> try to
>>> trigger that at all in real life.  IOW I'd expect this check will be 
>>> there
>>> just for sanity, rather than being relied on to let people be aware 
>>> of it
>>> by the error message.
>>
>> Yes, I agree with you.
>>
>>>
>>> Meanwhile the blocker + caps check is slightly complicated to me to 
>>> guard
>>> both sides.  So I'd vote for failing at the QMP command.  But we can 
>>> wait
>>> and see whether there's other votes.
>>
>> Sure.
>> So I will do the checking in vfio_save_setup(), unless someone else 
>> has a better idea.
>
> Just to recap for my understanding,
>
> vfio_save_setup() would test migrate_postcopy_ram() and update a new
> 'Error *err' parameter of the .save_setup() op which would be taken
> into account in qemu_savevm_state_setup(). Is that correct ?
>
Yes.
But I wonder if it would be simpler to call migrate_set_error() directly 
from vfio_save_setup() instead of adding "Error *err" argument to 
.save_setup() and changing all other users.
What do you prefer?

Thanks.
Cédric Le Goater Aug. 30, 2023, 9:53 a.m. UTC | #9
On 8/30/23 11:21, Avihai Horon wrote:
> 
> On 30/08/2023 11:37, Cédric Le Goater wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On 8/30/23 09:01, Avihai Horon wrote:
>>>
>>> On 29/08/2023 21:27, Peter Xu wrote:
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
>>>>> On 29/08/2023 17:53, Peter Xu wrote:
>>>>>> External email: Use caution opening links or attachments
>>>>>>
>>>>>>
>>>>>> On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
>>>>>>> diff --git a/migration/options.c b/migration/options.c
>>>>>>> index 1d1e1321b0..e201053563 100644
>>>>>>> --- a/migration/options.c
>>>>>>> +++ b/migration/options.c
>>>>>>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>>>>>>>                error_setg(errp, "Postcopy is not yet compatible with multifd");
>>>>>>>                return false;
>>>>>>>            }
>>>>>>> +
>>>>>>> +        if (migration_vfio_mig_active()) {
>>>>>>> +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
>>>>>>> +            return false;
>>>>>>> +        }
>>>>>> Hmm.. this will add yet another vfio hard-coded line into migration/..
>>>>>>
>>>>>> What will happen if the vfio device is hot plugged after enabling
>>>>>> postcopy-ram here?
>>>>> In that case a migration blocker will be added.
>>>>>
>>>>>> Is it possible to do it in a generic way?
>>>>> What comes to my mind is to let devices register a handler for a "caps
>>>>> change" notification and allow them to object.
>>>>> But maybe that's a bit of an overkill.
>>>> This one also sounds better than hard-codes to me.
>>>>
>>>>>> I was thinking the only unified place to do such check is when migration
>>>>>> starts, as long as we switch to SETUP all caps are locked and doesn't allow
>>>>>> any change until it finishes or fails.
>>>>>>
>>>>>> So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
>>>>>> to fail the whole migration early?  For example, maybe we should have an
>>>>>> Error** passed in, then if it fails it calls migrate_set_error, so
>>>>>> reflected in query-migrate later too.
>>>>> Yes, I think this could work and it will simplify things because we could
>>>>> also drop the VFIO migration blockers code.
>>>>> The downside is that the user will know migration is blocked only when he
>>>>> tries to migrate, and migrate_caps_check() will not block setting postcopy
>>>>> when a VFIO device is already attached.
>>>>> I don't have a strong opinion here, so if it's fine by you and everyone
>>>>> else, I could change that to what you suggested.
>>>> Failing later would be fine in this case to me; my expectation is VFIO
>>>> users should be advanced already anyway (as the whole solution is still
>>>> pretty involved comparing to a generic VM migration) and shouldn't try to
>>>> trigger that at all in real life.  IOW I'd expect this check will be there
>>>> just for sanity, rather than being relied on to let people be aware of it
>>>> by the error message.
>>>
>>> Yes, I agree with you.
>>>
>>>>
>>>> Meanwhile the blocker + caps check is slightly complicated to me to guard
>>>> both sides.  So I'd vote for failing at the QMP command.  But we can wait
>>>> and see whether there's other votes.
>>>
>>> Sure.
>>> So I will do the checking in vfio_save_setup(), unless someone else has a better idea.
>>
>> Just to recap for my understanding,
>>
>> vfio_save_setup() would test migrate_postcopy_ram() and update a new
>> 'Error *err' parameter of the .save_setup() op which would be taken
>> into account in qemu_savevm_state_setup(). Is that correct ?
>>
> Yes.
> But I wonder if it would be simpler to call migrate_set_error() directly from vfio_save_setup() instead of adding "Error *err" argument to .save_setup() and changing all other users.
> What do you prefer?

Well, with my downstreamer hat, I would prefer a simpler solution for the
VFIO postcopy limitation first. That said, there is value in adding
a 'Error *' parameter to the .save_setup() op and letting the top routine
qemu_savevm_state_setup() propagate. Other SaveVMhandler could start using
it. even VFIO has multiple error_report() in vfio_save_setup() which could
be propagated to the top callers.

Let's try that first. I will check your new series on top of 8.0

Thanks,

C.


> 
> Thanks.
>
Avihai Horon Aug. 30, 2023, 10:12 a.m. UTC | #10
On 30/08/2023 12:53, Cédric Le Goater wrote:
> External email: Use caution opening links or attachments
>
>
> On 8/30/23 11:21, Avihai Horon wrote:
>>
>> On 30/08/2023 11:37, Cédric Le Goater wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> On 8/30/23 09:01, Avihai Horon wrote:
>>>>
>>>> On 29/08/2023 21:27, Peter Xu wrote:
>>>>> External email: Use caution opening links or attachments
>>>>>
>>>>>
>>>>> On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
>>>>>> On 29/08/2023 17:53, Peter Xu wrote:
>>>>>>> External email: Use caution opening links or attachments
>>>>>>>
>>>>>>>
>>>>>>> On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
>>>>>>>> diff --git a/migration/options.c b/migration/options.c
>>>>>>>> index 1d1e1321b0..e201053563 100644
>>>>>>>> --- a/migration/options.c
>>>>>>>> +++ b/migration/options.c
>>>>>>>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, 
>>>>>>>> bool *new_caps, Error **errp)
>>>>>>>>                error_setg(errp, "Postcopy is not yet compatible 
>>>>>>>> with multifd");
>>>>>>>>                return false;
>>>>>>>>            }
>>>>>>>> +
>>>>>>>> +        if (migration_vfio_mig_active()) {
>>>>>>>> +            error_setg(errp, "Postcopy is not compatible with 
>>>>>>>> VFIO migration");
>>>>>>>> +            return false;
>>>>>>>> +        }
>>>>>>> Hmm.. this will add yet another vfio hard-coded line into 
>>>>>>> migration/..
>>>>>>>
>>>>>>> What will happen if the vfio device is hot plugged after enabling
>>>>>>> postcopy-ram here?
>>>>>> In that case a migration blocker will be added.
>>>>>>
>>>>>>> Is it possible to do it in a generic way?
>>>>>> What comes to my mind is to let devices register a handler for a 
>>>>>> "caps
>>>>>> change" notification and allow them to object.
>>>>>> But maybe that's a bit of an overkill.
>>>>> This one also sounds better than hard-codes to me.
>>>>>
>>>>>>> I was thinking the only unified place to do such check is when 
>>>>>>> migration
>>>>>>> starts, as long as we switch to SETUP all caps are locked and 
>>>>>>> doesn't allow
>>>>>>> any change until it finishes or fails.
>>>>>>>
>>>>>>> So, can we do this check inside vfio_save_setup(), allow 
>>>>>>> vfio_save_setup()
>>>>>>> to fail the whole migration early?  For example, maybe we should 
>>>>>>> have an
>>>>>>> Error** passed in, then if it fails it calls migrate_set_error, so
>>>>>>> reflected in query-migrate later too.
>>>>>> Yes, I think this could work and it will simplify things because 
>>>>>> we could
>>>>>> also drop the VFIO migration blockers code.
>>>>>> The downside is that the user will know migration is blocked only 
>>>>>> when he
>>>>>> tries to migrate, and migrate_caps_check() will not block setting 
>>>>>> postcopy
>>>>>> when a VFIO device is already attached.
>>>>>> I don't have a strong opinion here, so if it's fine by you and 
>>>>>> everyone
>>>>>> else, I could change that to what you suggested.
>>>>> Failing later would be fine in this case to me; my expectation is 
>>>>> VFIO
>>>>> users should be advanced already anyway (as the whole solution is 
>>>>> still
>>>>> pretty involved comparing to a generic VM migration) and shouldn't 
>>>>> try to
>>>>> trigger that at all in real life.  IOW I'd expect this check will 
>>>>> be there
>>>>> just for sanity, rather than being relied on to let people be 
>>>>> aware of it
>>>>> by the error message.
>>>>
>>>> Yes, I agree with you.
>>>>
>>>>>
>>>>> Meanwhile the blocker + caps check is slightly complicated to me 
>>>>> to guard
>>>>> both sides.  So I'd vote for failing at the QMP command. But we 
>>>>> can wait
>>>>> and see whether there's other votes.
>>>>
>>>> Sure.
>>>> So I will do the checking in vfio_save_setup(), unless someone else 
>>>> has a better idea.
>>>
>>> Just to recap for my understanding,
>>>
>>> vfio_save_setup() would test migrate_postcopy_ram() and update a new
>>> 'Error *err' parameter of the .save_setup() op which would be taken
>>> into account in qemu_savevm_state_setup(). Is that correct ?
>>>
>> Yes.
>> But I wonder if it would be simpler to call migrate_set_error() 
>> directly from vfio_save_setup() instead of adding "Error *err" 
>> argument to .save_setup() and changing all other users.
>> What do you prefer?
>
> Well, with my downstreamer hat, I would prefer a simpler solution for the
> VFIO postcopy limitation first. That said, there is value in adding
> a 'Error *' parameter to the .save_setup() op and letting the top routine
> qemu_savevm_state_setup() propagate. Other SaveVMhandler could start 
> using
> it. even VFIO has multiple error_report() in vfio_save_setup() which 
> could
> be propagated to the top callers.
>
> Let's try that first. I will check your new series on top of 8.0
>
OK, so just making sure, you want to add "Error *err" argument to 
.save_setup() first and see how it goes, right?

Thanks.
Cédric Le Goater Aug. 30, 2023, 11:17 a.m. UTC | #11
On 8/30/23 12:12, Avihai Horon wrote:
> 
> On 30/08/2023 12:53, Cédric Le Goater wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On 8/30/23 11:21, Avihai Horon wrote:
>>>
>>> On 30/08/2023 11:37, Cédric Le Goater wrote:
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> On 8/30/23 09:01, Avihai Horon wrote:
>>>>>
>>>>> On 29/08/2023 21:27, Peter Xu wrote:
>>>>>> External email: Use caution opening links or attachments
>>>>>>
>>>>>>
>>>>>> On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
>>>>>>> On 29/08/2023 17:53, Peter Xu wrote:
>>>>>>>> External email: Use caution opening links or attachments
>>>>>>>>
>>>>>>>>
>>>>>>>> On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
>>>>>>>>> diff --git a/migration/options.c b/migration/options.c
>>>>>>>>> index 1d1e1321b0..e201053563 100644
>>>>>>>>> --- a/migration/options.c
>>>>>>>>> +++ b/migration/options.c
>>>>>>>>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>>>>>>>>>                error_setg(errp, "Postcopy is not yet compatible with multifd");
>>>>>>>>>                return false;
>>>>>>>>>            }
>>>>>>>>> +
>>>>>>>>> +        if (migration_vfio_mig_active()) {
>>>>>>>>> +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
>>>>>>>>> +            return false;
>>>>>>>>> +        }
>>>>>>>> Hmm.. this will add yet another vfio hard-coded line into migration/..
>>>>>>>>
>>>>>>>> What will happen if the vfio device is hot plugged after enabling
>>>>>>>> postcopy-ram here?
>>>>>>> In that case a migration blocker will be added.
>>>>>>>
>>>>>>>> Is it possible to do it in a generic way?
>>>>>>> What comes to my mind is to let devices register a handler for a "caps
>>>>>>> change" notification and allow them to object.
>>>>>>> But maybe that's a bit of an overkill.
>>>>>> This one also sounds better than hard-codes to me.
>>>>>>
>>>>>>>> I was thinking the only unified place to do such check is when migration
>>>>>>>> starts, as long as we switch to SETUP all caps are locked and doesn't allow
>>>>>>>> any change until it finishes or fails.
>>>>>>>>
>>>>>>>> So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
>>>>>>>> to fail the whole migration early?  For example, maybe we should have an
>>>>>>>> Error** passed in, then if it fails it calls migrate_set_error, so
>>>>>>>> reflected in query-migrate later too.
>>>>>>> Yes, I think this could work and it will simplify things because we could
>>>>>>> also drop the VFIO migration blockers code.
>>>>>>> The downside is that the user will know migration is blocked only when he
>>>>>>> tries to migrate, and migrate_caps_check() will not block setting postcopy
>>>>>>> when a VFIO device is already attached.
>>>>>>> I don't have a strong opinion here, so if it's fine by you and everyone
>>>>>>> else, I could change that to what you suggested.
>>>>>> Failing later would be fine in this case to me; my expectation is VFIO
>>>>>> users should be advanced already anyway (as the whole solution is still
>>>>>> pretty involved comparing to a generic VM migration) and shouldn't try to
>>>>>> trigger that at all in real life.  IOW I'd expect this check will be there
>>>>>> just for sanity, rather than being relied on to let people be aware of it
>>>>>> by the error message.
>>>>>
>>>>> Yes, I agree with you.
>>>>>
>>>>>>
>>>>>> Meanwhile the blocker + caps check is slightly complicated to me to guard
>>>>>> both sides.  So I'd vote for failing at the QMP command. But we can wait
>>>>>> and see whether there's other votes.
>>>>>
>>>>> Sure.
>>>>> So I will do the checking in vfio_save_setup(), unless someone else has a better idea.
>>>>
>>>> Just to recap for my understanding,
>>>>
>>>> vfio_save_setup() would test migrate_postcopy_ram() and update a new
>>>> 'Error *err' parameter of the .save_setup() op which would be taken
>>>> into account in qemu_savevm_state_setup(). Is that correct ?
>>>>
>>> Yes.
>>> But I wonder if it would be simpler to call migrate_set_error() directly from vfio_save_setup() instead of adding "Error *err" argument to .save_setup() and changing all other users.
>>> What do you prefer?
>>
>> Well, with my downstreamer hat, I would prefer a simpler solution for the
>> VFIO postcopy limitation first. That said, there is value in adding
>> a 'Error *' parameter to the .save_setup() op and letting the top routine
>> qemu_savevm_state_setup() propagate. Other SaveVMhandler could start using
>> it. even VFIO has multiple error_report() in vfio_save_setup() which could
>> be propagated to the top callers.
>>
>> Let's try that first. I will check your new series on top of 8.0
>>
> OK, so just making sure, you want to add "Error *err" argument to .save_setup() first and see how it goes, right?

yes. Sorry. that was not clear.

Thanks

C.
Peter Xu Aug. 30, 2023, 2:22 p.m. UTC | #12
On Wed, Aug 30, 2023 at 01:17:55PM +0200, Cédric Le Goater wrote:
> On 8/30/23 12:12, Avihai Horon wrote:
> > 
> > On 30/08/2023 12:53, Cédric Le Goater wrote:
> > > External email: Use caution opening links or attachments
> > > 
> > > 
> > > On 8/30/23 11:21, Avihai Horon wrote:
> > > > 
> > > > On 30/08/2023 11:37, Cédric Le Goater wrote:
> > > > > External email: Use caution opening links or attachments
> > > > > 
> > > > > 
> > > > > On 8/30/23 09:01, Avihai Horon wrote:
> > > > > > 
> > > > > > On 29/08/2023 21:27, Peter Xu wrote:
> > > > > > > External email: Use caution opening links or attachments
> > > > > > > 
> > > > > > > 
> > > > > > > On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
> > > > > > > > On 29/08/2023 17:53, Peter Xu wrote:
> > > > > > > > > External email: Use caution opening links or attachments
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
> > > > > > > > > > diff --git a/migration/options.c b/migration/options.c
> > > > > > > > > > index 1d1e1321b0..e201053563 100644
> > > > > > > > > > --- a/migration/options.c
> > > > > > > > > > +++ b/migration/options.c
> > > > > > > > > > @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
> > > > > > > > > >                error_setg(errp, "Postcopy is not yet compatible with multifd");
> > > > > > > > > >                return false;
> > > > > > > > > >            }
> > > > > > > > > > +
> > > > > > > > > > +        if (migration_vfio_mig_active()) {
> > > > > > > > > > +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
> > > > > > > > > > +            return false;
> > > > > > > > > > +        }
> > > > > > > > > Hmm.. this will add yet another vfio hard-coded line into migration/..
> > > > > > > > > 
> > > > > > > > > What will happen if the vfio device is hot plugged after enabling
> > > > > > > > > postcopy-ram here?
> > > > > > > > In that case a migration blocker will be added.
> > > > > > > > 
> > > > > > > > > Is it possible to do it in a generic way?
> > > > > > > > What comes to my mind is to let devices register a handler for a "caps
> > > > > > > > change" notification and allow them to object.
> > > > > > > > But maybe that's a bit of an overkill.
> > > > > > > This one also sounds better than hard-codes to me.
> > > > > > > 
> > > > > > > > > I was thinking the only unified place to do such check is when migration
> > > > > > > > > starts, as long as we switch to SETUP all caps are locked and doesn't allow
> > > > > > > > > any change until it finishes or fails.
> > > > > > > > > 
> > > > > > > > > So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
> > > > > > > > > to fail the whole migration early?  For example, maybe we should have an
> > > > > > > > > Error** passed in, then if it fails it calls migrate_set_error, so
> > > > > > > > > reflected in query-migrate later too.
> > > > > > > > Yes, I think this could work and it will simplify things because we could
> > > > > > > > also drop the VFIO migration blockers code.
> > > > > > > > The downside is that the user will know migration is blocked only when he
> > > > > > > > tries to migrate, and migrate_caps_check() will not block setting postcopy
> > > > > > > > when a VFIO device is already attached.
> > > > > > > > I don't have a strong opinion here, so if it's fine by you and everyone
> > > > > > > > else, I could change that to what you suggested.
> > > > > > > Failing later would be fine in this case to me; my expectation is VFIO
> > > > > > > users should be advanced already anyway (as the whole solution is still
> > > > > > > pretty involved comparing to a generic VM migration) and shouldn't try to
> > > > > > > trigger that at all in real life.  IOW I'd expect this check will be there
> > > > > > > just for sanity, rather than being relied on to let people be aware of it
> > > > > > > by the error message.
> > > > > > 
> > > > > > Yes, I agree with you.
> > > > > > 
> > > > > > > 
> > > > > > > Meanwhile the blocker + caps check is slightly complicated to me to guard
> > > > > > > both sides.  So I'd vote for failing at the QMP command. But we can wait
> > > > > > > and see whether there's other votes.
> > > > > > 
> > > > > > Sure.
> > > > > > So I will do the checking in vfio_save_setup(), unless someone else has a better idea.
> > > > > 
> > > > > Just to recap for my understanding,
> > > > > 
> > > > > vfio_save_setup() would test migrate_postcopy_ram() and update a new
> > > > > 'Error *err' parameter of the .save_setup() op which would be taken
> > > > > into account in qemu_savevm_state_setup(). Is that correct ?
> > > > > 
> > > > Yes.
> > > > But I wonder if it would be simpler to call migrate_set_error() directly from vfio_save_setup() instead of adding "Error *err" argument to .save_setup() and changing all other users.
> > > > What do you prefer?
> > > 
> > > Well, with my downstreamer hat, I would prefer a simpler solution for the
> > > VFIO postcopy limitation first. That said, there is value in adding
> > > a 'Error *' parameter to the .save_setup() op and letting the top routine
> > > qemu_savevm_state_setup() propagate. Other SaveVMhandler could start using
> > > it. even VFIO has multiple error_report() in vfio_save_setup() which could
> > > be propagated to the top callers.
> > > 
> > > Let's try that first. I will check your new series on top of 8.0
> > > 
> > OK, so just making sure, you want to add "Error *err" argument to .save_setup() first and see how it goes, right?
> 
> yes. Sorry. that was not clear.

I just remembered one pity of failing at save_setup() is it won't fail qmp
command "migrate" itself, but only reflected in query-migrate later.

If we want to make it even better (no strong opinion here, of now), we can
have it separate from save_setup(), e.g., SaveVMHandlers.save_prepare(), so
that it can be called even at migrate_prepare() and fail the QMP command
with proper errors.

Thanks,
Avihai Horon Aug. 30, 2023, 4:06 p.m. UTC | #13
On 30/08/2023 17:22, Peter Xu wrote:
> External email: Use caution opening links or attachments
>
>
> On Wed, Aug 30, 2023 at 01:17:55PM +0200, Cédric Le Goater wrote:
>> On 8/30/23 12:12, Avihai Horon wrote:
>>> On 30/08/2023 12:53, Cédric Le Goater wrote:
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> On 8/30/23 11:21, Avihai Horon wrote:
>>>>> On 30/08/2023 11:37, Cédric Le Goater wrote:
>>>>>> External email: Use caution opening links or attachments
>>>>>>
>>>>>>
>>>>>> On 8/30/23 09:01, Avihai Horon wrote:
>>>>>>> On 29/08/2023 21:27, Peter Xu wrote:
>>>>>>>> External email: Use caution opening links or attachments
>>>>>>>>
>>>>>>>>
>>>>>>>> On Tue, Aug 29, 2023 at 07:20:47PM +0300, Avihai Horon wrote:
>>>>>>>>> On 29/08/2023 17:53, Peter Xu wrote:
>>>>>>>>>> External email: Use caution opening links or attachments
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On Mon, Aug 28, 2023 at 06:18:41PM +0300, Avihai Horon wrote:
>>>>>>>>>>> diff --git a/migration/options.c b/migration/options.c
>>>>>>>>>>> index 1d1e1321b0..e201053563 100644
>>>>>>>>>>> --- a/migration/options.c
>>>>>>>>>>> +++ b/migration/options.c
>>>>>>>>>>> @@ -499,6 +499,11 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>>>>>>>>>>>                 error_setg(errp, "Postcopy is not yet compatible with multifd");
>>>>>>>>>>>                 return false;
>>>>>>>>>>>             }
>>>>>>>>>>> +
>>>>>>>>>>> +        if (migration_vfio_mig_active()) {
>>>>>>>>>>> +            error_setg(errp, "Postcopy is not compatible with VFIO migration");
>>>>>>>>>>> +            return false;
>>>>>>>>>>> +        }
>>>>>>>>>> Hmm.. this will add yet another vfio hard-coded line into migration/..
>>>>>>>>>>
>>>>>>>>>> What will happen if the vfio device is hot plugged after enabling
>>>>>>>>>> postcopy-ram here?
>>>>>>>>> In that case a migration blocker will be added.
>>>>>>>>>
>>>>>>>>>> Is it possible to do it in a generic way?
>>>>>>>>> What comes to my mind is to let devices register a handler for a "caps
>>>>>>>>> change" notification and allow them to object.
>>>>>>>>> But maybe that's a bit of an overkill.
>>>>>>>> This one also sounds better than hard-codes to me.
>>>>>>>>
>>>>>>>>>> I was thinking the only unified place to do such check is when migration
>>>>>>>>>> starts, as long as we switch to SETUP all caps are locked and doesn't allow
>>>>>>>>>> any change until it finishes or fails.
>>>>>>>>>>
>>>>>>>>>> So, can we do this check inside vfio_save_setup(), allow vfio_save_setup()
>>>>>>>>>> to fail the whole migration early?  For example, maybe we should have an
>>>>>>>>>> Error** passed in, then if it fails it calls migrate_set_error, so
>>>>>>>>>> reflected in query-migrate later too.
>>>>>>>>> Yes, I think this could work and it will simplify things because we could
>>>>>>>>> also drop the VFIO migration blockers code.
>>>>>>>>> The downside is that the user will know migration is blocked only when he
>>>>>>>>> tries to migrate, and migrate_caps_check() will not block setting postcopy
>>>>>>>>> when a VFIO device is already attached.
>>>>>>>>> I don't have a strong opinion here, so if it's fine by you and everyone
>>>>>>>>> else, I could change that to what you suggested.
>>>>>>>> Failing later would be fine in this case to me; my expectation is VFIO
>>>>>>>> users should be advanced already anyway (as the whole solution is still
>>>>>>>> pretty involved comparing to a generic VM migration) and shouldn't try to
>>>>>>>> trigger that at all in real life.  IOW I'd expect this check will be there
>>>>>>>> just for sanity, rather than being relied on to let people be aware of it
>>>>>>>> by the error message.
>>>>>>> Yes, I agree with you.
>>>>>>>
>>>>>>>> Meanwhile the blocker + caps check is slightly complicated to me to guard
>>>>>>>> both sides.  So I'd vote for failing at the QMP command. But we can wait
>>>>>>>> and see whether there's other votes.
>>>>>>> Sure.
>>>>>>> So I will do the checking in vfio_save_setup(), unless someone else has a better idea.
>>>>>> Just to recap for my understanding,
>>>>>>
>>>>>> vfio_save_setup() would test migrate_postcopy_ram() and update a new
>>>>>> 'Error *err' parameter of the .save_setup() op which would be taken
>>>>>> into account in qemu_savevm_state_setup(). Is that correct ?
>>>>>>
>>>>> Yes.
>>>>> But I wonder if it would be simpler to call migrate_set_error() directly from vfio_save_setup() instead of adding "Error *err" argument to .save_setup() and changing all other users.
>>>>> What do you prefer?
>>>> Well, with my downstreamer hat, I would prefer a simpler solution for the
>>>> VFIO postcopy limitation first. That said, there is value in adding
>>>> a 'Error *' parameter to the .save_setup() op and letting the top routine
>>>> qemu_savevm_state_setup() propagate. Other SaveVMhandler could start using
>>>> it. even VFIO has multiple error_report() in vfio_save_setup() which could
>>>> be propagated to the top callers.
>>>>
>>>> Let's try that first. I will check your new series on top of 8.0
>>>>
>>> OK, so just making sure, you want to add "Error *err" argument to .save_setup() first and see how it goes, right?
>> yes. Sorry. that was not clear.
> I just remembered one pity of failing at save_setup() is it won't fail qmp
> command "migrate" itself, but only reflected in query-migrate later.
>
> If we want to make it even better (no strong opinion here, of now), we can
> have it separate from save_setup(), e.g., SaveVMHandlers.save_prepare(), so
> that it can be called even at migrate_prepare() and fail the QMP command
> with proper errors.

Sounds good.
I will do it and send v2, hopefully tomorrow.

Thanks.
diff mbox series

Patch

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index e9b8954595..c0b58f2bb7 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -227,6 +227,8 @@  extern VFIOGroupList vfio_group_list;
 bool vfio_mig_active(void);
 int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp);
 void vfio_unblock_multiple_devices_migration(void);
+int vfio_block_postcopy_migration(VFIODevice *vbasedev, Error **errp);
+void vfio_unblock_postcopy_migration(void);
 bool vfio_viommu_preset(VFIODevice *vbasedev);
 int64_t vfio_mig_bytes_transferred(void);
 void vfio_reset_bytes_transferred(void);
diff --git a/migration/migration.h b/migration/migration.h
index c5695de214..21a6423408 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -514,6 +514,8 @@  void migration_cancel(const Error *error);
 
 void migration_populate_vfio_info(MigrationInfo *info);
 void migration_reset_vfio_bytes_transferred(void);
+bool migration_vfio_mig_active(void);
+void migration_vfio_unblock_postcopy_migration(void);
 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
 
 #endif
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 373f6e5932..7461194b2b 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -40,6 +40,7 @@ 
 #include "trace.h"
 #include "qapi/error.h"
 #include "migration/migration.h"
+#include "migration/options.h"
 #include "migration/misc.h"
 #include "migration/blocker.h"
 #include "migration/qemu-file.h"
@@ -343,6 +344,7 @@  static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
                                  uint64_t size, ram_addr_t ram_addr);
 
 static Error *multiple_devices_migration_blocker;
+static Error *postcopy_migration_blocker;
 
 static unsigned int vfio_migratable_devices_num(void)
 {
@@ -427,6 +429,47 @@  void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }
 
+int vfio_block_postcopy_migration(VFIODevice *vbasedev, Error **errp)
+{
+    int ret;
+
+    if (!migrate_postcopy_ram()) {
+        return 0;
+    }
+
+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_setg(errp,
+                   "VFIO migration is not compatible with postcopy migration");
+        return -EINVAL;
+    }
+
+    if (postcopy_migration_blocker) {
+        return 0;
+    }
+
+    error_setg(&postcopy_migration_blocker,
+               "VFIO migration is not compatible with postcopy migration");
+    ret = migrate_add_blocker(postcopy_migration_blocker, errp);
+    if (ret < 0) {
+        error_free(postcopy_migration_blocker);
+        postcopy_migration_blocker = NULL;
+    }
+
+    return ret;
+}
+
+void vfio_unblock_postcopy_migration(void)
+{
+    if (!postcopy_migration_blocker ||
+        (vfio_migratable_devices_num() && migrate_postcopy_ram())) {
+        return;
+    }
+
+    migrate_del_blocker(postcopy_migration_blocker);
+    error_free(postcopy_migration_blocker);
+    postcopy_migration_blocker = NULL;
+}
+
 bool vfio_mig_active(void)
 {
     return vfio_migratable_devices_num();
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 71855468fe..76406e9ae9 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -856,6 +856,7 @@  static void vfio_migration_deinit(VFIODevice *vbasedev)
     unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
     vfio_migration_free(vbasedev);
     vfio_unblock_multiple_devices_migration();
+    vfio_unblock_postcopy_migration();
 }
 
 static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
@@ -939,6 +940,11 @@  bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
         goto out_deinit;
     }
 
+    ret = vfio_block_postcopy_migration(vbasedev, errp);
+    if (ret) {
+        goto out_deinit;
+    }
+
     if (vfio_viommu_preset(vbasedev)) {
         error_setg(&err, "%s: Migration is currently not supported "
                    "with vIOMMU enabled", vbasedev->name);
diff --git a/migration/options.c b/migration/options.c
index 1d1e1321b0..e201053563 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -499,6 +499,11 @@  bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
             error_setg(errp, "Postcopy is not yet compatible with multifd");
             return false;
         }
+
+        if (migration_vfio_mig_active()) {
+            error_setg(errp, "Postcopy is not compatible with VFIO migration");
+            return false;
+        }
     }
 
     if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
@@ -612,6 +617,16 @@  bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
     return true;
 }
 
+/*
+ * Devices might have added migration blockers based on migration capabilities
+ * values when those devices were added. Remove such blockers according to new
+ * changes in migration capabilities.
+ */
+static void migration_caps_remove_blockers(void)
+{
+    migration_vfio_unblock_postcopy_migration();
+}
+
 bool migrate_cap_set(int cap, bool value, Error **errp)
 {
     MigrationState *s = migrate_get_current();
@@ -629,6 +644,8 @@  bool migrate_cap_set(int cap, bool value, Error **errp)
         return false;
     }
     s->capabilities[cap] = value;
+    migration_caps_remove_blockers();
+
     return true;
 }
 
@@ -678,6 +695,8 @@  void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
     for (cap = params; cap; cap = cap->next) {
         s->capabilities[cap->value->capability] = cap->value->state;
     }
+
+    migration_caps_remove_blockers();
 }
 
 /* parameters */
diff --git a/migration/target.c b/migration/target.c
index a6ffa9a5ce..690ecb4dd5 100644
--- a/migration/target.c
+++ b/migration/target.c
@@ -27,6 +27,16 @@  void migration_reset_vfio_bytes_transferred(void)
 {
     vfio_reset_bytes_transferred();
 }
+
+bool migration_vfio_mig_active(void)
+{
+    return vfio_mig_active();
+}
+
+void migration_vfio_unblock_postcopy_migration(void)
+{
+    vfio_unblock_postcopy_migration();
+}
 #else
 void migration_populate_vfio_info(MigrationInfo *info)
 {
@@ -35,4 +45,13 @@  void migration_populate_vfio_info(MigrationInfo *info)
 void migration_reset_vfio_bytes_transferred(void)
 {
 }
+
+bool migration_vfio_mig_active(void)
+{
+    return false;
+}
+
+void migration_vfio_unblock_postcopy_migration()
+{
+}
 #endif