diff mbox series

[v3,4/4] qapi: introduce exit-on-error parameter for migrate-incoming

Message ID 20240425181551.1465739-5-vsementsov@yandex-team.ru
State New
Headers show
Series migration: do not exit on incoming failure | expand

Commit Message

Vladimir Sementsov-Ogievskiy April 25, 2024, 6:15 p.m. UTC
Now we do set the MIGRATION_FAILED state, but don't give the
orchestrator a chance to query the migration state and get the error.

Let's make it possible for QMP-based orchestrators to get the error,
as with outgoing migration.

For hmp_migrate_incoming(), let's enable the new behavior: HMP is not
an ABI, it's mostly intended for use by developers, and it makes sense
not to stop the process.

For x-exit-preconfig, let's keep the old behavior:
 - it's called from init(), so here we want to keep current behavior by
   default
 - it does exit on error by itself as well
So, if we want to change the behavior of x-exit-preconfig, it should be
another patch.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
---
 migration/migration-hmp-cmds.c |  2 +-
 migration/migration.c          | 36 ++++++++++++++++++++++++++++------
 migration/migration.h          |  3 +++
 qapi/migration.json            |  7 ++++++-
 system/vl.c                    |  3 ++-
 5 files changed, 42 insertions(+), 9 deletions(-)

Comments

Fabiano Rosas April 25, 2024, 8:30 p.m. UTC | #1
Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> writes:

> Now we do set MIGRATION_FAILED state, but don't give a chance to
> orchestrator to query migration state and get the error.
>
> Let's provide a possibility for QMP-based orchestrators to get an error
> like with outgoing migration.
>
> For hmp_migrate_incoming(), let's enable the new behavior: HMP is not
> and ABI, it's mostly intended to use by developer and it makes sense
> not to stop the process.
>
> For x-exit-preconfig, let's keep the old behavior:
>  - it's called from init(), so here we want to keep current behavior by
>    default
>  - it does exit on error by itself as well
> So, if we want to change the behavior of x-exit-preconfig, it should be
> another patch.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
> ---
>  migration/migration-hmp-cmds.c |  2 +-
>  migration/migration.c          | 36 ++++++++++++++++++++++++++++------
>  migration/migration.h          |  3 +++
>  qapi/migration.json            |  7 ++++++-
>  system/vl.c                    |  3 ++-
>  5 files changed, 42 insertions(+), 9 deletions(-)
>
> diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
> index 7e96ae6ffd..23181bbee1 100644
> --- a/migration/migration-hmp-cmds.c
> +++ b/migration/migration-hmp-cmds.c
> @@ -466,7 +466,7 @@ void hmp_migrate_incoming(Monitor *mon, const QDict *qdict)
>      }
>      QAPI_LIST_PREPEND(caps, g_steal_pointer(&channel));
>  
> -    qmp_migrate_incoming(NULL, true, caps, &err);
> +    qmp_migrate_incoming(NULL, true, caps, true, false, &err);
>      qapi_free_MigrationChannelList(caps);
>  
>  end:
> diff --git a/migration/migration.c b/migration/migration.c
> index 0b15f7ccf4..5cfe420a76 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -72,6 +72,8 @@
>  #define NOTIFIER_ELEM_INIT(array, elem)    \
>      [elem] = NOTIFIER_WITH_RETURN_LIST_INITIALIZER((array)[elem])
>  
> +#define INMIGRATE_DEFAULT_EXIT_ON_ERROR true
> +
>  static NotifierWithReturnList migration_state_notifiers[] = {
>      NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL),
>      NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT),
> @@ -234,6 +236,8 @@ void migration_object_init(void)
>      qemu_cond_init(&current_incoming->page_request_cond);
>      current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
>  
> +    current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
> +
>      migration_object_check(current_migration, &error_fatal);
>  
>      blk_mig_init();
> @@ -797,13 +801,18 @@ fail:
>                        MIGRATION_STATUS_FAILED);
>      migration_incoming_state_destroy();
>  
> -    if (migrate_has_error(s)) {
> -        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
> -            error_report_err(s->error);
> +    if (mis->exit_on_error) {
> +        if (migrate_has_error(s)) {
> +            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
> +                error_report_err(s->error);

error_report_err(error_copy(s->error))

...because later on you're reading from s->error at
fill_destination_migration_info.

> +            }
>          }
> +        error_report_err(local_err);

migrate_error_free(s);

> +        exit(EXIT_FAILURE);
> +    } else {
> +        migrate_set_error(s, local_err);
> +        error_free(local_err);
>      }
> -    error_report_err(local_err);
> -    exit(EXIT_FAILURE);
>  }
>  
>  /**
> @@ -1312,6 +1321,15 @@ static void fill_destination_migration_info(MigrationInfo *info)
>          break;
>      }
>      info->status = mis->state;
> +
> +    if (!info->error_desc) {
> +        MigrationState *s = migrate_get_current();
> +        QEMU_LOCK_GUARD(&s->error_mutex);
> +
> +        if (s->error) {
> +            info->error_desc = g_strdup(error_get_pretty(s->error));
> +        }
> +    }
>  }
>  
>  MigrationInfo *qmp_query_migrate(Error **errp)
> @@ -1795,10 +1813,13 @@ void migrate_del_blocker(Error **reasonp)
>  }
>  
>  void qmp_migrate_incoming(const char *uri, bool has_channels,
> -                          MigrationChannelList *channels, Error **errp)
> +                          MigrationChannelList *channels,
> +                          bool has_exit_on_error, bool exit_on_error,
> +                          Error **errp)
>  {
>      Error *local_err = NULL;
>      static bool once = true;
> +    MigrationIncomingState *mis = migration_incoming_get_current();
>  
>      if (!once) {
>          error_setg(errp, "The incoming migration has already been started");
> @@ -1813,6 +1834,9 @@ void qmp_migrate_incoming(const char *uri, bool has_channels,
>          return;
>      }
>  
> +    mis->exit_on_error =
> +        has_exit_on_error ? exit_on_error : INMIGRATE_DEFAULT_EXIT_ON_ERROR;
> +
>      qemu_start_incoming_migration(uri, has_channels, channels, &local_err);
>  
>      if (local_err) {
> diff --git a/migration/migration.h b/migration/migration.h
> index 8045e39c26..95995a818e 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -227,6 +227,9 @@ struct MigrationIncomingState {
>       * is needed as this field is updated serially.
>       */
>      unsigned int switchover_ack_pending_num;
> +
> +    /* Do exit on incoming migration failure */
> +    bool exit_on_error;
>  };
>  
>  MigrationIncomingState *migration_incoming_get_current(void);
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 8c65b90328..9de8b98d0b 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -1837,6 +1837,10 @@
>  # @channels: list of migration stream channels with each stream in the
>  #     list connected to a destination interface endpoint.
>  #
> +# @exit-on-error: Do exit on incoming migration failure.  Default true.
> +#     When set to false, the error is reported by MIGRATION event and
> +#     error could be retrieved by query-migrate command.  (since 9.1)
> +#
>  # Since: 2.3
>  #
>  # Notes:
> @@ -1889,7 +1893,8 @@
>  ##
>  { 'command': 'migrate-incoming',
>               'data': {'*uri': 'str',
> -                      '*channels': [ 'MigrationChannel' ] } }
> +                      '*channels': [ 'MigrationChannel' ],
> +                      '*exit-on-error': 'bool' } }
>  
>  ##
>  # @xen-save-devices-state:
> diff --git a/system/vl.c b/system/vl.c
> index c644222982..3bad81b0a4 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -2720,7 +2720,8 @@ void qmp_x_exit_preconfig(Error **errp)
>      if (incoming) {
>          Error *local_err = NULL;
>          if (strcmp(incoming, "defer") != 0) {
> -            qmp_migrate_incoming(incoming, false, NULL, &local_err);
> +            qmp_migrate_incoming(incoming, false, NULL, true, true,
> +                                 &local_err);
>              if (local_err) {
>                  error_reportf_err(local_err, "-incoming %s: ", incoming);
>                  exit(1);
Markus Armbruster April 26, 2024, 5:17 a.m. UTC | #2
Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> writes:

> Now we do set MIGRATION_FAILED state, but don't give a chance to
> orchestrator to query migration state and get the error.
>
> Let's provide a possibility for QMP-based orchestrators to get an error
> like with outgoing migration.
>
> For hmp_migrate_incoming(), let's enable the new behavior: HMP is not
> and ABI, it's mostly intended to use by developer and it makes sense
> not to stop the process.
>
> For x-exit-preconfig, let's keep the old behavior:
>  - it's called from init(), so here we want to keep current behavior by
>    default
>  - it does exit on error by itself as well
> So, if we want to change the behavior of x-exit-preconfig, it should be
> another patch.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>

[...]

> diff --git a/qapi/migration.json b/qapi/migration.json
> index 8c65b90328..9de8b98d0b 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -1837,6 +1837,10 @@
>  # @channels: list of migration stream channels with each stream in the
>  #     list connected to a destination interface endpoint.
>  #
> +# @exit-on-error: Do exit on incoming migration failure.  Default true.

Scratch "Do"?

> +#     When set to false, the error is reported by MIGRATION event and

Comma before "and", please.

Suggest "the failure triggers a MIGRATION event".

> +#     error could be retrieved by query-migrate command.  (since 9.1)

"error details could be retrieved with query-migrate", perhaps?

> +#
>  # Since: 2.3
>  #
>  # Notes:
> @@ -1889,7 +1893,8 @@
>  ##
>  { 'command': 'migrate-incoming',
>               'data': {'*uri': 'str',
> -                      '*channels': [ 'MigrationChannel' ] } }
> +                      '*channels': [ 'MigrationChannel' ],
> +                      '*exit-on-error': 'bool' } }
>  
>  ##
>  # @xen-save-devices-state:

QAPI schema
Acked-by: Markus Armbruster <armbru@redhat.com>

[...]
Vladimir Sementsov-Ogievskiy April 29, 2024, 7:45 a.m. UTC | #3
On 25.04.24 23:30, Fabiano Rosas wrote:
>> @@ -797,13 +801,18 @@ fail:
>>                         MIGRATION_STATUS_FAILED);
>>       migration_incoming_state_destroy();
>>   
>> -    if (migrate_has_error(s)) {
>> -        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>> -            error_report_err(s->error);
>> +    if (mis->exit_on_error) {
>> +        if (migrate_has_error(s)) {
>> +            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>> +                error_report_err(s->error);
> error_report_err(error_copy(s->error))
> 
> ...because later on you're reading from s->error at
> fill_destination_migration_info.

No, we immediately call exit() instead. That's just preexisting behavior, moved into "if (mis->exit_on_error)".
Fabiano Rosas April 29, 2024, 1:06 p.m. UTC | #4
Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> writes:

> On 25.04.24 23:30, Fabiano Rosas wrote:
>>> @@ -797,13 +801,18 @@ fail:
>>>                         MIGRATION_STATUS_FAILED);
>>>       migration_incoming_state_destroy();
>>>   
>>> -    if (migrate_has_error(s)) {
>>> -        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>>> -            error_report_err(s->error);
>>> +    if (mis->exit_on_error) {
>>> +        if (migrate_has_error(s)) {
>>> +            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>>> +                error_report_err(s->error);
>> error_report_err(error_copy(s->error))
>> 
>> ...because later on you're reading from s->error at
>> fill_destination_migration_info.
>
> No, we immediately do exit() instead. That's just a preexisting behavior, moved into "if (mis->exit_on_error)"

I meant later in the patch, not later in the execution. Can't
query-migrate be called during process_incoming_migration_co?
Vladimir Sementsov-Ogievskiy April 29, 2024, 3:03 p.m. UTC | #5
On 29.04.24 16:06, Fabiano Rosas wrote:
> Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> writes:
> 
>> On 25.04.24 23:30, Fabiano Rosas wrote:
>>>> @@ -797,13 +801,18 @@ fail:
>>>>                          MIGRATION_STATUS_FAILED);
>>>>        migration_incoming_state_destroy();
>>>>    
>>>> -    if (migrate_has_error(s)) {
>>>> -        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>>>> -            error_report_err(s->error);
>>>> +    if (mis->exit_on_error) {
>>>> +        if (migrate_has_error(s)) {
>>>> +            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>>>> +                error_report_err(s->error);
>>> error_report_err(error_copy(s->error))
>>>
>>> ...because later on you're reading from s->error at
>>> fill_destination_migration_info.
>>
>> No, we immediately do exit() instead. That's just a preexisting behavior, moved into "if (mis->exit_on_error)"
> 
> I meant later in the patch, not later in the execution. Can't
> query-migrate be called during process_incoming_migration_co?

Hmm... On the one hand, there seems to be no reason to care about it right before exit(). On the other hand, we do care about taking error_mutex. And we do release it, which may trigger another critical section.

I'll try to touch up this thing.
diff mbox series

Patch

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 7e96ae6ffd..23181bbee1 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -466,7 +466,7 @@  void hmp_migrate_incoming(Monitor *mon, const QDict *qdict)
     }
     QAPI_LIST_PREPEND(caps, g_steal_pointer(&channel));
 
-    qmp_migrate_incoming(NULL, true, caps, &err);
+    qmp_migrate_incoming(NULL, true, caps, true, false, &err);
     qapi_free_MigrationChannelList(caps);
 
 end:
diff --git a/migration/migration.c b/migration/migration.c
index 0b15f7ccf4..5cfe420a76 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -72,6 +72,8 @@ 
 #define NOTIFIER_ELEM_INIT(array, elem)    \
     [elem] = NOTIFIER_WITH_RETURN_LIST_INITIALIZER((array)[elem])
 
+#define INMIGRATE_DEFAULT_EXIT_ON_ERROR true
+
 static NotifierWithReturnList migration_state_notifiers[] = {
     NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL),
     NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT),
@@ -234,6 +236,8 @@  void migration_object_init(void)
     qemu_cond_init(&current_incoming->page_request_cond);
     current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
 
+    current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
+
     migration_object_check(current_migration, &error_fatal);
 
     blk_mig_init();
@@ -797,13 +801,18 @@  fail:
                       MIGRATION_STATUS_FAILED);
     migration_incoming_state_destroy();
 
-    if (migrate_has_error(s)) {
-        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
-            error_report_err(s->error);
+    if (mis->exit_on_error) {
+        if (migrate_has_error(s)) {
+            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
+                error_report_err(s->error);
+            }
         }
+        error_report_err(local_err);
+        exit(EXIT_FAILURE);
+    } else {
+        migrate_set_error(s, local_err);
+        error_free(local_err);
     }
-    error_report_err(local_err);
-    exit(EXIT_FAILURE);
 }
 
 /**
@@ -1312,6 +1321,15 @@  static void fill_destination_migration_info(MigrationInfo *info)
         break;
     }
     info->status = mis->state;
+
+    if (!info->error_desc) {
+        MigrationState *s = migrate_get_current();
+        QEMU_LOCK_GUARD(&s->error_mutex);
+
+        if (s->error) {
+            info->error_desc = g_strdup(error_get_pretty(s->error));
+        }
+    }
 }
 
 MigrationInfo *qmp_query_migrate(Error **errp)
@@ -1795,10 +1813,13 @@  void migrate_del_blocker(Error **reasonp)
 }
 
 void qmp_migrate_incoming(const char *uri, bool has_channels,
-                          MigrationChannelList *channels, Error **errp)
+                          MigrationChannelList *channels,
+                          bool has_exit_on_error, bool exit_on_error,
+                          Error **errp)
 {
     Error *local_err = NULL;
     static bool once = true;
+    MigrationIncomingState *mis = migration_incoming_get_current();
 
     if (!once) {
         error_setg(errp, "The incoming migration has already been started");
@@ -1813,6 +1834,9 @@  void qmp_migrate_incoming(const char *uri, bool has_channels,
         return;
     }
 
+    mis->exit_on_error =
+        has_exit_on_error ? exit_on_error : INMIGRATE_DEFAULT_EXIT_ON_ERROR;
+
     qemu_start_incoming_migration(uri, has_channels, channels, &local_err);
 
     if (local_err) {
diff --git a/migration/migration.h b/migration/migration.h
index 8045e39c26..95995a818e 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -227,6 +227,9 @@  struct MigrationIncomingState {
      * is needed as this field is updated serially.
      */
     unsigned int switchover_ack_pending_num;
+
+    /* Do exit on incoming migration failure */
+    bool exit_on_error;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
diff --git a/qapi/migration.json b/qapi/migration.json
index 8c65b90328..9de8b98d0b 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1837,6 +1837,10 @@ 
 # @channels: list of migration stream channels with each stream in the
 #     list connected to a destination interface endpoint.
 #
+# @exit-on-error: Exit on incoming migration failure.  Default true.
+#     When set to false, the failure triggers a MIGRATION event, and
+#     error details could be retrieved with query-migrate.  (since 9.1)
+#
 # Since: 2.3
 #
 # Notes:
@@ -1889,7 +1893,8 @@ 
 ##
 { 'command': 'migrate-incoming',
              'data': {'*uri': 'str',
-                      '*channels': [ 'MigrationChannel' ] } }
+                      '*channels': [ 'MigrationChannel' ],
+                      '*exit-on-error': 'bool' } }
 
 ##
 # @xen-save-devices-state:
diff --git a/system/vl.c b/system/vl.c
index c644222982..3bad81b0a4 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -2720,7 +2720,8 @@  void qmp_x_exit_preconfig(Error **errp)
     if (incoming) {
         Error *local_err = NULL;
         if (strcmp(incoming, "defer") != 0) {
-            qmp_migrate_incoming(incoming, false, NULL, &local_err);
+            qmp_migrate_incoming(incoming, false, NULL, true, true,
+                                 &local_err);
             if (local_err) {
                 error_reportf_err(local_err, "-incoming %s: ", incoming);
                 exit(1);