diff mbox series

[v2,2/2] qapi: introduce exit-on-error paramter for migrate-incoming

Message ID 20240424174245.1237942-3-vsementsov@yandex-team.ru
State New
Headers show
Series migration: do not exit on incoming failure | expand

Commit Message

Vladimir Sementsov-Ogievskiy April 24, 2024, 5:42 p.m. UTC
Now we do set MIGRATION_FAILED state, but don't give a chance to
orchestrator to query migration state and get the error.

Let's provide a possibility for QMP-based orchestrators to get an error
like with outgoing migration.

For x-exit-preconfig we can enable new behavior unconditionally, as
it's an unstable command.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
---
 migration/migration-hmp-cmds.c |  2 +-
 migration/migration.c          | 31 +++++++++++++++++++++----------
 migration/migration.h          |  3 +++
 qapi/migration.json            |  7 ++++++-
 system/vl.c                    |  7 +------
 5 files changed, 32 insertions(+), 18 deletions(-)

Comments

Peter Xu April 24, 2024, 10:02 p.m. UTC | #1
On Wed, Apr 24, 2024 at 08:42:45PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Now we do set MIGRATION_FAILED state, but don't give a chance to
> orchestrator to query migration state and get the error.
> 
> Let's provide a possibility for QMP-based orchestrators to get an error
> like with outgoing migration.
> 
> For x-exit-preconfig we can enable new behavior unconditionally, as
> it's an unstable command.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
> ---
>  migration/migration-hmp-cmds.c |  2 +-
>  migration/migration.c          | 31 +++++++++++++++++++++----------
>  migration/migration.h          |  3 +++
>  qapi/migration.json            |  7 ++++++-
>  system/vl.c                    |  7 +------
>  5 files changed, 32 insertions(+), 18 deletions(-)
> 
> diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
> index 7e96ae6ffd..9c94a18029 100644
> --- a/migration/migration-hmp-cmds.c
> +++ b/migration/migration-hmp-cmds.c
> @@ -466,7 +466,7 @@ void hmp_migrate_incoming(Monitor *mon, const QDict *qdict)
>      }
>      QAPI_LIST_PREPEND(caps, g_steal_pointer(&channel));
>  
> -    qmp_migrate_incoming(NULL, true, caps, &err);
> +    qmp_migrate_incoming(NULL, true, caps, true, true, &err);

Hmm, to me HMP implies more on "this is a developer, and he/she can be
debugging stuff", in that case I'm thinking whether this would be a good
chance to let exit=false already?  So that developers can keep the qemu
instances when error occurs.

IOW, I'd think it's fine to break HMP as that's not an ABI, so we choose
whatever makes sense to us.

>      qapi_free_MigrationChannelList(caps);
>  
>  end:
> diff --git a/migration/migration.c b/migration/migration.c
> index 806b7b080b..fe72529ba7 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -234,6 +234,8 @@ void migration_object_init(void)
>      qemu_cond_init(&current_incoming->page_request_cond);
>      current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
>  
> +    current_incoming->exit_on_error = true;

Let's define a macro for the default value?  Then we can use it below too [1].

> +
>      migration_object_check(current_migration, &error_fatal);
>  
>      blk_mig_init();
> @@ -597,12 +599,15 @@ bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
>  
>  static void qemu_start_incoming_migration(const char *uri, bool has_channels,
>                                            MigrationChannelList *channels,
> +                                          bool exit_on_error,
>                                            Error **errp)
>  {
>      g_autoptr(MigrationChannel) channel = NULL;
>      MigrationAddress *addr = NULL;
>      MigrationIncomingState *mis = migration_incoming_get_current();
>  
> +    mis->exit_on_error = exit_on_error;

Maybe it's better to set this in qmp_migrate_incoming() directly?  As then
we guarantee qmp_migrate_recover() won't ever try to touch it.

> +
>      /*
>       * Having preliminary checks for uri and channel
>       */
> @@ -738,11 +743,12 @@ process_incoming_migration_co(void *opaque)
>      MigrationIncomingState *mis = migration_incoming_get_current();
>      PostcopyState ps;
>      int ret;
> +    Error *local_err = NULL;
>  
>      assert(mis->from_src_file);
>  
>      if (compress_threads_load_setup(mis->from_src_file)) {
> -        error_report("Failed to setup decompress threads");
> +        error_setg(&local_err, "Failed to setup decompress threads");
>          goto fail;
>      }
>  
> @@ -779,25 +785,26 @@ process_incoming_migration_co(void *opaque)
>      }
>  
>      if (ret < 0) {
> -        error_report("load of migration failed: %s", strerror(-ret));
> +        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
>          goto fail;
>      }
>  
>      if (colo_incoming_co() < 0) {
> +        error_setg(&local_err, "colo incoming failed");
>          goto fail;
>      }
>  
>      migration_bh_schedule(process_incoming_migration_bh, mis);
>      return;
>  fail:
> +    migrate_report_err(local_err);
>      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                        MIGRATION_STATUS_FAILED);
> -    qemu_fclose(mis->from_src_file);
> -
> -    multifd_recv_cleanup();
> -    compress_threads_load_cleanup();
> +    migration_incoming_state_destroy();

IMHO it'll be good to split such change into another patch.

>  
> -    exit(EXIT_FAILURE);
> +    if (mis->exit_on_error) {
> +        exit(EXIT_FAILURE);
> +    }

Instead of migrate_report_err(), I'd error_report() here.

if (exit_on_error) {
    // error_report...
    exit();
}

And perhaps dump nothing if don't exit?  Then the user/developer can check
using info/query migrate.

>  }
>  
>  /**
> @@ -1785,7 +1792,9 @@ void migrate_del_blocker(Error **reasonp)
>  }
>  
>  void qmp_migrate_incoming(const char *uri, bool has_channels,
> -                          MigrationChannelList *channels, Error **errp)
> +                          MigrationChannelList *channels,
> +                          bool has_exit_on_error, bool exit_on_error,
> +                          Error **errp)
>  {
>      Error *local_err = NULL;
>      static bool once = true;
> @@ -1803,7 +1812,9 @@ void qmp_migrate_incoming(const char *uri, bool has_channels,
>          return;
>      }
>  
> -    qemu_start_incoming_migration(uri, has_channels, channels, &local_err);
> +    qemu_start_incoming_migration(uri, has_channels, channels,
> +                                  has_exit_on_error ? exit_on_error : true,

[1]

> +                                  &local_err);
>  
>      if (local_err) {
>          yank_unregister_instance(MIGRATION_YANK_INSTANCE);
> @@ -1839,7 +1850,7 @@ void qmp_migrate_recover(const char *uri, Error **errp)
>       * only re-setup the migration stream and poke existing migration
>       * to continue using that newly established channel.
>       */
> -    qemu_start_incoming_migration(uri, false, NULL, errp);
> +    qemu_start_incoming_migration(uri, false, NULL, mis->exit_on_error, errp);

(if we set it in qmp_migrate_incoming, we shouldn't need this change)

Thanks,

>  }
>  
>  void qmp_migrate_pause(Error **errp)
> diff --git a/migration/migration.h b/migration/migration.h
> index f9f436c33e..089f710ee1 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -227,6 +227,9 @@ struct MigrationIncomingState {
>       * is needed as this field is updated serially.
>       */
>      unsigned int switchover_ack_pending_num;
> +
> +    /* Do exit on incoming migration failure */
> +    bool exit_on_error;
>  };
>  
>  MigrationIncomingState *migration_incoming_get_current(void);
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 8c65b90328..9de8b98d0b 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -1837,6 +1837,10 @@
>  # @channels: list of migration stream channels with each stream in the
>  #     list connected to a destination interface endpoint.
>  #
> +# @exit-on-error: Do exit on incoming migration failure.  Default true.
> +#     When set to false, the error is reported by MIGRATION event and
> +#     error could be retrieved by query-migrate command.  (since 9.1)
> +#
>  # Since: 2.3
>  #
>  # Notes:
> @@ -1889,7 +1893,8 @@
>  ##
>  { 'command': 'migrate-incoming',
>               'data': {'*uri': 'str',
> -                      '*channels': [ 'MigrationChannel' ] } }
> +                      '*channels': [ 'MigrationChannel' ],
> +                      '*exit-on-error': 'bool' } }
>  
>  ##
>  # @xen-save-devices-state:
> diff --git a/system/vl.c b/system/vl.c
> index c644222982..f5b4c18200 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -2718,13 +2718,8 @@ void qmp_x_exit_preconfig(Error **errp)
>      }
>  
>      if (incoming) {
> -        Error *local_err = NULL;
>          if (strcmp(incoming, "defer") != 0) {
> -            qmp_migrate_incoming(incoming, false, NULL, &local_err);
> -            if (local_err) {
> -                error_reportf_err(local_err, "-incoming %s: ", incoming);
> -                exit(1);
> -            }
> +            qmp_migrate_incoming(incoming, false, NULL, true, true, errp);
>          }
>      } else if (autostart) {
>          qmp_cont(NULL);
> -- 
> 2.34.1
>
diff mbox series

Patch

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 7e96ae6ffd..9c94a18029 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -466,7 +466,7 @@  void hmp_migrate_incoming(Monitor *mon, const QDict *qdict)
     }
     QAPI_LIST_PREPEND(caps, g_steal_pointer(&channel));
 
-    qmp_migrate_incoming(NULL, true, caps, &err);
+    qmp_migrate_incoming(NULL, true, caps, true, true, &err);
     qapi_free_MigrationChannelList(caps);
 
 end:
diff --git a/migration/migration.c b/migration/migration.c
index 806b7b080b..fe72529ba7 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -234,6 +234,8 @@  void migration_object_init(void)
     qemu_cond_init(&current_incoming->page_request_cond);
     current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
 
+    current_incoming->exit_on_error = true;
+
     migration_object_check(current_migration, &error_fatal);
 
     blk_mig_init();
@@ -597,12 +599,15 @@  bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
 
 static void qemu_start_incoming_migration(const char *uri, bool has_channels,
                                           MigrationChannelList *channels,
+                                          bool exit_on_error,
                                           Error **errp)
 {
     g_autoptr(MigrationChannel) channel = NULL;
     MigrationAddress *addr = NULL;
     MigrationIncomingState *mis = migration_incoming_get_current();
 
+    mis->exit_on_error = exit_on_error;
+
     /*
      * Having preliminary checks for uri and channel
      */
@@ -738,11 +743,12 @@  process_incoming_migration_co(void *opaque)
     MigrationIncomingState *mis = migration_incoming_get_current();
     PostcopyState ps;
     int ret;
+    Error *local_err = NULL;
 
     assert(mis->from_src_file);
 
     if (compress_threads_load_setup(mis->from_src_file)) {
-        error_report("Failed to setup decompress threads");
+        error_setg(&local_err, "Failed to setup decompress threads");
         goto fail;
     }
 
@@ -779,25 +785,26 @@  process_incoming_migration_co(void *opaque)
     }
 
     if (ret < 0) {
-        error_report("load of migration failed: %s", strerror(-ret));
+        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
         goto fail;
     }
 
     if (colo_incoming_co() < 0) {
+        error_setg(&local_err, "colo incoming failed");
         goto fail;
     }
 
     migration_bh_schedule(process_incoming_migration_bh, mis);
     return;
 fail:
+    migrate_report_err(local_err);
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_FAILED);
-    qemu_fclose(mis->from_src_file);
-
-    multifd_recv_cleanup();
-    compress_threads_load_cleanup();
+    migration_incoming_state_destroy();
 
-    exit(EXIT_FAILURE);
+    if (mis->exit_on_error) {
+        exit(EXIT_FAILURE);
+    }
 }
 
 /**
@@ -1785,7 +1792,9 @@  void migrate_del_blocker(Error **reasonp)
 }
 
 void qmp_migrate_incoming(const char *uri, bool has_channels,
-                          MigrationChannelList *channels, Error **errp)
+                          MigrationChannelList *channels,
+                          bool has_exit_on_error, bool exit_on_error,
+                          Error **errp)
 {
     Error *local_err = NULL;
     static bool once = true;
@@ -1803,7 +1812,9 @@  void qmp_migrate_incoming(const char *uri, bool has_channels,
         return;
     }
 
-    qemu_start_incoming_migration(uri, has_channels, channels, &local_err);
+    qemu_start_incoming_migration(uri, has_channels, channels,
+                                  has_exit_on_error ? exit_on_error : true,
+                                  &local_err);
 
     if (local_err) {
         yank_unregister_instance(MIGRATION_YANK_INSTANCE);
@@ -1839,7 +1850,7 @@  void qmp_migrate_recover(const char *uri, Error **errp)
      * only re-setup the migration stream and poke existing migration
      * to continue using that newly established channel.
      */
-    qemu_start_incoming_migration(uri, false, NULL, errp);
+    qemu_start_incoming_migration(uri, false, NULL, mis->exit_on_error, errp);
 }
 
 void qmp_migrate_pause(Error **errp)
diff --git a/migration/migration.h b/migration/migration.h
index f9f436c33e..089f710ee1 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -227,6 +227,9 @@  struct MigrationIncomingState {
      * is needed as this field is updated serially.
      */
     unsigned int switchover_ack_pending_num;
+
+    /* Do exit on incoming migration failure */
+    bool exit_on_error;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
diff --git a/qapi/migration.json b/qapi/migration.json
index 8c65b90328..9de8b98d0b 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1837,6 +1837,10 @@ 
 # @channels: list of migration stream channels with each stream in the
 #     list connected to a destination interface endpoint.
 #
+# @exit-on-error: Do exit on incoming migration failure.  Default true.
+#     When set to false, the error is reported by MIGRATION event and
+#     error could be retrieved by query-migrate command.  (since 9.1)
+#
 # Since: 2.3
 #
 # Notes:
@@ -1889,7 +1893,8 @@ 
 ##
 { 'command': 'migrate-incoming',
              'data': {'*uri': 'str',
-                      '*channels': [ 'MigrationChannel' ] } }
+                      '*channels': [ 'MigrationChannel' ],
+                      '*exit-on-error': 'bool' } }
 
 ##
 # @xen-save-devices-state:
diff --git a/system/vl.c b/system/vl.c
index c644222982..f5b4c18200 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -2718,13 +2718,8 @@  void qmp_x_exit_preconfig(Error **errp)
     }
 
     if (incoming) {
-        Error *local_err = NULL;
         if (strcmp(incoming, "defer") != 0) {
-            qmp_migrate_incoming(incoming, false, NULL, &local_err);
-            if (local_err) {
-                error_reportf_err(local_err, "-incoming %s: ", incoming);
-                exit(1);
-            }
+            qmp_migrate_incoming(incoming, false, NULL, true, true, errp);
         }
     } else if (autostart) {
         qmp_cont(NULL);