diff mbox

[RFC,6/6] migration: enable return path for precopy

Message ID 1495176212-14446-7-git-send-email-peterx@redhat.com
State New
Headers show

Commit Message

Peter Xu May 19, 2017, 6:43 a.m. UTC
Let this be a flag, default to on. Turn it off for <=2.9 versions.

After this patch, the return path will be on even for pre-copy migration as
long as the transport supports it, e.g., for socket-typed transports
including the "tcp|udp|unix" types.

This will naturally fix the bug mentioned below, where the destination failed
on migration but the source assumed it was successful - since now, even for
precopy, the source will wait for the destination's MIG_RP_MSG_SHUT signal,
which will carry the final migration status of the destination. Then, when
the destination fails at any point of the migration, the source will know it,
and it'll resume the VM instead of losing data.

Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1439147
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/hw/compat.h           |  4 ++++
 include/migration/migration.h |  3 +++
 migration/migration.c         | 15 ++++++++++++++-
 3 files changed, 21 insertions(+), 1 deletion(-)

Comments

Juan Quintela May 30, 2017, 3:59 p.m. UTC | #1
Peter Xu <peterx@redhat.com> wrote:
> Let this be a flag, default to on. Turn it off for <=2.9 versions.
>
> After this patch, return path will be on even for pre-copy migration as
> long as the transport support, e.g., for socket typed transport
> including "tcp|udp|unix" typed.
>
> This will naturally fix the bug mentioned below, when destination failed
> on migration but source assumed it was successful - since now even for
> precopy, source will wait for destination's MIG_RP_MSG_SHUT signal,
> which will carry the final migration status of destination. Then, when
> destination failed at any point of migration, source will know it, and
> it'll resume the VM instead of a data lost.
>
> Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1439147
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/hw/compat.h           |  4 ++++
>  include/migration/migration.h |  3 +++
>  migration/migration.c         | 15 ++++++++++++++-
>  3 files changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/include/hw/compat.h b/include/hw/compat.h
> index 55b1765..049457b 100644
> --- a/include/hw/compat.h
> +++ b/include/hw/compat.h
> @@ -6,6 +6,10 @@
>          .driver   = "pci-bridge",\
>          .property = "shpc",\
>          .value    = "off",\
> +    },{\
> +        .driver   = "migration",\
> +        .property = "return-path",\
> +        .value    = "off",\
>      },
>  
>  #define HW_COMPAT_2_8 \
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index 70710de..e44119c 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -169,6 +169,9 @@ typedef struct MigrationState {
>      int64_t colo_checkpoint_time;
>      QEMUTimer *colo_delay_timer;
>  
> +    /* Whether to try to enable return-path even for pre-copy */
> +    bool enable_return_path;
> +
>      /* The last error that occurred */
>      Error *error;
>  } MigrationState ;
> diff --git a/migration/migration.c b/migration/migration.c
> index 6df3483..16a856a 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2046,7 +2046,7 @@ static void *migration_thread(void *opaque)
>  static bool migrate_return_path_create(MigrationState *s)
>  {
>      /* Whether we should enable return path */
> -    bool enable_return_path = false;
> +    bool enable_return_path = s->enable_return_path;

As you can see in my suggestion for this piece of code, why not just add the
(s->enable_return_path &&) in the right place at the call?

Thanks, Juan.
Peter Xu May 31, 2017, 7:38 a.m. UTC | #2
On Tue, May 30, 2017 at 05:59:10PM +0200, Juan Quintela wrote:
> Peter Xu <peterx@redhat.com> wrote:
> > Let this be a flag, default to on. Turn it off for <=2.9 versions.
> >
> > After this patch, return path will be on even for pre-copy migration as
> > long as the transport support, e.g., for socket typed transport
> > including "tcp|udp|unix" typed.
> >
> > This will naturally fix the bug mentioned below, when destination failed
> > on migration but source assumed it was successful - since now even for
> > precopy, source will wait for destination's MIG_RP_MSG_SHUT signal,
> > which will carry the final migration status of destination. Then, when
> > destination failed at any point of migration, source will know it, and
> > it'll resume the VM instead of a data lost.
> >
> > Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1439147
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  include/hw/compat.h           |  4 ++++
> >  include/migration/migration.h |  3 +++
> >  migration/migration.c         | 15 ++++++++++++++-
> >  3 files changed, 21 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/hw/compat.h b/include/hw/compat.h
> > index 55b1765..049457b 100644
> > --- a/include/hw/compat.h
> > +++ b/include/hw/compat.h
> > @@ -6,6 +6,10 @@
> >          .driver   = "pci-bridge",\
> >          .property = "shpc",\
> >          .value    = "off",\
> > +    },{\
> > +        .driver   = "migration",\
> > +        .property = "return-path",\
> > +        .value    = "off",\
> >      },
> >  
> >  #define HW_COMPAT_2_8 \
> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > index 70710de..e44119c 100644
> > --- a/include/migration/migration.h
> > +++ b/include/migration/migration.h
> > @@ -169,6 +169,9 @@ typedef struct MigrationState {
> >      int64_t colo_checkpoint_time;
> >      QEMUTimer *colo_delay_timer;
> >  
> > +    /* Whether to try to enable return-path even for pre-copy */
> > +    bool enable_return_path;
> > +
> >      /* The last error that occurred */
> >      Error *error;
> >  } MigrationState ;
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 6df3483..16a856a 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -2046,7 +2046,7 @@ static void *migration_thread(void *opaque)
> >  static bool migrate_return_path_create(MigrationState *s)
> >  {
> >      /* Whether we should enable return path */
> > -    bool enable_return_path = false;
> > +    bool enable_return_path = s->enable_return_path;
> 
> As you can see on my suggestion for this piece of code, just add the
> ()s->enable_return_path &&) to the right place on the call?
> 
> Thanks, Juan.

Do you mean this?

    /*
     * Open the return path
     */
    if (migrate_postcopy_ram() || s->enable_return_path) {
        if (!migrate_return_path_create(s)) {
            migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                              MIGRATION_STATUS_FAILED);
            migrate_fd_cleanup(s);
            return;
        }
    }

Here what I wanted to achieve is that:

a. for postcopy, we should try to enable return path, and it must
   succeed

b. for the case when enable_return_path is set, we try to enable return
   path, but even if it failed, we can still continue

Could we really achieve (b) with the above code? Or is there anything I missed?

Thanks,
Juan Quintela May 31, 2017, 7:43 a.m. UTC | #3
Peter Xu <peterx@redhat.com> wrote:
> On Tue, May 30, 2017 at 05:59:10PM +0200, Juan Quintela wrote:
>> Peter Xu <peterx@redhat.com> wrote:
>> > Let this be a flag, default to on. Turn it off for <=2.9 versions.
>> >
>> > After this patch, return path will be on even for pre-copy migration as
>> > long as the transport support, e.g., for socket typed transport
>> > including "tcp|udp|unix" typed.
>> >
>> > This will naturally fix the bug mentioned below, when destination failed
>> > on migration but source assumed it was successful - since now even for
>> > precopy, source will wait for destination's MIG_RP_MSG_SHUT signal,
>> > which will carry the final migration status of destination. Then, when
>> > destination failed at any point of migration, source will know it, and
>> > it'll resume the VM instead of a data lost.
>> >
>> > Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1439147
>> > Signed-off-by: Peter Xu <peterx@redhat.com>
>> > ---
>> >  include/hw/compat.h           |  4 ++++
>> >  include/migration/migration.h |  3 +++
>> >  migration/migration.c         | 15 ++++++++++++++-
>> >  3 files changed, 21 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/include/hw/compat.h b/include/hw/compat.h
>> > index 55b1765..049457b 100644
>> > --- a/include/hw/compat.h
>> > +++ b/include/hw/compat.h
>> > @@ -6,6 +6,10 @@
>> >          .driver   = "pci-bridge",\
>> >          .property = "shpc",\
>> >          .value    = "off",\
>> > +    },{\
>> > +        .driver   = "migration",\
>> > +        .property = "return-path",\
>> > +        .value    = "off",\
>> >      },
>> >  
>> >  #define HW_COMPAT_2_8 \
>> > diff --git a/include/migration/migration.h b/include/migration/migration.h
>> > index 70710de..e44119c 100644
>> > --- a/include/migration/migration.h
>> > +++ b/include/migration/migration.h
>> > @@ -169,6 +169,9 @@ typedef struct MigrationState {
>> >      int64_t colo_checkpoint_time;
>> >      QEMUTimer *colo_delay_timer;
>> >  
>> > +    /* Whether to try to enable return-path even for pre-copy */
>> > +    bool enable_return_path;
>> > +
>> >      /* The last error that occurred */
>> >      Error *error;
>> >  } MigrationState ;
>> > diff --git a/migration/migration.c b/migration/migration.c
>> > index 6df3483..16a856a 100644
>> > --- a/migration/migration.c
>> > +++ b/migration/migration.c
>> > @@ -2046,7 +2046,7 @@ static void *migration_thread(void *opaque)
>> >  static bool migrate_return_path_create(MigrationState *s)
>> >  {
>> >      /* Whether we should enable return path */
>> > -    bool enable_return_path = false;
>> > +    bool enable_return_path = s->enable_return_path;
>> 
>> As you can see on my suggestion for this piece of code, just add the
>> ()s->enable_return_path &&) to the right place on the call?
>> 
>> Thanks, Juan.
>
> Do you mean this?
>
>     /*
>      * Open the return path
>      */
>     if (migrate_postcopy_ram() || s->enable_return_path) {
>         if (!migrate_return_path_create(s)) {
>             migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
>                               MIGRATION_STATUS_FAILED);
>             migrate_fd_cleanup(s);
>             return;
>         }
>     }

Yeap.

> Here what I wanted to achieve is that:
>
> a. for postcopy, we should try to enable return path, and it must
>    succeed
>
> b. for the case when enable_return_path is set, we try to enable return
>    path, but even if it failed, we can still continue
>
> Could we really achieve (b) if with above code? Or anything I missed?

If we set enable_return_path -> it should succeed, otherwise it makes no
sense, no?  We can try to remove the return path for some transports if
needed, but it makes no sense to enable a property that means:
"please, pretty please, enable it if you can"

if we are going to do it that way, then it is better to change the
property the other way around:

- disable_return_path: set for all old machine types

And not set for newer machine types, meaning that we just try.

What do you think?

Later, Juan.
Peter Xu May 31, 2017, 8:04 a.m. UTC | #4
On Wed, May 31, 2017 at 09:43:21AM +0200, Juan Quintela wrote:
> Peter Xu <peterx@redhat.com> wrote:
> > On Tue, May 30, 2017 at 05:59:10PM +0200, Juan Quintela wrote:
> >> Peter Xu <peterx@redhat.com> wrote:
> >> > Let this be a flag, default to on. Turn it off for <=2.9 versions.
> >> >
> >> > After this patch, return path will be on even for pre-copy migration as
> >> > long as the transport support, e.g., for socket typed transport
> >> > including "tcp|udp|unix" typed.
> >> >
> >> > This will naturally fix the bug mentioned below, when destination failed
> >> > on migration but source assumed it was successful - since now even for
> >> > precopy, source will wait for destination's MIG_RP_MSG_SHUT signal,
> >> > which will carry the final migration status of destination. Then, when
> >> > destination failed at any point of migration, source will know it, and
> >> > it'll resume the VM instead of a data lost.
> >> >
> >> > Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1439147
> >> > Signed-off-by: Peter Xu <peterx@redhat.com>
> >> > ---
> >> >  include/hw/compat.h           |  4 ++++
> >> >  include/migration/migration.h |  3 +++
> >> >  migration/migration.c         | 15 ++++++++++++++-
> >> >  3 files changed, 21 insertions(+), 1 deletion(-)
> >> >
> >> > diff --git a/include/hw/compat.h b/include/hw/compat.h
> >> > index 55b1765..049457b 100644
> >> > --- a/include/hw/compat.h
> >> > +++ b/include/hw/compat.h
> >> > @@ -6,6 +6,10 @@
> >> >          .driver   = "pci-bridge",\
> >> >          .property = "shpc",\
> >> >          .value    = "off",\
> >> > +    },{\
> >> > +        .driver   = "migration",\
> >> > +        .property = "return-path",\
> >> > +        .value    = "off",\
> >> >      },
> >> >  
> >> >  #define HW_COMPAT_2_8 \
> >> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> >> > index 70710de..e44119c 100644
> >> > --- a/include/migration/migration.h
> >> > +++ b/include/migration/migration.h
> >> > @@ -169,6 +169,9 @@ typedef struct MigrationState {
> >> >      int64_t colo_checkpoint_time;
> >> >      QEMUTimer *colo_delay_timer;
> >> >  
> >> > +    /* Whether to try to enable return-path even for pre-copy */
> >> > +    bool enable_return_path;
> >> > +
> >> >      /* The last error that occurred */
> >> >      Error *error;
> >> >  } MigrationState ;
> >> > diff --git a/migration/migration.c b/migration/migration.c
> >> > index 6df3483..16a856a 100644
> >> > --- a/migration/migration.c
> >> > +++ b/migration/migration.c
> >> > @@ -2046,7 +2046,7 @@ static void *migration_thread(void *opaque)
> >> >  static bool migrate_return_path_create(MigrationState *s)
> >> >  {
> >> >      /* Whether we should enable return path */
> >> > -    bool enable_return_path = false;
> >> > +    bool enable_return_path = s->enable_return_path;
> >> 
> >> As you can see on my suggestion for this piece of code, just add the
> >> ()s->enable_return_path &&) to the right place on the call?
> >> 
> >> Thanks, Juan.
> >
> > Do you mean this?
> >
> >     /*
> >      * Open the return path
> >      */
> >     if (migrate_postcopy_ram() || s->enable_return_path) {
> >         if (!migrate_return_path_create(s)) {
> >             migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
> >                               MIGRATION_STATUS_FAILED);
> >             migrate_fd_cleanup(s);
> >             return;
> >         }
> >     }
> 
> Yeap.
> 
> > Here what I wanted to achieve is that:
> >
> > a. for postcopy, we should try to enable return path, and it must
> >    succeed
> >
> > b. for the case when enable_return_path is set, we try to enable return
> >    path, but even if it failed, we can still continue
> >
> > Could we really achieve (b) if with above code? Or anything I missed?
> 
> if we enable_return_path -> it should success, otherwise it makes no
> sense, no?  We can try to remove the return path for some transports if
> needed, but it makes no sense to enable a property that means:
> "please, pretty please, enable it if you can"

(Indeed this is awkward... :)

> 
> if we are going to do it that way, then it is better to change the
> property the other way around:
> 
> - disable_return_path: set for all old machine types
> 
> And not set for newer machine types, meaning that we just try.
> 
> What do you think?

Both namings work for me.

The problem is that we cannot really force this as long as there is
any type of transport that does not support a return path. E.g., when
migrating to an "exec:" typed transport with a single-out IO stream. In
that case, if we fail the migration, it'll break the old behavior,
right? Then users of exec: typed transports would need to manually provide
enable_return_path=false to finally allow them to migrate.

(I think that's the most tricky point of this series...)

Thanks,
Juan Quintela May 31, 2017, 8:12 a.m. UTC | #5
Peter Xu <peterx@redhat.com> wrote:

...

>> > Here what I wanted to achieve is that:
>> >
>> > a. for postcopy, we should try to enable return path, and it must
>> >    succeed
>> >
>> > b. for the case when enable_return_path is set, we try to enable return
>> >    path, but even if it failed, we can still continue
>> >
>> > Could we really achieve (b) if with above code? Or anything I missed?
>> 
>> if we enable_return_path -> it should success, otherwise it makes no
>> sense, no?  We can try to remove the return path for some transports if
>> needed, but it makes no sense to enable a property that means:
>> "please, pretty please, enable it if you can"
>
> (Indeed this is awkward... :)
>
>> 
>> if we are going to do it that way, then it is better to change the
>> property the other way around:
>> 
>> - disable_return_path: set for all old machine types
>> 
>> And not set for newer machine types, meaning that we just try.
>> 
>> What do you think?
>
> Both namings work for me.
>
> The problem is that we cannot really force this as long as there is
> any type of transports that does not support return path. E.g., when
> migrating to an "exec:" typed transport with single-out IO stream. In
> that case, if we fail the migration, it'll break the old behavior,
> right? Then for exec: typed users they need to manually provide
> enable_return_path=false to finally allow them to migrate.
>
> (I think that's the most tricky point of this series...)

Then do what I said.  Disable it for old machine types.
And for new machine types, we just try our best.  What I object to is
having a property with a meaning of "perhaps enable return path".

And yes, representing a tri-state with a boolean is tricky O:-)

Later, Juan.
diff mbox

Patch

diff --git a/include/hw/compat.h b/include/hw/compat.h
index 55b1765..049457b 100644
--- a/include/hw/compat.h
+++ b/include/hw/compat.h
@@ -6,6 +6,10 @@ 
         .driver   = "pci-bridge",\
         .property = "shpc",\
         .value    = "off",\
+    },{\
+        .driver   = "migration",\
+        .property = "return-path",\
+        .value    = "off",\
     },
 
 #define HW_COMPAT_2_8 \
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 70710de..e44119c 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -169,6 +169,9 @@  typedef struct MigrationState {
     int64_t colo_checkpoint_time;
     QEMUTimer *colo_delay_timer;
 
+    /* Whether to try to enable return-path even for pre-copy */
+    bool enable_return_path;
+
     /* The last error that occurred */
     Error *error;
 } MigrationState ;
diff --git a/migration/migration.c b/migration/migration.c
index 6df3483..16a856a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2046,7 +2046,7 @@  static void *migration_thread(void *opaque)
 static bool migrate_return_path_create(MigrationState *s)
 {
     /* Whether we should enable return path */
-    bool enable_return_path = false;
+    bool enable_return_path = s->enable_return_path;
     /* Whether we should force its success */
     bool force_return_path = false;
 
@@ -2114,9 +2114,22 @@  static void migration_instance_init(Object *obj)
     ms->parameters.tls_hostname = g_strdup("");
 }
 
+static Property migration_properties[] = {
+    DEFINE_PROP_BOOL("return-path", MigrationState, enable_return_path, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void migration_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->props = migration_properties;
+}
+
 static const TypeInfo migration_type = {
     .name = TYPE_MIGRATION,
     .parent = TYPE_DEVICE,
+    .class_init = migration_class_init,
     .class_size = sizeof(MigrationClass),
     .instance_size = sizeof(MigrationState),
     .instance_init = migration_instance_init,