diff mbox series

[RFC,2/2] migration: abort on destination if switchover limit exceeded

Message ID 20240621143221.198784-3-elena.ufimtseva@oracle.com
State New
Headers show
Series migration: introduce strict SLA | expand

Commit Message

Elena Ufimtseva June 21, 2024, 2:32 p.m. UTC
During live migration, receive current downtime from source
and start a downtime timer. When the destination dowtime
and added source downtime exceeds downtime limit for more
than switchover limit, abort live migration on destination.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
---
 migration/migration.c  |  2 ++
 migration/migration.h  | 15 ++++++++++
 migration/savevm.c     | 68 ++++++++++++++++++++++++++++++++++++++++++
 migration/savevm.h     |  2 ++
 migration/trace-events |  3 ++
 5 files changed, 90 insertions(+)

Comments

Peter Xu June 24, 2024, 7:41 p.m. UTC | #1
On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote:
> During live migration, receive current downtime from source
> and start a downtime timer. When the destination dowtime
> and added source downtime exceeds downtime limit for more
> than switchover limit, abort live migration on destination.
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> ---
>  migration/migration.c  |  2 ++
>  migration/migration.h  | 15 ++++++++++
>  migration/savevm.c     | 68 ++++++++++++++++++++++++++++++++++++++++++
>  migration/savevm.h     |  2 ++
>  migration/trace-events |  3 ++
>  5 files changed, 90 insertions(+)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 5cc304d2db..64d7290997 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -240,6 +240,8 @@ void migration_object_init(void)
>      current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
>  
>      current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
> +    /* Downtime will start when source sends its current downtime. */
> +    current_incoming->downtime_start = 0;
>  
>      migration_object_check(current_migration, &error_fatal);
>  
> diff --git a/migration/migration.h b/migration/migration.h
> index aa56b70795..06f4ebe214 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -230,6 +230,21 @@ struct MigrationIncomingState {
>  
>      /* Do exit on incoming migration failure */
>      bool exit_on_error;
> +
> +    /* Initial downtime on destination set by MIG_CMD_SEND_SRC_DOWNTIME */
> +    uint64_t downtime_start;
> +    /*
> +     * Current donwtime on destination that initially set equal to source by
> +     * MIG_CMD_SEND_SRC_DOWNTIME, then updated by destination itself.
> +     */
> +    uint64_t downtime_now;

Is this needed?

> +    /*
> +     * Abort live migration on destination when current destination downtime
> +     * exceeds the abort_limit. abort_limit is being set by
> +     * MIG_CMD_SEND_SRC_DOWNTIME sent from source.
> +     */
> +    uint64_t abort_limit;

If we assume both QEMUs will be applied with the same cap/params, we may
not need this and we can directly read the parameter on dest.

> +    uint64_t src_downtime;
>  };
>  
>  MigrationIncomingState *migration_incoming_get_current(void);
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 031ab03915..f3b5ea98bf 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -90,6 +90,7 @@ enum qemu_vm_cmd {
>      MIG_CMD_ENABLE_COLO,       /* Enable COLO */
>      MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
>      MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
> +    MIG_CMD_SEND_SRC_DOWNTIME,    /* Send current downtime to dst */
>      MIG_CMD_MAX
>  };
>  
> @@ -109,6 +110,7 @@ static struct mig_cmd_args {
>      [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
>      [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
>      [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
> +    [MIG_CMD_SEND_SRC_DOWNTIME] = { .len = -1, .name = "SEND_SRC_DOWNTIME" },
>      [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
>  };
>  
> @@ -1218,6 +1220,18 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
>      qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
>  }
>  
> +void qemu_savevm_send_downtime(QEMUFile *f, int64_t abort_limit_ms,
> +                               int64_t source_downtime)
> +{
> +    uint64_t tmp[2];
> +    tmp[0] = cpu_to_be64(abort_limit_ms);
> +    tmp[1] = cpu_to_be64(source_downtime);
> +
> +    trace_qemu_savevm_send_downtime(abort_limit_ms, source_downtime);
> +    qemu_savevm_command_send(f, MIG_CMD_SEND_SRC_DOWNTIME,
> +                             16, (uint8_t *)tmp);
> +}
> +
>  bool qemu_savevm_state_blocked(Error **errp)
>  {
>      SaveStateEntry *se;
> @@ -1635,6 +1649,14 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
>          }
>      }
>  
> +    if (migrate_switchover_abort()) {
> +        MigrationState *s = migrate_get_current();
> +        uint64_t abort_limit_ms =
> +            s->parameters.downtime_limit + s->parameters.switchover_limit;
> +        qemu_savevm_send_downtime(f, abort_limit_ms,
> +                                  migration_get_current_downtime(s));
> +    }
> +
>      if (iterable_only) {
>          goto flush;
>      }
> @@ -1919,6 +1941,20 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
>      return 0;
>  }
>  
> +static int loadvm_handle_src_downtime(MigrationIncomingState *mis,
> +                                      uint16_t len)
> +{
> +    uint64_t src_abort_limit = qemu_get_be64(mis->from_src_file);
> +    uint64_t src_current_downtime = qemu_get_be64(mis->from_src_file);
> +
> +    mis->abort_limit = src_abort_limit;
> +    mis->src_downtime = src_current_downtime;
> +    mis->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

I guess this didn't count in the delay of sending this packet.  Since the
whole point of this feature will be "making sure downtime is less than xxx
or cancel migration", I think this might cause the real downtime still more
than expected.  Not sure how big a concern this is.

E.g., have you measured how a packet can be delayed when the socket
pipeline is mostly full, then queue this SRC_DOWNTIME message after all
that?  I think that's very possible the case here at switchover where src
is trying to dump as fast as possible.  I'm not sure whether it's easy to
measure, either..

> +
> +    trace_loadvm_handle_src_downtime(src_abort_limit, src_current_downtime);
> +    return 0;
> +}
> +
>  /* After postcopy we will be told to throw some pages away since they're
>   * dirty and will have to be demand fetched.  Must happen before CPU is
>   * started.
> @@ -2540,6 +2576,9 @@ static int loadvm_process_command(QEMUFile *f)
>  
>      case MIG_CMD_ENABLE_COLO:
>          return loadvm_process_enable_colo(mis);
> +
> +    case MIG_CMD_SEND_SRC_DOWNTIME:
> +        return loadvm_handle_src_downtime(mis, len);
>      }
>  
>      return 0;
> @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,
>          trace_vmstate_downtime_load("non-iterable", se->idstr,
>                                      se->instance_id, end_ts - start_ts);
>      }
> +    if (migrate_switchover_abort() && type == QEMU_VM_SECTION_FULL &&
> +        mis->downtime_start) {
> +        mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +        uint64_t dst_downtime = mis->downtime_now - mis->downtime_start;
> +        if (mis->src_downtime + dst_downtime >= mis->abort_limit) {
> +            error_report("Shutdown destination migration, migration abort_limit"
> +                         "(%lu ms) was reached.", mis->abort_limit);
> +            trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime,
> +                                             mis->src_downtime);
> +            return -EINVAL;
> +        }
> +    }
>  
>      if (!check_section_footer(f, se)) {
>          return -EINVAL;
> @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,
>                                      se->instance_id, end_ts - start_ts);
>      }
>  
> +    if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END &&
> +        mis->downtime_start) {
> +        mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +        uint64_t dst_downtime = mis->downtime_now - mis->downtime_start;
> +        if (mis->src_downtime + dst_downtime >= mis->abort_limit) {
> +            error_report("Shutdown destination migration, migration abort_limit (%lu ms)"
> +                          "was reached.", mis->abort_limit);
> +            trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime,
> +                                             mis->src_downtime);
> +            return -EINVAL;
> +        }
> +    }

So this traps both iteration and non-iteration phase.  What if the downtime
was caused by after these, or unluckily at the last non-iterator device
state?

After trying to think about it, I figured this is not easy to do right.
Also, I start to doubt whether it's definitely a good idea on having this
in the first place.

Firstly, I'm wondering how we should treat this new feature
v.s. downtime_limit.  It's about how the user should set both.

If this is about "cancel migration if downtime more than xxx",
then.. AFAICT that's exactly what downtime_limit was "designed" to be..
It's just that downtime_limit says the other way round, as: "don't
switchover if the downtime will be more than xxx".

Then, if the user has option to set both these parameters, what would be
the right thing to do?  Shouldn't they simply always set both parameters to
be the same value already?  But if so, what's the point on having two?

This caused me to think whether the 2nd parameter is needed at all, instead
whether we should simply make downtime_limit more accurate, so that it will
start to account more things than before.  It won't be accurate, but the
same case here: making this new feature "accurate" can also be very hard.

Thanks,
Joao Martins June 25, 2024, 11:38 a.m. UTC | #2
On 24/06/2024 20:41, Peter Xu wrote:
> On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote:
>> @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,
 >>      if (!check_section_footer(f, se)) {
>>          return -EINVAL;
>> @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,
>>                                      se->instance_id, end_ts - start_ts);
>>      }
>>  
>> +    if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END &&
>> +        mis->downtime_start) {
>> +        mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>> +        uint64_t dst_downtime = mis->downtime_now - mis->downtime_start;
>> +        if (mis->src_downtime + dst_downtime >= mis->abort_limit) {
>> +            error_report("Shutdown destination migration, migration abort_limit (%lu ms)"
>> +                          "was reached.", mis->abort_limit);
>> +            trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime,
>> +                                             mis->src_downtime);
>> +            return -EINVAL;
>> +        }
>> +    }
> 
> So this traps both iteration and non-iteration phase.  What if the downtime
> was caused by after these, or unluckily at the last non-iterator device
> state?
> 
> After trying to think about it, I figured this is not easy to do right.
> Also, I start to doubt whether it's definitely a good idea on having this
> in the first place.
> 
> Firstly, I'm wondering how we should treat this new feature
> v.s. downtime_limit.  It's about how the user should set both.
> 
> If this is about "cancel migration if downtime more than xxx",
> then.. AFAICT that's exactly what downtime_limit was "designed" to be..
> It's just that downtime_limit says the other way round, as: "don't
> switchover if the downtime will be more than xxx".
> 
> Then, if the user has option to set both these parameters, what would be
> the right thing to do?  Shouldn't they simply always set both parameters to
> be the same value already?  But if so, what's the point on having two?
> 
> This caused me to think whether the 2nd parameter is needed at all, instead
> whether we should simply make downtime_limit more accurate, so that it will
> start to account more things than before.  It won't be accurate, but the
> same case here: making this new feature "accurate" can also be very hard.
> 

The way I think about it is that current downtime-limit captures nicely the data
part as the only calculations it cares about is how much outstanding data it
sends to the destination (be it VF device state or RAM). This second parameter
captures what is *not* related to data, i.e. costs of hypervisor quiescing the
VM or added latencies in hypervisor *in addition* to sending outstanding data to
destination.

If we were to merge this all into a single parameter (aka downtime-limit) we are
possibility artificially increasing the downtime thanks to relaxing the
oustanding data part, as opposed to trying to capture the switchover cost (hence
the name the parameter) that can't be reflected in the former equation.

So I feel like this needs two parameters whereby this second new parameter
covers 'switchover cost' (hence the name) with current migration algorithm.

	Joao
Peter Xu June 25, 2024, 2:53 p.m. UTC | #3
On Tue, Jun 25, 2024 at 12:38:50PM +0100, Joao Martins wrote:
> On 24/06/2024 20:41, Peter Xu wrote:
> > On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote:
> >> @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,
>  >>      if (!check_section_footer(f, se)) {
> >>          return -EINVAL;
> >> @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,
> >>                                      se->instance_id, end_ts - start_ts);
> >>      }
> >>  
> >> +    if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END &&
> >> +        mis->downtime_start) {
> >> +        mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> >> +        uint64_t dst_downtime = mis->downtime_now - mis->downtime_start;
> >> +        if (mis->src_downtime + dst_downtime >= mis->abort_limit) {
> >> +            error_report("Shutdown destination migration, migration abort_limit (%lu ms)"
> >> +                          "was reached.", mis->abort_limit);
> >> +            trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime,
> >> +                                             mis->src_downtime);
> >> +            return -EINVAL;
> >> +        }
> >> +    }
> > 
> > So this traps both iteration and non-iteration phase.  What if the downtime
> > was caused by after these, or unluckily at the last non-iterator device
> > state?
> > 
> > After trying to think about it, I figured this is not easy to do right.
> > Also, I start to doubt whether it's definitely a good idea on having this
> > in the first place.
> > 
> > Firstly, I'm wondering how we should treat this new feature
> > v.s. downtime_limit.  It's about how the user should set both.
> > 
> > If this is about "cancel migration if downtime more than xxx",
> > then.. AFAICT that's exactly what downtime_limit was "designed" to be..
> > It's just that downtime_limit says the other way round, as: "don't
> > switchover if the downtime will be more than xxx".
> > 
> > Then, if the user has option to set both these parameters, what would be
> > the right thing to do?  Shouldn't they simply always set both parameters to
> > be the same value already?  But if so, what's the point on having two?
> > 
> > This caused me to think whether the 2nd parameter is needed at all, instead
> > whether we should simply make downtime_limit more accurate, so that it will
> > start to account more things than before.  It won't be accurate, but the
> > same case here: making this new feature "accurate" can also be very hard.
> > 
> 
> The way I think about it is that current downtime-limit captures nicely the data
> part as the only calculations it cares about is how much outstanding data it
> sends to the destination (be it VF device state or RAM). This second parameter
> captures what is *not* related to data, i.e. costs of hypervisor quiescing the
> VM or added latencies in hypervisor *in addition* to sending outstanding data to
> destination.
> 
> If we were to merge this all into a single parameter (aka downtime-limit) we are
> possibility artificially increasing the downtime thanks to relaxing the
> oustanding data part, as opposed to trying to capture the switchover cost (hence
> the name the parameter) that can't be reflected in the former equation.
> 
> So I feel like this needs two parameters whereby this second new parameter
> covers 'switchover cost' (hence the name) with current migration algorithm.

Then the question is how should we suggest the user to specify these two
parameters.

The cover letter used:

  migrate_set_parameter downtime-limit 300
  migrate_set_parameter switchover-limit 10

But it does look like a testing sample only and not valid in real world
setup, as logically the new limit should be larger than the old one,
afaict.  If the new limit is larger, how larger should it be?

So far I can understand how it works intenrally, but even with that I don't
know how I should set this parameter, e.g., when downtime_limit used to be
300ms, I'd say 500ms could be a suitable value, but is it?  In that case,
perhaps the 500ms is the "real" limit that I don't want a VM to be halted
for more than 500ms, but in that case the user should have already setup
downtime_limit to 500ms.

I also don't know how should an user understand all these details and
figure out how to set these.  Note that currently we definitely allow the
user to specify downtime_limit and it's so far understandable, even though
the user may assume that contains the whole blackout phase (that's also
what we wish it can achieve..), rather than the fact that it only takes
ram/vfio/... reported remaining data into account v.s. the bandwidth.

Maybe we could consider introducing the parameters/caps in some other form,
so that we can keep the downtime_limit to be "the total downtime", instead
of a new definition of downtime.  E.g., it's at least not fair indeed to
take the whole "downtime_limit" for data transfers, so maybe some ratio
parameter can be introduced to say how much portion of that downtime can be
used for data transfer, and then it might be ok to work with the new cap
introduced so that when total downtime is over the limit it will abort the
migration (but without a new "max downtime" definition which might be
confusing).

Said that, I also wonder whether you have thought about improving
downtime_limit itself.  It'll be definitely better if we simply don't
switchover at all if that won't make it.  IOW, I wonder whether there can
be case where user specifies these parameters, migration simply keeps
switching over and keep aborting, requiring a retry.  That's pretty
unwanted.  We may simply doesn't allow that switchover to happen at all.

Thanks,
Joao Martins June 25, 2024, 4:31 p.m. UTC | #4
On 25/06/2024 15:53, Peter Xu wrote:
> On Tue, Jun 25, 2024 at 12:38:50PM +0100, Joao Martins wrote:
>> On 24/06/2024 20:41, Peter Xu wrote:
>>> On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote:
>>>> @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,
>>  >>      if (!check_section_footer(f, se)) {
>>>>          return -EINVAL;
>>>> @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,
>>>>                                      se->instance_id, end_ts - start_ts);
>>>>      }
>>>>  
>>>> +    if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END &&
>>>> +        mis->downtime_start) {
>>>> +        mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>>>> +        uint64_t dst_downtime = mis->downtime_now - mis->downtime_start;
>>>> +        if (mis->src_downtime + dst_downtime >= mis->abort_limit) {
>>>> +            error_report("Shutdown destination migration, migration abort_limit (%lu ms)"
>>>> +                          "was reached.", mis->abort_limit);
>>>> +            trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime,
>>>> +                                             mis->src_downtime);
>>>> +            return -EINVAL;
>>>> +        }
>>>> +    }
>>>
>>> So this traps both iteration and non-iteration phase.  What if the downtime
>>> was caused by after these, or unluckily at the last non-iterator device
>>> state?
>>>
>>> After trying to think about it, I figured this is not easy to do right.
>>> Also, I start to doubt whether it's definitely a good idea on having this
>>> in the first place.
>>>
>>> Firstly, I'm wondering how we should treat this new feature
>>> v.s. downtime_limit.  It's about how the user should set both.
>>>
>>> If this is about "cancel migration if downtime more than xxx",
>>> then.. AFAICT that's exactly what downtime_limit was "designed" to be..
>>> It's just that downtime_limit says the other way round, as: "don't
>>> switchover if the downtime will be more than xxx".
>>>
>>> Then, if the user has option to set both these parameters, what would be
>>> the right thing to do?  Shouldn't they simply always set both parameters to
>>> be the same value already?  But if so, what's the point on having two?
>>>
>>> This caused me to think whether the 2nd parameter is needed at all, instead
>>> whether we should simply make downtime_limit more accurate, so that it will
>>> start to account more things than before.  It won't be accurate, but the
>>> same case here: making this new feature "accurate" can also be very hard.
>>>
>>
>> The way I think about it is that current downtime-limit captures nicely the data
>> part as the only calculations it cares about is how much outstanding data it
>> sends to the destination (be it VF device state or RAM). This second parameter
>> captures what is *not* related to data, i.e. costs of hypervisor quiescing the
>> VM or added latencies in hypervisor *in addition* to sending outstanding data to
>> destination.
>>
>> If we were to merge this all into a single parameter (aka downtime-limit) we are
>> possibility artificially increasing the downtime thanks to relaxing the
>> oustanding data part, as opposed to trying to capture the switchover cost (hence
>> the name the parameter) that can't be reflected in the former equation.
>>
>> So I feel like this needs two parameters whereby this second new parameter
>> covers 'switchover cost' (hence the name) with current migration algorithm.
> 
> Then the question is how should we suggest the user to specify these two
> parameters.
> 
> The cover letter used:
> 
>   migrate_set_parameter downtime-limit 300
>   migrate_set_parameter switchover-limit 10
> 
> But it does look like a testing sample only and not valid in real world
> setup, as logically the new limit should be larger than the old one,
> afaict.  If the new limit is larger, how larger should it be?
> 
> So far I can understand how it works intenrally, but even with that I don't
> know how I should set this parameter, e.g., when downtime_limit used to be
> 300ms, I'd say 500ms could be a suitable value, but is it?  In that case,
> perhaps the 500ms is the "real" limit that I don't want a VM to be halted
> for more than 500ms, but in that case the user should have already setup
> downtime_limit to 500ms.
> 

Yeap -- I think you're right that it presents a UX confusion on what should a
user should set.

> I also don't know how should an user understand all these details and
> figure out how to set these.  Note that currently we definitely allow the
> user to specify downtime_limit and it's so far understandable, even though
> the user may assume that contains the whole blackout phase (that's also
> what we wish it can achieve..), rather than the fact that it only takes
> ram/vfio/... reported remaining data into account v.s. the bandwidth.
> 
> Maybe we could consider introducing the parameters/caps in some other form,
> so that we can keep the downtime_limit to be "the total downtime", instead
> of a new definition of downtime.  E.g., it's at least not fair indeed to
> take the whole "downtime_limit" for data transfers, so maybe some ratio
> parameter can be introduced to say how much portion of that downtime can be
> used for data transfer 

I like this idea -- it fixes another problem that downtime-limit (solely) makes
the algorithm be too optimistic and just utilize the whole bandwidth (e.g.
migrating after the first dirty sync depending on downtime-limit)

e.g. "iterable-downtime" (or "precopy-downtime" for lack of a better idea) for
such proerty

Then when we set downtime-limit it can represent the whole thing including
blackout as it is the case today and we configure it by minimizing
iterable-downtime to give enough headroom for switchover.

>, and then it might be ok to work with the new cap
> introduced so that when total downtime is over the limit it will abort the
> migration (but without a new "max downtime" definition which might be
> confusing).
> 

Yes, improving 'downtime-limit' with a sub parameter above for RAM/state
downtime, this migration capability then becomes a boolean instead of a value
where it's more like 'downtime-limit-strict' and going above downtime limit is
enforced/aborted is also performed like these patches. To prevent essentially this:

> IOW, I wonder whether there can
> be case where user specifies these parameters, migration simply keeps
> switching over and keep aborting, requiring a retry.  That's pretty
> unwanted.  We may simply doesn't allow that switchover to happen at all.

(...)

> Said that, I also wonder whether you have thought about improving
> downtime_limit itself.  It'll be definitely better if we simply don't
> switchover at all if that won't make it.  

The device-state multifd scaling is a take on improving switchover phase, and we
will keep improving it whenever we find things... but the switchover itself
can't be 'precomputed' into a downtime number equation ahead of time to
encompass all possible latencies/costs. Part of the reason that at least we
couldn't think of a way besides this proposal here, which at the core it's meant
to bounds check switchover. Even without taking into account VFs/HW[0], it is
simply not considered how long it might take and giving some sort of downtime
buffer coupled with enforcement that can be enforced helps not violating
migration SLAs.

[0]
https://lore.kernel.org/qemu-devel/20230317081904.24389-1-xuchuangxclwt@bytedance.com/
Daniel P. Berrangé June 25, 2024, 6:37 p.m. UTC | #5
On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote:
> Then the question is how should we suggest the user to specify these two
> parameters.
> 
> The cover letter used:
> 
>   migrate_set_parameter downtime-limit 300
>   migrate_set_parameter switchover-limit 10

What this means is that in practice the total downtime limit
is 310 ms, however, expressing this as two parameters is
incredibly inflexible.

If the actual RAM transfer downtime only took 50 ms, then why
should the switchover downtime still be limited to 10ms, when
we've still got a budget of 250 ms that was unused.

IOW, if my VM tolerates a downtime of 310ms, then I want that
310ms spread across the RAM transfer downtime and switchover
downtime in *any* ratio. ALl that matters is the overall
completion time.

IMHO exposing 2 distinct parameters is horrible from a
usability POV.


With regards,
Daniel
Peter Xu June 25, 2024, 7:03 p.m. UTC | #6
On Tue, Jun 25, 2024 at 05:31:19PM +0100, Joao Martins wrote:
> The device-state multifd scaling is a take on improving switchover phase,
> and we will keep improving it whenever we find things... but the

That'll be helpful, thanks.  Just a quick note that "reducing downtime" is
a separate issue comparing to "make downtime_limit accurate".

> switchover itself can't be 'precomputed' into a downtime number equation
> ahead of time to encompass all possible latencies/costs. Part of the
> reason that at least we couldn't think of a way besides this proposal
> here, which at the core it's meant to bounds check switchover. Even
> without taking into account VFs/HW[0], it is simply not considered how
> long it might take and giving some sort of downtime buffer coupled with
> enforcement that can be enforced helps not violating migration SLAs.

I agree such enforcement alone can be useful in general to be able to
fallback.  Said that, I think it would definitely be nice to attach more
information on the downtime analysis when reposting this series, if there
is any.

For example, irrelevant of whether QEMU can do proper predictions at all,
there can be data / results to show what is the major parts that are
missing besides the current calculations, aka an expectation on when the
fallback can trigger, and some justification on why they can't be
predicted.

IMHO the enforcement won't make much sense if it keeps triggering, in that
case people will simply not use it as it stops migrations from happening.
Ultimately the work will still be needed to make downtime_limit accurate.
The fallback should only be an last fence to guard the promise which should
be the "corner cases".
Daniel P. Berrangé June 25, 2024, 7:12 p.m. UTC | #7
On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote:
> During live migration, receive current downtime from source
> and start a downtime timer. When the destination dowtime
> and added source downtime exceeds downtime limit for more
> than switchover limit, abort live migration on destination.
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> ---
>  migration/migration.c  |  2 ++
>  migration/migration.h  | 15 ++++++++++
>  migration/savevm.c     | 68 ++++++++++++++++++++++++++++++++++++++++++
>  migration/savevm.h     |  2 ++
>  migration/trace-events |  3 ++
>  5 files changed, 90 insertions(+)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 5cc304d2db..64d7290997 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -240,6 +240,8 @@ void migration_object_init(void)
>      current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
>  
>      current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
> +    /* Downtime will start when source sends its current downtime. */
> +    current_incoming->downtime_start = 0;
>  
>      migration_object_check(current_migration, &error_fatal);
>  
> diff --git a/migration/migration.h b/migration/migration.h
> index aa56b70795..06f4ebe214 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -230,6 +230,21 @@ struct MigrationIncomingState {
>  
>      /* Do exit on incoming migration failure */
>      bool exit_on_error;
> +
> +    /* Initial downtime on destination set by MIG_CMD_SEND_SRC_DOWNTIME */
> +    uint64_t downtime_start;
> +    /*
> +     * Current donwtime on destination that initially set equal to source by
> +     * MIG_CMD_SEND_SRC_DOWNTIME, then updated by destination itself.
> +     */
> +    uint64_t downtime_now;
> +    /*
> +     * Abort live migration on destination when current destination downtime
> +     * exceeds the abort_limit. abort_limit is being set by
> +     * MIG_CMD_SEND_SRC_DOWNTIME sent from source.
> +     */
> +    uint64_t abort_limit;
> +    uint64_t src_downtime;
>  };
>  
>  MigrationIncomingState *migration_incoming_get_current(void);
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 031ab03915..f3b5ea98bf 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -90,6 +90,7 @@ enum qemu_vm_cmd {
>      MIG_CMD_ENABLE_COLO,       /* Enable COLO */
>      MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
>      MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
> +    MIG_CMD_SEND_SRC_DOWNTIME,    /* Send current downtime to dst */
>      MIG_CMD_MAX
>  };
>  
> @@ -109,6 +110,7 @@ static struct mig_cmd_args {
>      [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
>      [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
>      [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
> +    [MIG_CMD_SEND_SRC_DOWNTIME] = { .len = -1, .name = "SEND_SRC_DOWNTIME" },
>      [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
>  };
>  
> @@ -1218,6 +1220,18 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
>      qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
>  }
>  
> +void qemu_savevm_send_downtime(QEMUFile *f, int64_t abort_limit_ms,
> +                               int64_t source_downtime)
> +{
> +    uint64_t tmp[2];
> +    tmp[0] = cpu_to_be64(abort_limit_ms);
> +    tmp[1] = cpu_to_be64(source_downtime);
> +
> +    trace_qemu_savevm_send_downtime(abort_limit_ms, source_downtime);
> +    qemu_savevm_command_send(f, MIG_CMD_SEND_SRC_DOWNTIME,
> +                             16, (uint8_t *)tmp);
> +}

Is having both the src and dst responsible to tracking downtime limit
introducing extra failure points ?

We know the src QEMU is reliably operating, but we don't know that the
dst QEMU will do so. If we give the dst responsibility for checking its
own downtime then that perhaps isn't robust aganist the dst deadlocking
or live-locking itself.

Is it better have the src exclusively responsible for tracking the
downtime limit, and have it send an "abort" message to the dst when
the limit is exceeded ? This process would require an explicit ack
from the src before the dst is permited to start its CPUs too.


With regards,
Daniel
Joao Martins June 26, 2024, 11:04 a.m. UTC | #8
On 25/06/2024 20:03, Peter Xu wrote:
> On Tue, Jun 25, 2024 at 05:31:19PM +0100, Joao Martins wrote:
>> The device-state multifd scaling is a take on improving switchover phase,
>> and we will keep improving it whenever we find things... but the
> 
> That'll be helpful, thanks.  Just a quick note that "reducing downtime" is
> a separate issue comparing to "make downtime_limit accurate".
> 
I see those two separately too; it's just that right now that's the only work I
know we can make it better is decreasing/optimizing it (other lines of work
doing similar stuff inside vDPA too, by Si-Wei for example). Making
downtime_limit accurate not so sure what it entails right now from your PoV. But
it depends on what this question really was about, see at the end in case I am
understanding you correctly.

>> switchover itself can't be 'precomputed' into a downtime number equation
>> ahead of time to encompass all possible latencies/costs. Part of the
>> reason that at least we couldn't think of a way besides this proposal
>> here, which at the core it's meant to bounds check switchover. Even
>> without taking into account VFs/HW[0], it is simply not considered how
>> long it might take and giving some sort of downtime buffer coupled with
>> enforcement that can be enforced helps not violating migration SLAs.
> 
> I agree such enforcement alone can be useful in general to be able to
> fallback.  Said that, I think it would definitely be nice to attach more
> information on the downtime analysis when reposting this series, if there
> is any.
> 
> For example, irrelevant of whether QEMU can do proper predictions at all,
> there can be data / results to show what is the major parts that are
> missing besides the current calculations, aka an expectation on when the
> fallback can trigger, and some justification on why they can't be
> predicted.
> 

/me nods -- I think this might be a gap in the current cover letter.

I recall we have looked at quite a few downtime traces (thanks to the tracing
improvements made in the last dev cycle!), but it's also easy to reproduce these
problems with downtime-limit even without past data with relatively simple configs.

> IMHO the enforcement won't make much sense if it keeps triggering, in that
> case people will simply not use it as it stops migrations from happening.

Right -- The enforcement *alone* damages more than it fixes. Meaning enforcing
without having some way to give some headroom within downtime-limit for
switchover to be accounted. The latter is what allows the enforcement to be
placed, otherwise we would just be failing migrations left and right.

> Ultimately the work will still be needed to make downtime_limit accurate.
> The fallback should only be an last fence to guard the promise which should
> be the "corner cases".
> 

Are you thinking in something specifically?

Many "variables" affect this from the point we decide switchover, and at the
worst (likely) case it means having qemu subsystems declare empirical values on
how long it takes to suspend/resume/transfer-state to migration expected
downtime prediction equation. Part of the reason that having headroom within
downtime-limit was a simple 'catch-all' (from our PoV) in terms of
maintainability while giving user something to fallback for characterizing its
SLA. Personally, I think there's a tiny bit disconnect between what the user
desires when setting downtime-limit vs what it really does. downtime-limit right
now looks to be best viewed as 'precopy-ram-downtime-limit' :)

Unless the accuracy work you're thinking is just having a better migration
algorithm at obtaining the best possible downtime for outstanding-data/RAM *even
if* downtime-limit is set at a high limit, like giving 1) a grace period in the
beginning of migration post first dirty sync or 2) a measured value with
continually incrementing target downtime limit until max downtime-limit set by
user hits ... before defaulting to the current behaviour of migrating as soon as
expected downtime is within the downtime-limit. As discussed in the last
response, this could create the 'downtime headroom' for getting the
enforcement/SLA better honored. Is this maybe your line of thinking?
Joao Martins June 26, 2024, 11:29 a.m. UTC | #9
On 25/06/2024 19:37, Daniel P. Berrangé wrote:
> On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote:
>> Then the question is how should we suggest the user to specify these two
>> parameters.
>>
>> The cover letter used:
>>
>>   migrate_set_parameter downtime-limit 300
>>   migrate_set_parameter switchover-limit 10
> 
> What this means is that in practice the total downtime limit
> is 310 ms, however, expressing this as two parameters is
> incredibly inflexible.
> 
> If the actual RAM transfer downtime only took 50 ms, then why
> should the switchover downtime still be limited to 10ms, when
> we've still got a budget of 250 ms that was unused.
> 

The downtime limit is 300, it's more than you are giving something *extra* 10ms
when you switchover regardless of where that's spent.

If it makes it easier to understand you could see this parameter as:

'downtime-limit-max-error' = 10 ms

The name as proposed by the RFC was meant to honor what the error margin was
meant for: to account for extra time during switchover. Adding this inside
downtime-limit wouldn't work as it otherwise would be used solely for RAM
transfer during precopy.

> IOW, if my VM tolerates a downtime of 310ms, then I want that
> 310ms spread across the RAM transfer downtime and switchover
> downtime in *any* ratio. ALl that matters is the overall
> completion time.
> 
That still happens with this patches, no specific budget is given to each.
Though implicitly if downtime-limit captures only RAM transfer, then in theory
if you're migrating a busy guest that happens to meet the SLA say
expected-downtime=290, then you have a total of 20 for switchover (thanks to the
extra 10 used in switchover-limit/downtime-limit-max-error 10).

But keep in mind that the migration prediction *does not* account for anything
other than RAM transfer. It happens that maybe your configuration is cheap, or
has been optimized enough over the years that you likely don't care or noticed
that it /could/ hurt the user designated SLA even if by a little.

Regards,
	Joao
Daniel P. Berrangé June 26, 2024, 11:34 a.m. UTC | #10
On Wed, Jun 26, 2024 at 12:29:41PM +0100, Joao Martins wrote:
> On 25/06/2024 19:37, Daniel P. Berrangé wrote:
> > On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote:
> >> Then the question is how should we suggest the user to specify these two
> >> parameters.
> >>
> >> The cover letter used:
> >>
> >>   migrate_set_parameter downtime-limit 300
> >>   migrate_set_parameter switchover-limit 10
> > 
> > What this means is that in practice the total downtime limit
> > is 310 ms, however, expressing this as two parameters is
> > incredibly inflexible.
> > 
> > If the actual RAM transfer downtime only took 50 ms, then why
> > should the switchover downtime still be limited to 10ms, when
> > we've still got a budget of 250 ms that was unused.
> > 
> 
> The downtime limit is 300, it's more than you are giving something *extra* 10ms
> when you switchover regardless of where that's spent.
> 
> If it makes it easier to understand you could see this parameter as:
> 
> 'downtime-limit-max-error' = 10 ms
> 
> The name as proposed by the RFC was meant to honor what the error margin was
> meant for: to account for extra time during switchover. Adding this inside
> downtime-limit wouldn't work as it otherwise would be used solely for RAM
> transfer during precopy.
> 
> > IOW, if my VM tolerates a downtime of 310ms, then I want that
> > 310ms spread across the RAM transfer downtime and switchover
> > downtime in *any* ratio. ALl that matters is the overall
> > completion time.
> > 
> That still happens with this patches, no specific budget is given to each.

If no specific budget is given to each, then IMHO adding the second
parameter is pointless & misleading. 

With regards,
Daniel
Joao Martins June 26, 2024, 12:12 p.m. UTC | #11
On 26/06/2024 12:34, Daniel P. Berrangé wrote:
> On Wed, Jun 26, 2024 at 12:29:41PM +0100, Joao Martins wrote:
>> On 25/06/2024 19:37, Daniel P. Berrangé wrote:
>>> On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote:
>>>> Then the question is how should we suggest the user to specify these two
>>>> parameters.
>>>>
>>>> The cover letter used:
>>>>
>>>>   migrate_set_parameter downtime-limit 300
>>>>   migrate_set_parameter switchover-limit 10
>>>
>>> What this means is that in practice the total downtime limit
>>> is 310 ms, however, expressing this as two parameters is
>>> incredibly inflexible.
>>>
>>> If the actual RAM transfer downtime only took 50 ms, then why
>>> should the switchover downtime still be limited to 10ms, when
>>> we've still got a budget of 250 ms that was unused.
>>>
>>
>> The downtime limit is 300, it's more than you are giving something *extra* 10ms
>> when you switchover regardless of where that's spent.
>>
>> If it makes it easier to understand you could see this parameter as:
>>
>> 'downtime-limit-max-error' = 10 ms
>>
>> The name as proposed by the RFC was meant to honor what the error margin was
>> meant for: to account for extra time during switchover. Adding this inside
>> downtime-limit wouldn't work as it otherwise would be used solely for RAM
>> transfer during precopy.
>>
>>> IOW, if my VM tolerates a downtime of 310ms, then I want that
>>> 310ms spread across the RAM transfer downtime and switchover
>>> downtime in *any* ratio. ALl that matters is the overall
>>> completion time.
>>>
>> That still happens with this patches, no specific budget is given to each.
> 
> If no specific budget is given to each, then IMHO adding the second
> parameter is pointless & misleading. 

That is contradictory with your earlier statement.

You redacted the part where I describe how this works in *the worst case* if the
entire downtime-limit is used for RAM transfer then the switchover-limit might
*implicitly* act as an budget:

| Though implicitly if downtime-limit captures only RAM transfer, then in theory
| if you're migrating a busy guest that happens to meet the SLA say
| expected-downtime=290, then you have a total of 20 for switchover (thanks to
| the extra 10 used in switchover-limit/downtime-limit-max-error 10).

I am confused with what to make here. If budget is bad because any ratio should
be used if available, but then the added parameter doesn't care about ratios
specifically but *can* act as switchover ratio when RAM dominates
downtime-limit. But now no budget is associated is also bad ... then what's your
middle ground from your point of view to tackle switchover downtime being
somehow accounted?
Daniel P. Berrangé June 26, 2024, 12:20 p.m. UTC | #12
On Wed, Jun 26, 2024 at 01:12:15PM +0100, Joao Martins wrote:
> On 26/06/2024 12:34, Daniel P. Berrangé wrote:
> > On Wed, Jun 26, 2024 at 12:29:41PM +0100, Joao Martins wrote:
> >> On 25/06/2024 19:37, Daniel P. Berrangé wrote:
> >>> On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote:
> >>>> Then the question is how should we suggest the user to specify these two
> >>>> parameters.
> >>>>
> >>>> The cover letter used:
> >>>>
> >>>>   migrate_set_parameter downtime-limit 300
> >>>>   migrate_set_parameter switchover-limit 10
> >>>
> >>> What this means is that in practice the total downtime limit
> >>> is 310 ms, however, expressing this as two parameters is
> >>> incredibly inflexible.
> >>>
> >>> If the actual RAM transfer downtime only took 50 ms, then why
> >>> should the switchover downtime still be limited to 10ms, when
> >>> we've still got a budget of 250 ms that was unused.
> >>>
> >>
> >> The downtime limit is 300, it's more than you are giving something *extra* 10ms
> >> when you switchover regardless of where that's spent.
> >>
> >> If it makes it easier to understand you could see this parameter as:
> >>
> >> 'downtime-limit-max-error' = 10 ms
> >>
> >> The name as proposed by the RFC was meant to honor what the error margin was
> >> meant for: to account for extra time during switchover. Adding this inside
> >> downtime-limit wouldn't work as it otherwise would be used solely for RAM
> >> transfer during precopy.
> >>
> >>> IOW, if my VM tolerates a downtime of 310ms, then I want that
> >>> 310ms spread across the RAM transfer downtime and switchover
> >>> downtime in *any* ratio. ALl that matters is the overall
> >>> completion time.
> >>>
> >> That still happens with this patches, no specific budget is given to each.
> > 
> > If no specific budget is given to each, then IMHO adding the second
> > parameter is pointless & misleading. 
> 
> That is contradictory with your earlier statement.

I don't think it is.

> You redacted the part where I describe how this works in *the worst case* if the
> entire downtime-limit is used for RAM transfer then the switchover-limit might
> *implicitly* act as an budget:
> 
> | Though implicitly if downtime-limit captures only RAM transfer, then in theory
> | if you're migrating a busy guest that happens to meet the SLA say
> | expected-downtime=290, then you have a total of 20 for switchover (thanks to
> | the extra 10 used in switchover-limit/downtime-limit-max-error 10).
> 
> I am confused with what to make here. If budget is bad because any ratio should
> be used if available, but then the added parameter doesn't care about ratios
> specifically but *can* act as switchover ratio when RAM dominates
> downtime-limit. But now no budget is associated is also bad ... then what's your
> middle ground from your point of view to tackle switchover downtime being
> somehow accounted?

The pre-existing 'downtime-limit' value should apply to anything that
happens between src CPUs stopping, and dst CPUs starting. If we were
not correctly accounting for some parts, we just need to fix that
accounting, without adding extra time parameters.

With regards,
Daniel
Peter Xu June 26, 2024, 6:41 p.m. UTC | #13
On Wed, Jun 26, 2024 at 12:04:43PM +0100, Joao Martins wrote:
> Are you thinking in something specifically?

Not really. I don't think I have any idea on how to make it better,
unfortunately, but we did some measurement too quite some time ago and I
can share some below.

> 
> Many "variables" affect this from the point we decide switchover, and at the
> worst (likely) case it means having qemu subsystems declare empirical values on
> how long it takes to suspend/resume/transfer-state to migration expected
> downtime prediction equation. Part of the reason that having headroom within
> downtime-limit was a simple 'catch-all' (from our PoV) in terms of
> maintainability while giving user something to fallback for characterizing its
> SLA.

Yes, I think this might be a way to go, by starting with something that can
catch all.

> Personally, I think there's a tiny bit disconnect between what the user
> desires when setting downtime-limit vs what it really does. downtime-limit right
> now looks to be best viewed as 'precopy-ram-downtime-limit' :)

That's fair to say indeed.. QEMU can try to do better on this, it's just
not yet straightforward to know how.

> Unless the accuracy work you're thinking is just having a better
> migration algorithm at obtaining the best possible downtime for
> outstanding-data/RAM *even if* downtime-limit is set at a high limit,
> like giving 1) a grace period in the beginning of migration post first
> dirty sync

Can you elaborate on this one a bit?

> or 2) a measured value with continually incrementing target downtime
> limit until max downtime-limit set by user hits ... before defaulting to
> the current behaviour of migrating as soon as expected downtime is within
> the downtime-limit. As discussed in the last response, this could create
> the 'downtime headroom' for getting the enforcement/SLA better
> honored. Is this maybe your line of thinking?

Not what I was referring, but I think such logic existed for years, it was
just not implemented in QEMU.  I know at least OpenStack implemented
exactly that, where instead of keeping an internal smaller downtime_limit
and keep increasing that, OpenStack will keep adjusting downtime_limit
parameter from time to time, starting with a relatively low value.

That is also what I would suggest to most people who cares about downtime,
because QEMU does treat it pretty simple: if QEMU thinks it can switchover
within the downtime specified, QEMU will just do it, even if it's not the
best it can do.

Do you think such idea should be instead implemented in QEMU, too?  Note
that this will also be not about "making downtime accurate", but "reducing
downtime", it may depend on how we define downtime_limit in the context,
perhaps, where in OpenStack's case it simply won't directly feed that
parameter with the real max downtime the user allows.

Since that wasn't my original purpose, what I meant is simply see ways to
make downtime_limit accurate, and by analyzing the current downtimes (as
you mentioned, using the downtime tracepoints; and I'd say kudos to you on
suggesting that in a formal patch).

Here's something we collected by our QE team, for example, on a pretty
loaded system of 384 cores + 12TB:

Checkpoints analysis:

            downtime-start ->               vm-stopped:             267635.2 (us)
                vm-stopped ->           iterable-saved:            3558506.2 (us)
            iterable-saved ->       non-iterable-saved:             270352.2 (us)
        non-iterable-saved ->             downtime-end:             144264.2 (us)
                                        total downtime:            4240758.0 (us)

Iterable device analysis:

  Device SAVE of                                      ram:  0 took    3470420 (us)

Non-iterable device analysis:

  Device SAVE of                                      cpu:121 took     118090 (us)
  Device SAVE of                                     apic:167 took       6899 (us)
  Device SAVE of                                      cpu:296 took       3795 (us)
  Device SAVE of             0000:00:02.2:00.0/virtio-blk:  0 took        638 (us)
  Device SAVE of                                      cpu:213 took        630 (us)
  Device SAVE of             0000:00:02.0:00.0/virtio-net:  0 took        534 (us)
  Device SAVE of                                      cpu:374 took        517 (us)
  Device SAVE of                                      cpu: 31 took        503 (us)
  Device SAVE of                                      cpu:346 took        497 (us)
  Device SAVE of             0000:00:02.0:00.1/virtio-net:  0 took        492 (us)
  (1341 vmsd omitted)

In this case we also see the SLA violations since we specified something
much lower than 4.2sec as downtime_limit.

This might not be a good example, as I think when capturing the traces we
used to still have the issue on inaccurate bw estimations, and that was why
I introduced switchover-bandwidth parameter, I wished after that the result
can be closer to downtime_limit but we never tried to test again.  I am not
sure either on whether that's the best way to address this.

But let's just ignore the iterable save() huge delays (which can be
explained, and hopefully will still be covered by downtime_limit
calculations when it can try to get closer to right), and we can also see
at least a few things we didn't account:

  - stop vm: 268ms
  - non-iterables: 270ms
  - dest load until complete: 144ms

For the last one, we did see another outlier where it can only be seen from
dest:

Non-iterable device analysis:

  Device LOAD of                              kvm-tpr-opt:  0 took     123976 (us)  <----- this one
  Device LOAD of              0000:00:02.0/pcie-root-port:  0 took       6362 (us)
  Device LOAD of             0000:00:02.0:00.0/virtio-net:  0 took       4583 (us)
  Device LOAD of             0000:00:02.0:00.1/virtio-net:  0 took       4440 (us)
  Device LOAD of                         0000:00:01.0/vga:  0 took       3740 (us)
  Device LOAD of                         0000:00:00.0/mch:  0 took       3557 (us)
  Device LOAD of             0000:00:02.2:00.0/virtio-blk:  0 took       3530 (us)
  Device LOAD of                   0000:00:02.1:00.0/xhci:  0 took       2712 (us)
  Device LOAD of              0000:00:02.1/pcie-root-port:  0 took       2046 (us)
  Device LOAD of              0000:00:02.2/pcie-root-port:  0 took       1890 (us)

So we found either cpu save() taking 100+ms, or kvm-tpr-opt load() taking
100+ms.  None of them sounds normal, and I didn't look into them.

Now with a global ratio perhaps start to reflect "how much ratio of
downtime_limit should we account into data transfer", then we'll also need
to answer how the user should set that ratio value, and maybe there's a
sane way to calculate that by the VM setup?  I'm not sure, but those
questions may need to be answered together in the next post, so that such
parameter can be consumable.

The answer doesn't need to be accurate, but I hope that can be based on
some similar analysis like above (where I didn't do it well; as I don't
think I looked into any of the issues, and maybe they're fix-able).  But
just to show what I meant.  It'll be also great when doing the analysis we
found issues fix-able, then it'll be great we fix the issues intead.
That's the part when I mentioned "I still prefer fixing downtime_limit
itself".

Thanks,
Elena Ufimtseva July 26, 2024, 7:41 a.m. UTC | #14
On Wed, Jun 26, 2024 at 02:41:34PM -0400, Peter Xu wrote:
> On Wed, Jun 26, 2024 at 12:04:43PM +0100, Joao Martins wrote:
> > Are you thinking in something specifically?
> 
> Not really. I don't think I have any idea on how to make it better,
> unfortunately, but we did some measurement too quite some time ago and I
> can share some below.


Hello Peter

I apologize for such a long delay with the reply.

> 
> > 
> > Many "variables" affect this from the point we decide switchover, and at the
> > worst (likely) case it means having qemu subsystems declare empirical values on
> > how long it takes to suspend/resume/transfer-state to migration expected
> > downtime prediction equation. Part of the reason that having headroom within
> > downtime-limit was a simple 'catch-all' (from our PoV) in terms of
> > maintainability while giving user something to fallback for characterizing its
> > SLA.
> 
> Yes, I think this might be a way to go, by starting with something that can
> catch all.


Possibly the title "strict SLA" is not the best choice of
words as it creates impression that the guarantees will be met.
But essentially this switchover limit is a safeguard against the unknowns
that can contribute to the downtime during the stop-copy and can be not
that easy to account for (or even impossible due to hardware
implementation or other issues).

To show what kind of statistics we see in our environments and what
are the main contributors please see below.

Example 1: host migration, default downtime set to 300:

Checkpoints analysis:

  checkpoint=src-downtime-start -> checkpoint=src-vm-stopped:                   74244 (us)
  checkpoint=src-vm-stopped -> checkpoint=src-iterable-saved:                   154493 (us)
  checkpoint=src-iterable-saved -> checkpoint=src-non-iterable-saved:           4746 (us)
  checkpoint=src-non-iterable-saved -> checkpoint=dst-precopy-loadvm-completed: 224981 (us)
  checkpoint=dst-precopy-loadvm-completed -> checkpoint=dst-precopy-bh-enter:   36 (us)
  checkpoint=dst-precopy-bh-enter -> checkpoint=dst-precopy-bh-announced:       7859 (us)
  checkpoint=dst-precopy-bh-announced -> checkpoint=dst-precopy-bh-vm-started:  15995 (us)
  checkpoint=dst-precopy-bh-vm-started -> checkpoint=src-downtime-end:          236 (us)

Iterable device analysis:

  Device SAVE of                                      ram:  0 took     151054 (us)
  Device LOAD of                                      ram:  0 took     146855 (us)
  Device SAVE of              0000:20:04.0:00.0:00.0/vfio:  0 took       2127 (us)
  Device LOAD of              0000:20:04.0:00.0:00.0/vfio:  0 took     144202 (us)

Non-iterable device analysis:

  Device LOAD of              0000:20:04.0:00.0:00.0/vfio:  0 took      67470 (us)
  Device LOAD of                         0000:00:01.0/vga:  0 took       7527 (us)
  Device LOAD of                      0000:00:02.0/e1000e:  0 took       1715 (us)
  Device LOAD of                              kvm-tpr-opt:  0 took       1697 (us)
  Device LOAD of                  0000:00:03.0/virtio-blk:  0 took       1340 (us)
  Device SAVE of                      0000:00:02.0/e1000e:  0 took       1036 (us)
  Device LOAD of                         0000:00:00.0/mch:  0 took       1035 (us)
  Device LOAD of         0000:20:04.0:00.0/pcie-root-port:  0 took        976 (us)
  Device LOAD of                     0000:00:1f.0/ICH9LPC:  0 took        851 (us)
  Device LOAD of                   0000:00:1f.2/ich9_ahci:  0 took        578 (us)

(qemu) info migrate
globals:
store-global-state: on
only-migratable: off
send-configuration: on
send-section-footer: on
decompress-error-check: on
clear-bitmap-shift: 18
Migration status: completed
total time: 5927 ms
downtime: 483 ms
setup: 78 ms
transferred ram: 883709 kbytes
throughput: 1237.71 mbps
remaining ram: 0 kbytes
total ram: 33571656 kbytes
duplicate: 8192488 pages
skipped: 0 pages
normal: 201300 pages
normal bytes: 805200 kbytes
dirty sync count: 3
page size: 4 kbytes
multifd bytes: 0 kbytes
pages-per-second: 958776
precopy ram: 480464 kbytes
downtime ram: 398313 kbytes
vfio device transferred: 4496 kbytes

Example 2: different system than above,  live migration over 100Gbit/s 
connection and 2 vfio virtual functions (the guest has no workload and
vfio devices are not engaged in VM and have same amount of data to live
migrate).

Displayed outliers that are larger than 3 us.

Save:
252812@1721976657.700972:vmstate_downtime_checkpoint src-downtime-start
252812@1721976657.829180:vmstate_downtime_checkpoint src-vm-stopped
252812@1721976657.967987:vmstate_downtime_save type=iterable idstr=ram instance_id=0 downtime=138005
252812@1721976658.093218:vmstate_downtime_save type=iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=125188
252812@1721976658.318101:vmstate_downtime_save type=iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=224857
252812@1721976658.318125:vmstate_downtime_checkpoint src-iterable-saved
...

Load:
353062@1721976488.995582:vmstate_downtime_load type=iterable idstr=ram instance_id=0 downtime=117294
353062@1721976489.235227:vmstate_downtime_load type=iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=239586
353062@1721976489.449736:vmstate_downtime_load type=iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=214462
353062@1721976489.463260:vmstate_downtime_load type=non-iterable idstr=0000:00:01.0/vga instance_id=0 downtime=7522
353062@1721976489.575383:vmstate_downtime_load type=non-iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=112113
353062@1721976489.686961:vmstate_downtime_load type=non-iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=111545
...

(qemu) info migrate
globals:
store-global-state: on
only-migratable: off
send-configuration: on
send-section-footer: on
decompress-error-check: on
clear-bitmap-shift: 18
Migration status: completed
total time: 23510 ms
downtime: 1018 ms
setup: 380 ms
transferred ram: 5317587 kbytes
throughput: 1883.34 mbps
remaining ram: 0 kbytes
total ram: 209732424 kbytes
duplicate: 51628634 pages
skipped: 0 pages
normal: 1159697 pages
normal bytes: 4638788 kbytes
dirty sync count: 4
page size: 4 kbytes
multifd bytes: 4653988 kbytes
pages-per-second: 1150272
precopy ram: 453652 kbytes
downtime ram: 118 kbytes
vfio device transferred: 209431 kbytes

As it can be seen from above, the downtime gets violated and the main
contributors are vfio devices. Also it can very depending on the
firmare version.
I have to note that in one of 10 tests, the ram downtime is being much
larger and becomes then the outlier. This is being investigated
currently.

This switchover overshoot is usually being reported as timed out queries.
And to comment on precision, yes, this overshoot safeguard is not
precise.
In fact, the current implementation is not precise and has a granularity of the savevm_
handlers as it would only check the downtime overshoot after it is being
completed. Maybe this part can be improved by delegating to some of the known
abusers to observe the downtime overshoot on its own.

> 
> > Personally, I think there's a tiny bit disconnect between what the user
> > desires when setting downtime-limit vs what it really does. downtime-limit right
> > now looks to be best viewed as 'precopy-ram-downtime-limit' :)
> 
> That's fair to say indeed.. QEMU can try to do better on this, it's just
> not yet straightforward to know how.

Could be that the better known part of the downtime (and predictable
taken the bandwidth is accurate), i.e. currently what is downtime-limit, serve
as a starting point and be named as ram-downtime-limit and everything
else would have switchover allowance of downtime_limit - ram-downtime-limit?
The correct value of ram-downtime-limit would take few iterations
after dirty sync to get established.

I think that is somewhat similar to what Joao is thinking below in 1) and 2)?

Thanks!

Elena
> 
> > Unless the accuracy work you're thinking is just having a better
> > migration algorithm at obtaining the best possible downtime for
> > outstanding-data/RAM *even if* downtime-limit is set at a high limit,
> > like giving 1) a grace period in the beginning of migration post first
> > dirty sync
> 
> Can you elaborate on this one a bit?

> 
> > or 2) a measured value with continually incrementing target downtime
> > limit until max downtime-limit set by user hits ... before defaulting to
> > the current behaviour of migrating as soon as expected downtime is within
> > the downtime-limit. As discussed in the last response, this could create
> > the 'downtime headroom' for getting the enforcement/SLA better
> > honored. Is this maybe your line of thinking?
>

> Not what I was referring, but I think such logic existed for years, it was
> just not implemented in QEMU.  I know at least OpenStack implemented
> exactly that, where instead of keeping an internal smaller downtime_limit
> and keep increasing that, OpenStack will keep adjusting downtime_limit
> parameter from time to time, starting with a relatively low value.
> 
> That is also what I would suggest to most people who cares about downtime,
> because QEMU does treat it pretty simple: if QEMU thinks it can switchover
> within the downtime specified, QEMU will just do it, even if it's not the
> best it can do.
> 
> Do you think such idea should be instead implemented in QEMU, too?  Note
> that this will also be not about "making downtime accurate", but "reducing
> downtime", it may depend on how we define downtime_limit in the context,
> perhaps, where in OpenStack's case it simply won't directly feed that
> parameter with the real max downtime the user allows.
> 
> Since that wasn't my original purpose, what I meant is simply see ways to
> make downtime_limit accurate, and by analyzing the current downtimes (as
> you mentioned, using the downtime tracepoints; and I'd say kudos to you on
> suggesting that in a formal patch).
> 
> Here's something we collected by our QE team, for example, on a pretty
> loaded system of 384 cores + 12TB:
> 
> Checkpoints analysis:
> 
>             downtime-start ->               vm-stopped:             267635.2 (us)
>                 vm-stopped ->           iterable-saved:            3558506.2 (us)
>             iterable-saved ->       non-iterable-saved:             270352.2 (us)
>         non-iterable-saved ->             downtime-end:             144264.2 (us)
>                                         total downtime:            4240758.0 (us)
> 
> Iterable device analysis:
> 
>   Device SAVE of                                      ram:  0 took    3470420 (us)
> 
> Non-iterable device analysis:
> 
>   Device SAVE of                                      cpu:121 took     118090 (us)
>   Device SAVE of                                     apic:167 took       6899 (us)
>   Device SAVE of                                      cpu:296 took       3795 (us)
>   Device SAVE of             0000:00:02.2:00.0/virtio-blk:  0 took        638 (us)
>   Device SAVE of                                      cpu:213 took        630 (us)
>   Device SAVE of             0000:00:02.0:00.0/virtio-net:  0 took        534 (us)
>   Device SAVE of                                      cpu:374 took        517 (us)
>   Device SAVE of                                      cpu: 31 took        503 (us)
>   Device SAVE of                                      cpu:346 took        497 (us)
>   Device SAVE of             0000:00:02.0:00.1/virtio-net:  0 took        492 (us)
>   (1341 vmsd omitted)
> 
> In this case we also see the SLA violations since we specified something
> much lower than 4.2sec as downtime_limit.
> 
> This might not be a good example, as I think when capturing the traces we
> used to still have the issue on inaccurate bw estimations, and that was why
> I introduced switchover-bandwidth parameter, I wished after that the result
> can be closer to downtime_limit but we never tried to test again.  I am not
> sure either on whether that's the best way to address this.
> 
> But let's just ignore the iterable save() huge delays (which can be
> explained, and hopefully will still be covered by downtime_limit
> calculations when it can try to get closer to right), and we can also see
> at least a few things we didn't account:
> 
>   - stop vm: 268ms
>   - non-iterables: 270ms
>   - dest load until complete: 144ms
> 
> For the last one, we did see another outlier where it can only be seen from
> dest:
> 
> Non-iterable device analysis:
> 
>   Device LOAD of                              kvm-tpr-opt:  0 took     123976 (us)  <----- this one
>   Device LOAD of              0000:00:02.0/pcie-root-port:  0 took       6362 (us)
>   Device LOAD of             0000:00:02.0:00.0/virtio-net:  0 took       4583 (us)
>   Device LOAD of             0000:00:02.0:00.1/virtio-net:  0 took       4440 (us)
>   Device LOAD of                         0000:00:01.0/vga:  0 took       3740 (us)
>   Device LOAD of                         0000:00:00.0/mch:  0 took       3557 (us)
>   Device LOAD of             0000:00:02.2:00.0/virtio-blk:  0 took       3530 (us)
>   Device LOAD of                   0000:00:02.1:00.0/xhci:  0 took       2712 (us)
>   Device LOAD of              0000:00:02.1/pcie-root-port:  0 took       2046 (us)
>   Device LOAD of              0000:00:02.2/pcie-root-port:  0 took       1890 (us)
> 
> So we found either cpu save() taking 100+ms, or kvm-tpr-opt load() taking
> 100+ms.  None of them sounds normal, and I didn't look into them.
>



> Now with a global ratio perhaps start to reflect "how much ratio of
> downtime_limit should we account into data transfer", then we'll also need
> to answer how the user should set that ratio value, and maybe there's a
> sane way to calculate that by the VM setup?  I'm not sure, but those
> questions may need to be answered together in the next post, so that such
> parameter can be consumable.

> 
> The answer doesn't need to be accurate, but I hope that can be based on
> some similar analysis like above (where I didn't do it well; as I don't
> think I looked into any of the issues, and maybe they're fix-able).  But
> just to show what I meant.  It'll be also great when doing the analysis we
> found issues fix-able, then it'll be great we fix the issues intead.
> That's the part when I mentioned "I still prefer fixing downtime_limit
> itself".
> 
> Thanks,
> 
> -- 
> Peter Xu
>
Peter Xu Aug. 21, 2024, 7:45 p.m. UTC | #15
On Fri, Jul 26, 2024 at 12:41:57AM -0700, Elena Ufimtseva wrote:
> On Wed, Jun 26, 2024 at 02:41:34PM -0400, Peter Xu wrote:
> > On Wed, Jun 26, 2024 at 12:04:43PM +0100, Joao Martins wrote:
> > > Are you thinking in something specifically?
> > 
> > Not really. I don't think I have any idea on how to make it better,
> > unfortunately, but we did some measurement too quite some time ago and I
> > can share some below.
> 
> 
> Hello Peter
> 
> I apologize for such a long delay with the reply.

I apologize back.. :)

> 
> > 
> > > 
> > > Many "variables" affect this from the point we decide switchover, and at the
> > > worst (likely) case it means having qemu subsystems declare empirical values on
> > > how long it takes to suspend/resume/transfer-state to migration expected
> > > downtime prediction equation. Part of the reason that having headroom within
> > > downtime-limit was a simple 'catch-all' (from our PoV) in terms of
> > > maintainability while giving user something to fallback for characterizing its
> > > SLA.
> > 
> > Yes, I think this might be a way to go, by starting with something that can
> > catch all.
> 
> 
> Possibly the title "strict SLA" is not the best choice of
> words as it creates impression that the guarantees will be met.
> But essentially this switchover limit is a safeguard against the unknowns
> that can contribute to the downtime during the stop-copy and can be not
> that easy to account for (or even impossible due to hardware
> implementation or other issues).
> 
> To show what kind of statistics we see in our environments and what
> are the main contributors please see below.
> 
> Example 1: host migration, default downtime set to 300:
> 
> Checkpoints analysis:
> 
>   checkpoint=src-downtime-start -> checkpoint=src-vm-stopped:                   74244 (us)
>   checkpoint=src-vm-stopped -> checkpoint=src-iterable-saved:                   154493 (us)
>   checkpoint=src-iterable-saved -> checkpoint=src-non-iterable-saved:           4746 (us)
>   checkpoint=src-non-iterable-saved -> checkpoint=dst-precopy-loadvm-completed: 224981 (us)

This is the first cross-host time diff, I assume the two hosts are
synchronized and the time diff is pretty low.

I guess part like this is not accounted in the switchover decision by src
QEMU, with the hope that after src QEMU sent everything with the estimated
bandwidth the dest QEMU should have (mostly) loaded everything.

It might be interesting to know when this can grow much larger than we
expect.  More below..

>   checkpoint=dst-precopy-loadvm-completed -> checkpoint=dst-precopy-bh-enter:   36 (us)
>   checkpoint=dst-precopy-bh-enter -> checkpoint=dst-precopy-bh-announced:       7859 (us)
>   checkpoint=dst-precopy-bh-announced -> checkpoint=dst-precopy-bh-vm-started:  15995 (us)
>   checkpoint=dst-precopy-bh-vm-started -> checkpoint=src-downtime-end:          236 (us)
> 
> Iterable device analysis:
> 
>   Device SAVE of                                      ram:  0 took     151054 (us)
>   Device LOAD of                                      ram:  0 took     146855 (us)
>   Device SAVE of              0000:20:04.0:00.0:00.0/vfio:  0 took       2127 (us)
>   Device LOAD of              0000:20:04.0:00.0:00.0/vfio:  0 took     144202 (us)
> 
> Non-iterable device analysis:
> 
>   Device LOAD of              0000:20:04.0:00.0:00.0/vfio:  0 took      67470 (us)
>   Device LOAD of                         0000:00:01.0/vga:  0 took       7527 (us)
>   Device LOAD of                      0000:00:02.0/e1000e:  0 took       1715 (us)
>   Device LOAD of                              kvm-tpr-opt:  0 took       1697 (us)
>   Device LOAD of                  0000:00:03.0/virtio-blk:  0 took       1340 (us)
>   Device SAVE of                      0000:00:02.0/e1000e:  0 took       1036 (us)
>   Device LOAD of                         0000:00:00.0/mch:  0 took       1035 (us)
>   Device LOAD of         0000:20:04.0:00.0/pcie-root-port:  0 took        976 (us)
>   Device LOAD of                     0000:00:1f.0/ICH9LPC:  0 took        851 (us)
>   Device LOAD of                   0000:00:1f.2/ich9_ahci:  0 took        578 (us)
> 
> (qemu) info migrate
> globals:
> store-global-state: on
> only-migratable: off
> send-configuration: on
> send-section-footer: on
> decompress-error-check: on
> clear-bitmap-shift: 18
> Migration status: completed
> total time: 5927 ms
> downtime: 483 ms
> setup: 78 ms
> transferred ram: 883709 kbytes
> throughput: 1237.71 mbps
> remaining ram: 0 kbytes
> total ram: 33571656 kbytes
> duplicate: 8192488 pages
> skipped: 0 pages
> normal: 201300 pages
> normal bytes: 805200 kbytes
> dirty sync count: 3
> page size: 4 kbytes
> multifd bytes: 0 kbytes
> pages-per-second: 958776
> precopy ram: 480464 kbytes
> downtime ram: 398313 kbytes
> vfio device transferred: 4496 kbytes
> 
> Example 2: different system than above,  live migration over 100Gbit/s 
> connection and 2 vfio virtual functions (the guest has no workload and
> vfio devices are not engaged in VM and have same amount of data to live
> migrate).
> 
> Displayed outliers that are larger than 3 us.
> 
> Save:
> 252812@1721976657.700972:vmstate_downtime_checkpoint src-downtime-start
> 252812@1721976657.829180:vmstate_downtime_checkpoint src-vm-stopped
> 252812@1721976657.967987:vmstate_downtime_save type=iterable idstr=ram instance_id=0 downtime=138005
> 252812@1721976658.093218:vmstate_downtime_save type=iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=125188
> 252812@1721976658.318101:vmstate_downtime_save type=iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=224857
> 252812@1721976658.318125:vmstate_downtime_checkpoint src-iterable-saved
> ...
> 
> Load:
> 353062@1721976488.995582:vmstate_downtime_load type=iterable idstr=ram instance_id=0 downtime=117294
> 353062@1721976489.235227:vmstate_downtime_load type=iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=239586
> 353062@1721976489.449736:vmstate_downtime_load type=iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=214462
> 353062@1721976489.463260:vmstate_downtime_load type=non-iterable idstr=0000:00:01.0/vga instance_id=0 downtime=7522
> 353062@1721976489.575383:vmstate_downtime_load type=non-iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=112113
> 353062@1721976489.686961:vmstate_downtime_load type=non-iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=111545
> ...
> 
> (qemu) info migrate
> globals:
> store-global-state: on
> only-migratable: off
> send-configuration: on
> send-section-footer: on
> decompress-error-check: on
> clear-bitmap-shift: 18
> Migration status: completed
> total time: 23510 ms
> downtime: 1018 ms
> setup: 380 ms
> transferred ram: 5317587 kbytes
> throughput: 1883.34 mbps
> remaining ram: 0 kbytes
> total ram: 209732424 kbytes
> duplicate: 51628634 pages
> skipped: 0 pages
> normal: 1159697 pages
> normal bytes: 4638788 kbytes
> dirty sync count: 4
> page size: 4 kbytes
> multifd bytes: 4653988 kbytes
> pages-per-second: 1150272
> precopy ram: 453652 kbytes
> downtime ram: 118 kbytes
> vfio device transferred: 209431 kbytes
> 
> As it can be seen from above, the downtime gets violated and the main
> contributors are vfio devices. Also it can very depending on the
> firmare version.

So have you figured out how VFIO affected this?  I thought VFIO will also
report pending information just like RAM, in ->state_pending_exact().

However I don't think such reports involves things like read()/write() over
the VFIO fds.  I assume that can be a major issue that won't be captured by
any form of pending data reports.  It might be useful to first identify /
verify this issue, then think about how to improve the estimation of
downtimes, e.g. by reporting "extra time" (in this case, the time to do
read() / write() from the device driver) needed for the final dump stage
for either iterative/non-iterative data.

> I have to note that in one of 10 tests, the ram downtime is being much
> larger and becomes then the outlier. This is being investigated
> currently.
> 
> This switchover overshoot is usually being reported as timed out queries.
> And to comment on precision, yes, this overshoot safeguard is not
> precise.
> In fact, the current implementation is not precise and has a granularity of the savevm_
> handlers as it would only check the downtime overshoot after it is being
> completed. Maybe this part can be improved by delegating to some of the known
> abusers to observe the downtime overshoot on its own.
> 
> > 
> > > Personally, I think there's a tiny bit disconnect between what the user
> > > desires when setting downtime-limit vs what it really does. downtime-limit right
> > > now looks to be best viewed as 'precopy-ram-downtime-limit' :)
> > 
> > That's fair to say indeed.. QEMU can try to do better on this, it's just
> > not yet straightforward to know how.
> 
> Could be that the better known part of the downtime (and predictable
> taken the bandwidth is accurate), i.e. currently what is downtime-limit, serve
> as a starting point and be named as ram-downtime-limit and everything
> else would have switchover allowance of downtime_limit - ram-downtime-limit?
> The correct value of ram-downtime-limit would take few iterations
> after dirty sync to get established.
> 
> I think that is somewhat similar to what Joao is thinking below in 1) and 2)?

I'm personally curious on whether you can nail the current issues down to
VFIO, and if so whether there can be something we can do already to improve
downtime estimation when VFIO is involved, irrelevant of the rest.

For a generic idea of improving downtime, I think as long as we stick with
downtime_limit parameter untouched (reflecting the total & max downtime),
then we can add some new ideas on top.

Thanks,

> 
> Thanks!
> 
> Elena
> > 
> > > Unless the accuracy work you're thinking is just having a better
> > > migration algorithm at obtaining the best possible downtime for
> > > outstanding-data/RAM *even if* downtime-limit is set at a high limit,
> > > like giving 1) a grace period in the beginning of migration post first
> > > dirty sync
> > 
> > Can you elaborate on this one a bit?
> 
> > 
> > > or 2) a measured value with continually incrementing target downtime
> > > limit until max downtime-limit set by user hits ... before defaulting to
> > > the current behaviour of migrating as soon as expected downtime is within
> > > the downtime-limit. As discussed in the last response, this could create
> > > the 'downtime headroom' for getting the enforcement/SLA better
> > > honored. Is this maybe your line of thinking?
> >
> 
> > Not what I was referring, but I think such logic existed for years, it was
> > just not implemented in QEMU.  I know at least OpenStack implemented
> > exactly that, where instead of keeping an internal smaller downtime_limit
> > and keep increasing that, OpenStack will keep adjusting downtime_limit
> > parameter from time to time, starting with a relatively low value.
> > 
> > That is also what I would suggest to most people who cares about downtime,
> > because QEMU does treat it pretty simple: if QEMU thinks it can switchover
> > within the downtime specified, QEMU will just do it, even if it's not the
> > best it can do.
> > 
> > Do you think such idea should be instead implemented in QEMU, too?  Note
> > that this will also be not about "making downtime accurate", but "reducing
> > downtime", it may depend on how we define downtime_limit in the context,
> > perhaps, where in OpenStack's case it simply won't directly feed that
> > parameter with the real max downtime the user allows.
> > 
> > Since that wasn't my original purpose, what I meant is simply see ways to
> > make downtime_limit accurate, and by analyzing the current downtimes (as
> > you mentioned, using the downtime tracepoints; and I'd say kudos to you on
> > suggesting that in a formal patch).
> > 
> > Here's something we collected by our QE team, for example, on a pretty
> > loaded system of 384 cores + 12TB:
> > 
> > Checkpoints analysis:
> > 
> >             downtime-start ->               vm-stopped:             267635.2 (us)
> >                 vm-stopped ->           iterable-saved:            3558506.2 (us)
> >             iterable-saved ->       non-iterable-saved:             270352.2 (us)
> >         non-iterable-saved ->             downtime-end:             144264.2 (us)
> >                                         total downtime:            4240758.0 (us)
> > 
> > Iterable device analysis:
> > 
> >   Device SAVE of                                      ram:  0 took    3470420 (us)
> > 
> > Non-iterable device analysis:
> > 
> >   Device SAVE of                                      cpu:121 took     118090 (us)
> >   Device SAVE of                                     apic:167 took       6899 (us)
> >   Device SAVE of                                      cpu:296 took       3795 (us)
> >   Device SAVE of             0000:00:02.2:00.0/virtio-blk:  0 took        638 (us)
> >   Device SAVE of                                      cpu:213 took        630 (us)
> >   Device SAVE of             0000:00:02.0:00.0/virtio-net:  0 took        534 (us)
> >   Device SAVE of                                      cpu:374 took        517 (us)
> >   Device SAVE of                                      cpu: 31 took        503 (us)
> >   Device SAVE of                                      cpu:346 took        497 (us)
> >   Device SAVE of             0000:00:02.0:00.1/virtio-net:  0 took        492 (us)
> >   (1341 vmsd omitted)
> > 
> > In this case we also see the SLA violations since we specified something
> > much lower than 4.2sec as downtime_limit.
> > 
> > This might not be a good example, as I think when capturing the traces we
> > used to still have the issue on inaccurate bw estimations, and that was why
> > I introduced switchover-bandwidth parameter, I wished after that the result
> > can be closer to downtime_limit but we never tried to test again.  I am not
> > sure either on whether that's the best way to address this.
> > 
> > But let's just ignore the iterable save() huge delays (which can be
> > explained, and hopefully will still be covered by downtime_limit
> > calculations when it can try to get closer to right), and we can also see
> > at least a few things we didn't account:
> > 
> >   - stop vm: 268ms
> >   - non-iterables: 270ms
> >   - dest load until complete: 144ms
> > 
> > For the last one, we did see another outlier where it can only be seen from
> > dest:
> > 
> > Non-iterable device analysis:
> > 
> >   Device LOAD of                              kvm-tpr-opt:  0 took     123976 (us)  <----- this one
> >   Device LOAD of              0000:00:02.0/pcie-root-port:  0 took       6362 (us)
> >   Device LOAD of             0000:00:02.0:00.0/virtio-net:  0 took       4583 (us)
> >   Device LOAD of             0000:00:02.0:00.1/virtio-net:  0 took       4440 (us)
> >   Device LOAD of                         0000:00:01.0/vga:  0 took       3740 (us)
> >   Device LOAD of                         0000:00:00.0/mch:  0 took       3557 (us)
> >   Device LOAD of             0000:00:02.2:00.0/virtio-blk:  0 took       3530 (us)
> >   Device LOAD of                   0000:00:02.1:00.0/xhci:  0 took       2712 (us)
> >   Device LOAD of              0000:00:02.1/pcie-root-port:  0 took       2046 (us)
> >   Device LOAD of              0000:00:02.2/pcie-root-port:  0 took       1890 (us)
> > 
> > So we found either cpu save() taking 100+ms, or kvm-tpr-opt load() taking
> > 100+ms.  None of them sounds normal, and I didn't look into them.
> >
> 
> 
> 
> > Now with a global ratio perhaps start to reflect "how much ratio of
> > downtime_limit should we account into data transfer", then we'll also need
> > to answer how the user should set that ratio value, and maybe there's a
> > sane way to calculate that by the VM setup?  I'm not sure, but those
> > questions may need to be answered together in the next post, so that such
> > parameter can be consumable.
> 
> > 
> > The answer doesn't need to be accurate, but I hope that can be based on
> > some similar analysis like above (where I didn't do it well; as I don't
> > think I looked into any of the issues, and maybe they're fix-able).  But
> > just to show what I meant.  It'll be also great when doing the analysis we
> > found issues fix-able, then it'll be great we fix the issues intead.
> > That's the part when I mentioned "I still prefer fixing downtime_limit
> > itself".
> > 
> > Thanks,
> > 
> > -- 
> > Peter Xu
> > 
>
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index 5cc304d2db..64d7290997 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -240,6 +240,8 @@  void migration_object_init(void)
     current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
 
     current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
+    /* Downtime will start when source sends its current downtime. */
+    current_incoming->downtime_start = 0;
 
     migration_object_check(current_migration, &error_fatal);
 
diff --git a/migration/migration.h b/migration/migration.h
index aa56b70795..06f4ebe214 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -230,6 +230,21 @@  struct MigrationIncomingState {
 
     /* Do exit on incoming migration failure */
     bool exit_on_error;
+
+    /* Initial downtime on destination set by MIG_CMD_SEND_SRC_DOWNTIME */
+    uint64_t downtime_start;
+    /*
+     * Current donwtime on destination that initially set equal to source by
+     * MIG_CMD_SEND_SRC_DOWNTIME, then updated by destination itself.
+     */
+    uint64_t downtime_now;
+    /*
+     * Abort live migration on destination when current destination downtime
+     * exceeds the abort_limit. abort_limit is being set by
+     * MIG_CMD_SEND_SRC_DOWNTIME sent from source.
+     */
+    uint64_t abort_limit;
+    uint64_t src_downtime;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
diff --git a/migration/savevm.c b/migration/savevm.c
index 031ab03915..f3b5ea98bf 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -90,6 +90,7 @@  enum qemu_vm_cmd {
     MIG_CMD_ENABLE_COLO,       /* Enable COLO */
     MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
     MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
+    MIG_CMD_SEND_SRC_DOWNTIME,    /* Send current downtime to dst */
     MIG_CMD_MAX
 };
 
@@ -109,6 +110,7 @@  static struct mig_cmd_args {
     [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
     [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
     [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
+    [MIG_CMD_SEND_SRC_DOWNTIME] = { .len = -1, .name = "SEND_SRC_DOWNTIME" },
     [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
 };
 
@@ -1218,6 +1220,18 @@  void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
     qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
 }
 
+void qemu_savevm_send_downtime(QEMUFile *f, int64_t abort_limit_ms,
+                               int64_t source_downtime)
+{
+    uint64_t tmp[2];
+    tmp[0] = cpu_to_be64(abort_limit_ms);
+    tmp[1] = cpu_to_be64(source_downtime);
+
+    trace_qemu_savevm_send_downtime(abort_limit_ms, source_downtime);
+    qemu_savevm_command_send(f, MIG_CMD_SEND_SRC_DOWNTIME,
+                             16, (uint8_t *)tmp);
+}
+
 bool qemu_savevm_state_blocked(Error **errp)
 {
     SaveStateEntry *se;
@@ -1635,6 +1649,14 @@  int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
         }
     }
 
+    if (migrate_switchover_abort()) {
+        MigrationState *s = migrate_get_current();
+        uint64_t abort_limit_ms =
+            s->parameters.downtime_limit + s->parameters.switchover_limit;
+        qemu_savevm_send_downtime(f, abort_limit_ms,
+                                  migration_get_current_downtime(s));
+    }
+
     if (iterable_only) {
         goto flush;
     }
@@ -1919,6 +1941,20 @@  static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
     return 0;
 }
 
+static int loadvm_handle_src_downtime(MigrationIncomingState *mis,
+                                      uint16_t len)
+{
+    uint64_t src_abort_limit = qemu_get_be64(mis->from_src_file);
+    uint64_t src_current_downtime = qemu_get_be64(mis->from_src_file);
+
+    mis->abort_limit = src_abort_limit;
+    mis->src_downtime = src_current_downtime;
+    mis->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+    trace_loadvm_handle_src_downtime(src_abort_limit, src_current_downtime);
+    return 0;
+}
+
 /* After postcopy we will be told to throw some pages away since they're
  * dirty and will have to be demand fetched.  Must happen before CPU is
  * started.
@@ -2540,6 +2576,9 @@  static int loadvm_process_command(QEMUFile *f)
 
     case MIG_CMD_ENABLE_COLO:
         return loadvm_process_enable_colo(mis);
+
+    case MIG_CMD_SEND_SRC_DOWNTIME:
+        return loadvm_handle_src_downtime(mis, len);
     }
 
     return 0;
@@ -2659,6 +2698,18 @@  qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,
         trace_vmstate_downtime_load("non-iterable", se->idstr,
                                     se->instance_id, end_ts - start_ts);
     }
+    if (migrate_switchover_abort() && type == QEMU_VM_SECTION_FULL &&
+        mis->downtime_start) {
+        mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+        uint64_t dst_downtime = mis->downtime_now - mis->downtime_start;
+        if (mis->src_downtime + dst_downtime >= mis->abort_limit) {
+            error_report("Shutdown destination migration, migration abort_limit"
+                         "(%lu ms) was reached.", mis->abort_limit);
+            trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime,
+                                             mis->src_downtime);
+            return -EINVAL;
+        }
+    }
 
     if (!check_section_footer(f, se)) {
         return -EINVAL;
@@ -2714,6 +2765,19 @@  qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,
                                     se->instance_id, end_ts - start_ts);
     }
 
+    if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END &&
+        mis->downtime_start) {
+        mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+        uint64_t dst_downtime = mis->downtime_now - mis->downtime_start;
+        if (mis->src_downtime + dst_downtime >= mis->abort_limit) {
+            error_report("Shutdown destination migration, migration abort_limit (%lu ms)"
+                          "was reached.", mis->abort_limit);
+            trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime,
+                                             mis->src_downtime);
+            return -EINVAL;
+        }
+    }
+
     if (!check_section_footer(f, se)) {
         return -EINVAL;
     }
@@ -2901,6 +2965,10 @@  retry:
         }
 
         trace_qemu_loadvm_state_section(section_type);
+        /* Start destination timer after we receive the downtime from source. */
+        if (mis->downtime_start) {
+            mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+        }
         switch (section_type) {
         case QEMU_VM_SECTION_START:
         case QEMU_VM_SECTION_FULL:
diff --git a/migration/savevm.h b/migration/savevm.h
index 9ec96a995c..1166c341f3 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -53,6 +53,8 @@  void qemu_savevm_send_postcopy_listen(QEMUFile *f);
 void qemu_savevm_send_postcopy_run(QEMUFile *f);
 void qemu_savevm_send_postcopy_resume(QEMUFile *f);
 void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name);
+void qemu_savevm_send_downtime(QEMUFile *f, int64_t switchover_abort_ms,
+                               int64_t source_downtime_ms);
 
 void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
                                            uint16_t len,
diff --git a/migration/trace-events b/migration/trace-events
index 0b7c3324fb..977988848a 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -54,6 +54,9 @@  postcopy_pause_incoming(void) ""
 postcopy_pause_incoming_continued(void) ""
 postcopy_page_req_sync(void *host_addr) "sync page req %p"
 
+qemu_savevm_send_downtime(uint64_t src_abort_limit_ms, uint64_t source_downtime) "switchover abort limit=%"PRIi64", source downtime=%"PRIi64
+loadvm_handle_src_downtime(uint64_t src_abort_limit_ms, uint64_t source_downtime) "switchover abort limit=%"PRIi64", source downtime=%"PRIi64
+qemu_loadvm_downtime_abort(uint64_t abort_limit_ms, uint64_t dst_downtime, uint64_t source_downtime) "switchover abort limit=%"PRIi64", destination downtime=%"PRIi64", source downtime=%"PRIi64
 # vmstate.c
 vmstate_load_field_error(const char *field, int ret) "field \"%s\" load failed, ret = %d"
 vmstate_load_state(const char *name, int version_id) "%s v%d"