| Message ID | 1264087444-14193-5-git-send-email-lirans@il.ibm.com |
|---|---|
| State | New |
On 21 janv. 2010, at 16:24, Liran Schour wrote:

> Move to stage3 only when remaining work can be done below max downtime.
>
> Changes from v1: remove max iterations. Try to infer storage performance
> and by that calculate remaining work.
>
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
>  block-migration.c |  136 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 132 insertions(+), 4 deletions(-)
...

I haven't read the patch in detail but I think we should be able to
avoid duplicating code from vl.c by using qemu_get_clock.
Also, is floating point really necessary?
Pierre Riteau <Pierre.Riteau@irisa.fr> wrote on 21/01/2010 20:03:32:

> On 21 janv. 2010, at 16:24, Liran Schour wrote:
>
> > Move to stage3 only when remaining work can be done below max downtime.
> >
> > Changes from v1: remove max iterations. Try to infer storage
> > performance and by that calculate remaining work.
...
> I haven't read the patch in detail but I think we should be able to
> avoid duplicating code from vl.c by using qemu_get_clock.
> Also, is floating point really necessary?

I thought that qemu_get_clock would return a value at 1000 Hz (and that is
too low a resolution). But now I see that I can use qemu_get_clock(host_clock)
and get nanosecond resolution. I will switch to qemu_get_clock(host_clock)
to avoid duplicating code. And I think we can avoid floating point here.

Thanks for the review,
- Liran
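For what it's worth, dropping the long double arithmetic is mostly a matter of rearranging the comparison so everything stays in integer bytes and nanoseconds. The standalone sketch below illustrates that idea; the function and variable names are illustrative only, the 1 MiB chunk size and the sample numbers are assumptions, and the 128-bit intermediate is a GCC/Clang extension:

```c
#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE (1 << 20)  /* assumed dirty-chunk size, for illustration */

/* Integer-only form of "remaining_dirty / bandwidth <= max_downtime".
 * With bandwidth = (reads * CHUNK_SIZE) / total_time_ns, the test
 * rearranges to:
 *   remaining_bytes * total_time_ns <= max_downtime_ns * reads * CHUNK_SIZE
 * so no division or floating point is needed; 128-bit intermediates keep
 * the products from overflowing. */
static int can_finish_below_downtime(uint64_t remaining_bytes,
                                     uint64_t reads,
                                     uint64_t total_time_ns,
                                     uint64_t max_downtime_ns)
{
    unsigned __int128 lhs = (unsigned __int128)remaining_bytes * total_time_ns;
    unsigned __int128 rhs = (unsigned __int128)max_downtime_ns * reads * CHUNK_SIZE;
    return lhs <= rhs;
}

int main(void)
{
    /* Hypothetical measurements: 512 chunks read in 2.5 s (~205 MiB/s),
     * 64 MiB still dirty, 30 ms of allowed downtime -> not yet convergent. */
    printf("can converge: %d\n",
           can_finish_below_downtime(64ULL << 20, 512,
                                     2500000000ULL, 30000000ULL));
    return 0;
}
```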
On 25 janv. 2010, at 09:57, Liran Schour wrote:

> Pierre Riteau <Pierre.Riteau@irisa.fr> wrote on 21/01/2010 20:03:32:
>
>> On 21 janv. 2010, at 16:24, Liran Schour wrote:
>>
>>> Move to stage3 only when remaining work can be done below max downtime.
>>>
>>> Changes from v1: remove max iterations. Try to infer storage
>>> performance and by that calculate remaining work.
> ...
>> I haven't read the patch in detail but I think we should be able to
>> avoid duplicating code from vl.c by using qemu_get_clock.
>> Also, is floating point really necessary?
>
> I thought that qemu_get_clock would return a value at 1000 Hz (and that is
> too low a resolution). But now I see that I can use qemu_get_clock(host_clock)
> and get nanosecond resolution. I will switch to qemu_get_clock(host_clock)
> to avoid duplicating code. And I think we can avoid floating point here.
>
> Thanks for the review,
> - Liran

You probably don't want to use qemu_get_clock(host_clock): it calls
get_clock_realtime(), which uses gettimeofday(). If the clock is modified
by NTP, you could get wrong values.

Instead, you could simply introduce code to get the value you want in
nanoseconds. Paolo Bonzini has a patch for this in his tree:
http://github.com/bonzini/qemu/commit/cbff458ad6a021582bfddb0f11c4628bbb2cd1e5
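To make the concern concrete: a wall-clock source can be stepped by NTP or a date change while a read is in flight, whereas CLOCK_MONOTONIC only moves forward. The standalone sketch below (plain POSIX, not QEMU's helpers) times the same operation both ways; only the monotonic delta is guaranteed to stay meaningful if the system time jumps in between:

```c
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>

/* Wall clock in ns: follows settimeofday()/NTP steps, so deltas can be wrong. */
static long long wall_ns(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000000000LL + tv.tv_usec * 1000LL;
}

/* Monotonic clock in ns: unaffected by date changes, safe for durations. */
static long long mono_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
    long long w0 = wall_ns(), m0 = mono_ns();
    usleep(100 * 1000);  /* stand-in for an in-flight AIO read */
    long long w1 = wall_ns(), m1 = mono_ns();

    printf("wall delta: %lld ns, monotonic delta: %lld ns\n",
           w1 - w0, m1 - m0);
    return 0;
}
```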
diff --git a/block-migration.c b/block-migration.c
index 16df75f..5ef3eb8 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -17,6 +17,7 @@
 #include "qemu-queue.h"
 #include "monitor.h"
 #include "block-migration.h"
+#include "migration.h"
 #include <assert.h>
 
 #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
@@ -60,6 +61,7 @@ typedef struct BlkMigBlock {
     QEMUIOVector qiov;
     BlockDriverAIOCB *aiocb;
     int ret;
+    long double time;
     QSIMPLEQ_ENTRY(BlkMigBlock) entry;
 } BlkMigBlock;
 
@@ -74,11 +76,79 @@ typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
-    int dirty_iterations;
+    long double total_time;
+    int reads;
 } BlkMigState;
 
 static BlkMigState block_mig_state;
 
+static int64_t get_clock_realtime(void)
+{
+    struct timeval tv;
+
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000);
+}
+
+#ifdef WIN32
+
+static int64_t clock_freq;
+
+static void init_get_clock(void)
+{
+    LARGE_INTEGER freq;
+    int ret;
+    ret = QueryPerformanceFrequency(&freq);
+    if (ret == 0) {
+        fprintf(stderr, "Could not calibrate ticks\n");
+        exit(1);
+    }
+    clock_freq = freq.QuadPart;
+}
+
+static int64_t get_clock(void)
+{
+    LARGE_INTEGER ti;
+    QueryPerformanceCounter(&ti);
+    return muldiv64(ti.QuadPart, get_ticks_per_sec(), clock_freq);
+}
+
+#else
+
+static int use_rt_clock;
+
+static void init_get_clock(void)
+{
+    use_rt_clock = 0;
+#if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD_version >= 500000) \
+    || defined(__DragonFly__) || defined(__FreeBSD_kernel__)
+    {
+        struct timespec ts;
+        if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
+            use_rt_clock = 1;
+        }
+    }
+#endif
+}
+
+static int64_t get_clock(void)
+{
+#if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD_version >= 500000) \
+    || defined(__DragonFly__) || defined(__FreeBSD_kernel__)
+    if (use_rt_clock) {
+        struct timespec ts;
+        clock_gettime(CLOCK_MONOTONIC, &ts);
+        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
+    } else
+#endif
+    {
+        /* XXX: using gettimeofday leads to problems if the date
+           changes, so it should be avoided. */
+        return get_clock_realtime();
+    }
+}
+#endif
+
 static void blk_send(QEMUFile *f, BlkMigBlock * blk)
 {
     int len;
@@ -127,12 +197,28 @@ uint64_t blk_mig_bytes_total(void)
     return sum << BDRV_SECTOR_BITS;
 }
 
+static inline void add_avg_read_time(long double time)
+{
+    block_mig_state.reads++;
+    block_mig_state.total_time += time;
+}
+
+static inline long double compute_read_bwidth(void)
+{
+    assert(block_mig_state.total_time != 0);
+    return (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
+}
+
 static void blk_mig_read_cb(void *opaque, int ret)
 {
     BlkMigBlock *blk = opaque;
 
     blk->ret = ret;
 
+    blk->time = get_clock() - blk->time;
+
+    add_avg_read_time(blk->time);
+
     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
 
     block_mig_state.submitted--;
@@ -182,6 +268,8 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
     blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
     qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 
+    blk->time = get_clock();
+
     blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                 nr_sectors, blk_mig_read_cb, blk);
     if (!blk->aiocb) {
@@ -223,6 +311,8 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
+    block_mig_state.total_time = 0;
+    block_mig_state.reads = 0;
 
     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
         if (bs->type == BDRV_TYPE_HD) {
@@ -321,6 +411,8 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
                 blk->iov.iov_base = blk->buf;
                 blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                 qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+                blk->time = get_clock();
 
                 blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                             nr_sectors, blk_mig_read_cb, blk);
@@ -403,10 +495,42 @@ static void flush_blks(QEMUFile* f)
            block_mig_state.transferred);
 }
 
+static int64_t get_remaining_dirty(void)
+{
+    BlkMigDevState *bmds;
+    int64_t dirty = 0;
+
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        dirty += bdrv_get_dirty_count(bmds->bs);
+    }
+
+    return dirty * BLOCK_SIZE;
+}
+
 static int is_stage2_completed(void)
 {
-    return (block_mig_state.submitted == 0 &&
-            block_mig_state.bulk_completed);
+    int64_t remaining_dirty;
+    long double bwidth;
+
+    if (block_mig_state.bulk_completed == 1) {
+
+        remaining_dirty = get_remaining_dirty();
+        if(remaining_dirty == 0) {
+            return 1;
+        }
+
+        bwidth = compute_read_bwidth();
+
+        if ((remaining_dirty / bwidth) <=
+            migrate_max_downtime()) {
+            /* finish stage2 because we think that we can finish remaing work
+               below max_downtime */
+
+            return 1;
+        }
+    }
+
+    return 0;
 }
 
 static void blk_mig_cleanup(Monitor *mon)
@@ -490,7 +614,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     }
 
     if (stage == 3) {
-        /* we now for sure that save bulk is completed */
+        /* we know for sure that save bulk is completed and
+           all async read completed */
+        assert(block_mig_state.submitted == 0);
 
         while(blk_mig_save_dirty_block(mon, f, 0) != 0);
         blk_mig_cleanup(mon);
@@ -580,4 +706,6 @@ void blk_mig_init(void)
 
     register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                          NULL, block_load, &block_mig_state);
+
+    init_get_clock();
 }
Move to stage3 only when remaining work can be done below max downtime.

Changes from v1: remove max iterations. Try to infer storage performance
and by that calculate remaining work.

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block-migration.c |  136 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 132 insertions(+), 4 deletions(-)
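Boiled down, the estimator the patch introduces amounts to the bookkeeping below: time each chunk read, accumulate the count and the total latency, and derive a bandwidth from which the remaining transfer time is predicted. This is a simplified standalone sketch with made-up numbers, not the patch code itself (which keeps the state in block_mig_state and uses long double):

```c
#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE (1 << 20)     /* assumed 1 MiB per dirty chunk */

static uint64_t reads;           /* completed chunk reads */
static uint64_t total_time_ns;   /* summed read latencies */

/* Called from the read-completion path with the measured latency,
 * mirroring what blk_mig_read_cb()/add_avg_read_time() do. */
static void add_read_sample(uint64_t latency_ns)
{
    reads++;
    total_time_ns += latency_ns;
}

/* Bytes per nanosecond, mirroring compute_read_bwidth(). */
static double read_bandwidth(void)
{
    return (double)(reads * CHUNK_SIZE) / (double)total_time_ns;
}

int main(void)
{
    /* Pretend three 1 MiB reads completed in roughly 5 ms each. */
    add_read_sample(5000000);
    add_read_sample(4800000);
    add_read_sample(5200000);

    uint64_t remaining = 16ULL << 20;  /* 16 MiB still dirty */
    double eta_ms = remaining / read_bandwidth() / 1e6;

    /* Stage 3 is entered once this estimate drops below max_downtime. */
    printf("estimated time to flush remaining dirty blocks: %.1f ms\n", eta_ms);
    return 0;
}
```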