
Improve accuracy of block migration bandwidth calculation

Message ID OF532A7D77.B17230EE-ONC2257864.004CFAD7-C2257864.004E2CA3@il.ibm.com
State New

Commit Message

Avishay Traeger, March 31, 2011, 2:13 p.m. UTC
The current bandwidth calculation looks like this:
(block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time

"total_time" is currently the sum of the read request latencies.  This is
not very accurate because block migration uses aio and so several requests
can be submitted at once.  Bandwidth should be computed with wall-clock
time, not by adding the latencies.  In this case, "total_time" has a higher
value than it should, and so the computed bandwidth is lower than it is in
reality.  This means that migration can take longer than it needs to.
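
As a hypothetical illustration: if four BLOCK_SIZE reads are submitted
together and each completes in 100 ms, the latency sum is 400 ms even
though the reads actually finished after roughly 100 ms of wall-clock
time, so the computed bandwidth is about a quarter of the real value.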

However, we don't want to use pure wall-clock time here.  We are computing
bandwidth in the asynchronous phase, where the migration repeatedly wakes
up and sends some aio requests.  The computed bandwidth will be used for
synchronous transfer.

So my solution is to use the total wall-clock time when I/Os are actually
in flight.
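
In sketch form, the accounting added by this patch looks roughly like
this (it simply restates the hunks below, using the fields the patch
adds to BlkMigState):

    /* When submitting an aio read (mig_save_device_bulk/_dirty):
     * the first request of a burst starts a timed interval. */
    if (block_mig_state.submitted == 0) {
        block_mig_state.interval_start_time = qemu_get_clock_ns(rt_clock);
    }

    /* In the read callback (blk_mig_read_cb): the last outstanding
     * request closes the interval and adds it to total_time. */
    if (block_mig_state.submitted == 1) {
        block_mig_state.total_time +=
            qemu_get_clock_ns(rt_clock) - block_mig_state.interval_start_time;
    }
    block_mig_state.reads++;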

I hope my explanation was clear :)

Signed-off-by: Avishay Traeger <avishay@il.ibm.com>
---
 block-migration.c |   24 +++++++++++++-----------
 1 files changed, 13 insertions(+), 11 deletions(-)

     bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
@@ -250,7 +248,9 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
     blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
     qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

-    blk->time = qemu_get_clock_ns(rt_clock);
+    /* If there are no outstanding requests, start an interval */
+    if (block_mig_state.submitted == 0)
+	block_mig_state.interval_start_time = qemu_get_clock_ns(rt_clock);

     blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                 nr_sectors, blk_mig_read_cb, blk);
@@ -409,7 +409,9 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
                 blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                 qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

-                blk->time = qemu_get_clock_ns(rt_clock);
+		/* If there are no outstanding requests, start an interval */
+		if (block_mig_state.submitted == 0)
+		    block_mig_state.interval_start_time = qemu_get_clock_ns(rt_clock);

                 blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                             nr_sectors, blk_mig_read_cb, blk);
--
1.7.0.4

Comments

Michael Roth March 31, 2011, 8:57 p.m. UTC | #1
On 03/31/2011 09:13 AM, Avishay Traeger wrote:
>
> The current bandwidth calculation looks like this:
> (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time
>
> "total_time" is currently the sum of the read request latencies.  This is
> not very accurate because block migration uses aio and so several requests
> can be submitted at once.  Bandwidth should be computed with wall-clock
> time, not by adding the latencies.  In this case, "total_time" has a higher
> value than it should, and so the computed bandwidth is lower than it is in
> reality.  This means that migration can take longer than it needs to.
>
> However, we don't want to use pure wall-clock time here.  We are computing
> bandwidth in the asynchronous phase, where the migration repeatedly wakes
> up and sends some aio requests.  The computed bandwidth will be used for
> synchronous transfer.
>
> So my solution is to use the total wall-clock time when I/Os are actually
> in flight.
>
> I hope my explanation was clear :)
>
> Signed-off-by: Avishay Traeger<avishay@il.ibm.com>
> ---
>   block-migration.c |   24 +++++++++++++-----------
>   1 files changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/block-migration.c b/block-migration.c
> index 8218bac..833d25a 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -62,7 +62,6 @@ typedef struct BlkMigBlock {
>       QEMUIOVector qiov;
>       BlockDriverAIOCB *aiocb;
>       int ret;
> -    int64_t time;
>       QSIMPLEQ_ENTRY(BlkMigBlock) entry;
>   } BlkMigBlock;
>
> @@ -77,6 +76,7 @@ typedef struct BlkMigState {
>       int64_t total_sector_sum;
>       int prev_progress;
>       int bulk_completed;
> +    int64_t interval_start_time;
>       long double total_time;
>       int reads;
>   } BlkMigState;
> @@ -131,12 +131,6 @@ uint64_t blk_mig_bytes_total(void)
>       return sum<<  BDRV_SECTOR_BITS;
>   }
>
> -static inline void add_avg_read_time(int64_t time)
> -{
> -    block_mig_state.reads++;
> -    block_mig_state.total_time += time;
> -}
> -
>   static inline long double compute_read_bwidth(void)
>   {
>       assert(block_mig_state.total_time != 0);
> @@ -195,9 +189,13 @@ static void blk_mig_read_cb(void *opaque, int ret)
>
>       blk->ret = ret;
>
> -    blk->time = qemu_get_clock_ns(rt_clock) - blk->time;
> +    /* If this is the last outstanding callback, we end the interval */
> +    if (block_mig_state.submitted == 1) {
> +	block_mig_state.total_time +=
> +	    (qemu_get_clock_ns(rt_clock) - block_mig_state.interval_start_time);
> +    }

The only thing I think the bwidth calculation is used for here is to 
predict whether we can move on to the next stage of block migration and 
reasonably rely on any outstanding AIO to complete before the migration 
deadline.

Currently we get an update on that value with every cb... with this 
patch, in the worst case, we get one update after the last dirty block 
AIO completes, since I think there are pathological cases where 
block_mig_state.submitted can climb above 1 and never drop back to 0 
until we're done with block migration.

So there are 2 potential issues:

1) We're more likely to hit this corner case where 
block_mig_state.bulk_completed == 1 and we haven't yet completed a 
bwidth calculation interval (so block_mig_state.total_time is still 0):

static inline long double compute_read_bwidth(void)
{
     assert(block_mig_state.total_time != 0);
     return  (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
}

2) If we change that to return a conservative value instead of aborting 
(probably a good idea), we run the risk of actually slowing down 
migration by spending too much time waiting for a bwidth value that says 
we can move on and let outstanding AIO complete in the background. If we 
fix it to return an aggressive value, we stand a higher risk of 
overshooting the migration deadline. (A rough sketch of the conservative 
option follows below.)
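
A minimal sketch of that "conservative value" option (not part of this
patch; DEFAULT_BWIDTH is a hypothetical placeholder constant):

static inline long double compute_read_bwidth(void)
{
    /* No measured interval yet: return a conservative guess instead
     * of asserting. */
    if (block_mig_state.total_time == 0) {
        return DEFAULT_BWIDTH;
    }
    return (block_mig_state.reads * BLOCK_SIZE) / block_mig_state.total_time;
}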

I think a better solution would be to keep incrementing by the delta 
since the previous cb was made. This would yield a more consistent 
bwidth value throughout:

mig_save_device_dirty/mig_save_device_bulk:

if (bms.submitted == 0):
     bms.previous_time_offset = current_time()

blk_mig_read_cb:

bms.total_time += current_time() - bms.previous_time_offset
bms.previous_time_offset = current_time()

Any idle periods where there's no AIO (bms.submitted == 0) would still 
get ignored.
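
In C that would look roughly like the following (previous_time_offset is
a hypothetical field mirroring the pseudocode above, not something the
current patch adds):

/* mig_save_device_bulk / mig_save_device_dirty, before submitting: */
if (block_mig_state.submitted == 0) {
    block_mig_state.previous_time_offset = qemu_get_clock_ns(rt_clock);
}

/* blk_mig_read_cb: account the time since the previous callback (or
 * since the burst started) and move the offset forward. */
int64_t now = qemu_get_clock_ns(rt_clock);
block_mig_state.total_time += now - block_mig_state.previous_time_offset;
block_mig_state.previous_time_offset = now;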

>
> -    add_avg_read_time(blk->time);
> +    block_mig_state.reads++;
>
>       QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
>       bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
> @@ -250,7 +248,9 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
>       blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
>       qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
>
> -    blk->time = qemu_get_clock_ns(rt_clock);
> +    /* If there are no outstanding requests, start an interval */
> +    if (block_mig_state.submitted == 0)
> +	block_mig_state.interval_start_time = qemu_get_clock_ns(rt_clock);
>
>       blk->aiocb = bdrv_aio_readv(bs, cur_sector,&blk->qiov,
>                                   nr_sectors, blk_mig_read_cb, blk);
> @@ -409,7 +409,9 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
>                   blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
>                   qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
>
> -                blk->time = qemu_get_clock_ns(rt_clock);
> +		/* If there are no outstanding requests, start an interval */
> +		if (block_mig_state.submitted == 0)
> +		    block_mig_state.interval_start_time = qemu_get_clock_ns(rt_clock);
>
>                   blk->aiocb = bdrv_aio_readv(bmds->bs, sector,&blk->qiov,
>                                               nr_sectors, blk_mig_read_cb, blk);
> --
> 1.7.0.4
>
>

Patch

diff --git a/block-migration.c b/block-migration.c
index 8218bac..833d25a 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -62,7 +62,6 @@  typedef struct BlkMigBlock {
     QEMUIOVector qiov;
     BlockDriverAIOCB *aiocb;
     int ret;
-    int64_t time;
     QSIMPLEQ_ENTRY(BlkMigBlock) entry;
 } BlkMigBlock;

@@ -77,6 +76,7 @@  typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
+    int64_t interval_start_time;
     long double total_time;
     int reads;
 } BlkMigState;
@@ -131,12 +131,6 @@  uint64_t blk_mig_bytes_total(void)
     return sum << BDRV_SECTOR_BITS;
 }

-static inline void add_avg_read_time(int64_t time)
-{
-    block_mig_state.reads++;
-    block_mig_state.total_time += time;
-}
-
 static inline long double compute_read_bwidth(void)
 {
     assert(block_mig_state.total_time != 0);
@@ -195,9 +189,13 @@  static void blk_mig_read_cb(void *opaque, int ret)

     blk->ret = ret;

-    blk->time = qemu_get_clock_ns(rt_clock) - blk->time;
+    /* If this is the last outstanding callback, we end the interval */
+    if (block_mig_state.submitted == 1) {
+	block_mig_state.total_time +=
+	    (qemu_get_clock_ns(rt_clock) - block_mig_state.interval_start_time);
+    }

-    add_avg_read_time(blk->time);
+    block_mig_state.reads++;

     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);