===================================================================
@@ -0,0 +1,892 @@
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "blockdev.h"
+#include "qemu-queue.h"
+#include "qemu-timer.h"
+#include "monitor.h"
+#include "block-copy.h"
+#include "migration.h"
+#include "sysemu.h"
+#include "qjson.h"
+#include <assert.h>
+#include "hw/hw.h"
+
+#define DIRTY_CHUNK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
+#define MAX_IS_ALLOCATED_SEARCH 65536
+
+/*
+ * Stages:
+ *
+ * STAGE_BULK: bulk reads/writes in progress
+ * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
+ * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
+ * STAGE_MIRROR_WRITES: copy finished, writes mirrored to both images.
+ * STAGE_SWITCH_FINISHED: switched to new image.
+ *
+ * The enumerators are declared in the order a copy progresses through them.
+ * Code in this file compares stages with < and >, so the declaration order
+ * must be kept when adding or moving a stage.
+ */
+
+enum BdrvCopyStage {
+ STAGE_BULK,
+ STAGE_BULK_FINISHED,
+ STAGE_DIRTY,
+ STAGE_MIRROR_WRITES,
+ STAGE_SWITCH_FINISHED,
+};
+
+#define NAME_LEN 1024
+#define DEV_LEN 256
+
+/* State of one live block copy instance (one per copied device). */
+typedef struct BdrvCopyState {
+ /* device being copied, looked up by name in bdrv_copy_setup() */
+ BlockDriverState *src;
+ /* destination image, opened from the user supplied filename */
+ BlockDriverState *dst;
+ /* "incremental" copy: the bulk stage skips unallocated source sectors */
+ bool shared_base;
+
+ /* next sector the bulk/dirty scan looks at */
+ int64_t curr_sector;
+ /* progress value reported as 'transferred' by do_info_blockcopy() */
+ int64_t completed_sectors;
+ /* length of the source image in sectors */
+ int64_t nr_sectors;
+
+ enum BdrvCopyStage stage;
+ /* reads submitted and not yet completed */
+ int inflight_reads;
+ /* non-zero once an I/O error was recorded */
+ int error;
+ /* set when the copy was torn down after an error */
+ int failed;
+ /* set by do_bdrv_copy_cancel() */
+ int cancelled;
+ /* instance created with -m, idle until block_copy_load() runs */
+ bool incoming;
+ /* set while the VM is not running (see block_copy_vmchange()) */
+ bool stopped;
+ /* outstanding read and write requests */
+ QLIST_HEAD(, BdrvCopyBlock) io_list;
+ /* dirty stage: one bit per dirty chunk that has copy I/O in flight */
+ unsigned long *aio_bitmap;
+ /* timer that runs aio_timer(), the copy state machine */
+ QEMUTimer *aio_timer;
+ /* link in block_copy_list */
+ QLIST_ENTRY(BdrvCopyState) list;
+
+ VMChangeStateEntry *change_entry;
+
+ /* number of completed writes and their accumulated transfer time (ns) */
+ int64_t blocks;
+ int64_t total_time;
+
+ char device_name[DEV_LEN];
+ char src_filename[NAME_LEN];
+ char dst_filename[NAME_LEN];
+} BdrvCopyState;
+
+/*
+ * One outstanding copy request. A read request and the write request that
+ * follows it are separate BdrvCopyBlock objects sharing the same data buffer
+ * (see blk_copy_issue_write()).
+ */
+typedef struct BdrvCopyBlock {
+ /* owning copy instance */
+ BdrvCopyState *state;
+ /* data buffer, allocated in blk_copy_issue_read() */
+ uint8_t *buf;
+ /* first sector and length of the request */
+ int64_t sector;
+ int64_t nr_sectors;
+ /* single element I/O vector wrapping buf */
+ struct iovec iov;
+ QEMUIOVector qiov;
+ BlockDriverAIOCB *aiocb;
+ /* rt_clock timestamp (ns) taken when the read was submitted */
+ int64_t time;
+ /* link in BdrvCopyState.io_list */
+ QLIST_ENTRY(BdrvCopyBlock) list;
+} BdrvCopyBlock;
+
+/* All block copy instances, including failed and completed ones. */
+static QLIST_HEAD(, BdrvCopyState) block_copy_list =
+ QLIST_HEAD_INITIALIZER(block_copy_list);
+
+/*
+ * Allocate the zero-filled in-flight bitmap of @s: one bit per dirty chunk
+ * of the source image.
+ *
+ * aio_inflight() and set_aio_inflight() access the bitmap in units of
+ * unsigned long, so the allocation is rounded up to a whole number of
+ * unsigned longs. Sizing it in bytes would let the access to the last word
+ * run past the end of the buffer.
+ */
+static void alloc_aio_bitmap(BdrvCopyState *s)
+{
+    const int64_t bits_per_word = sizeof(unsigned long) * 8;
+    int64_t sectors = bdrv_getlength(s->src) >> BDRV_SECTOR_BITS;
+    int64_t chunks, words;
+
+    chunks = (sectors + BDRV_SECTORS_PER_DIRTY_CHUNK - 1) /
+             BDRV_SECTORS_PER_DIRTY_CHUNK;
+    words = (chunks + bits_per_word - 1) / bits_per_word;
+    if (words < 1) {
+        /* always hand out at least one word, even for an empty image */
+        words = 1;
+    }
+
+    s->aio_bitmap = qemu_mallocz(words * sizeof(unsigned long));
+}
+
+/*
+ * Tell whether the dirty chunk containing @sector has copy I/O in flight.
+ *
+ * Returns 0 when no in-flight bitmap has been allocated yet or when @sector
+ * lies at or beyond the end of the source image.
+ */
+static bool aio_inflight(BdrvCopyState *s, int64_t sector)
+{
+    const size_t bits_per_word = sizeof(unsigned long) * 8;
+    int64_t chunk;
+    unsigned long word;
+
+    if (!s->aio_bitmap) {
+        return 0;
+    }
+    if ((sector << BDRV_SECTOR_BITS) >= bdrv_getlength(s->src)) {
+        return 0;
+    }
+
+    chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
+    word = s->aio_bitmap[chunk / bits_per_word];
+
+    return !!(word & (1UL << (chunk % bits_per_word)));
+}
+
+/*
+ * Set (@set != 0) or clear (@set == 0) the in-flight bit of every dirty
+ * chunk touched by the sector range [@sector_num, @sector_num + @nb_sectors).
+ */
+static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
+                             int nb_sectors, int set)
+{
+    const size_t bits_per_word = sizeof(unsigned long) * 8;
+    int64_t chunk = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+    int64_t last = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    while (chunk <= last) {
+        unsigned long *word = &s->aio_bitmap[chunk / bits_per_word];
+        unsigned long mask = 1UL << (chunk % bits_per_word);
+
+        if (set) {
+            *word |= mask;
+        } else {
+            *word &= ~mask;
+        }
+        chunk++;
+    }
+}
+
+/*
+ * Move @s to @stage and raise the blkdebug event that corresponds to the
+ * new stage on the destination's underlying file.
+ */
+static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
+{
+    s->stage = stage;
+
+    if (stage == STAGE_BULK) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
+    } else if (stage == STAGE_BULK_FINISHED) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
+    } else if (stage == STAGE_DIRTY) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
+    } else if (stage == STAGE_MIRROR_WRITES) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_MIRROR_WRITES);
+    } else if (stage == STAGE_SWITCH_FINISHED) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
+    }
+    /* any other value: no event */
+}
+
+/*
+ * Record the failure code @ret of an I/O callback in @s and make the state
+ * machine timer fire right away so that it can act on the error.
+ */
+static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
+{
+    int64_t now;
+
+    s->error = ret;
+
+    now = qemu_get_clock_ms(rt_clock);
+    qemu_mod_timer(s->aio_timer, now);
+}
+
+/*
+ * Account one completed block that took @time to transfer. The totals are
+ * used by blkcopy_can_switch() to compute the average transfer time.
+ */
+static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
+{
+    s->total_time += time;
+    s->blocks += 1;
+}
+
+/*
+ * Completion callback of a write to the destination image.
+ *
+ * @opaque is the BdrvCopyBlock of the write, @ret the I/O result. The
+ * request is unlinked and freed in every case; on failure the error is
+ * handed to blk_copy_handle_cb_error().
+ */
+static void blk_copy_write_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *req = opaque;
+    BdrvCopyState *s = req->state;
+
+    QLIST_REMOVE(req, list);
+
+    if (ret >= 0) {
+        add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - req->time);
+
+        /* schedule switch to STAGE_DIRTY on last bulk write completion */
+        if (s->stage == STAGE_BULK_FINISHED) {
+            qemu_mod_timer(s->aio_timer, qemu_get_clock_ms(rt_clock));
+        }
+
+        /* past the bulk stages the chunk was marked in flight: release it */
+        if (s->stage > STAGE_BULK_FINISHED) {
+            set_aio_inflight(s, req->sector, req->nr_sectors, 0);
+        }
+    }
+
+    qemu_vfree(req->buf);
+    qemu_free(req);
+
+    if (ret < 0) {
+        blk_copy_handle_cb_error(s, ret);
+    }
+}
+
+/*
+ * Submit the write of the data just read by @read_blk to the destination.
+ *
+ * A new request takes over @read_blk's buffer; @read_blk itself is left for
+ * the caller to unlink and free. If the write cannot be submitted, s->error
+ * is set to -ENOMEM and the buffer is freed here.
+ */
+static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
+{
+    BdrvCopyBlock *wr = qemu_mallocz(sizeof(*wr));
+
+    wr->state = s;
+    wr->sector = read_blk->sector;
+    wr->nr_sectors = read_blk->nr_sectors;
+    wr->time = read_blk->time;
+    wr->buf = read_blk->buf;
+    QLIST_INSERT_HEAD(&s->io_list, wr, list);
+
+    wr->iov.iov_base = read_blk->buf;
+    wr->iov.iov_len = read_blk->iov.iov_len;
+    qemu_iovec_init_external(&wr->qiov, &wr->iov, 1);
+
+    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
+    wr->aiocb = bdrv_aio_writev(s->dst, wr->sector, &wr->qiov,
+                                wr->iov.iov_len / BDRV_SECTOR_SIZE,
+                                blk_copy_write_cb, wr);
+    if (wr->aiocb) {
+        return;
+    }
+
+    /* submission failed: undo the bookkeeping done above */
+    s->error = -ENOMEM;
+    QLIST_REMOVE(wr, list);
+    qemu_vfree(read_blk->buf);
+    qemu_free(wr);
+}
+
+/*
+ * Completion callback of a read from the source image.
+ *
+ * On success the data is passed on to blk_copy_issue_write() and the state
+ * machine timer is kicked; on failure the buffer is freed and the error
+ * recorded. The read request itself is unlinked and freed in both cases.
+ */
+static void blk_copy_read_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *rd = opaque;
+    BdrvCopyState *s = rd->state;
+
+    s->inflight_reads--;
+
+    if (ret < 0) {
+        QLIST_REMOVE(rd, list);
+        qemu_vfree(rd->buf);
+        qemu_free(rd);
+        blk_copy_handle_cb_error(s, ret);
+    } else {
+        /* the write request takes over rd->buf */
+        blk_copy_issue_write(s, rd);
+        QLIST_REMOVE(rd, list);
+        qemu_free(rd);
+        qemu_mod_timer(s->aio_timer, qemu_get_clock_ms(rt_clock));
+    }
+}
+
+/*
+ * Submit an asynchronous read of @nr_sectors sectors starting at @sector
+ * from the source image. blk_copy_read_cb() runs on completion.
+ *
+ * The buffer always has the size of one dirty chunk. If the read cannot be
+ * submitted, s->error is set to -ENOMEM and all bookkeeping is undone.
+ */
+static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
+                                int nr_sectors)
+{
+    BdrvCopyBlock *rd = qemu_mallocz(sizeof(*rd));
+
+    rd->buf = qemu_blockalign(s->src, DIRTY_CHUNK_SIZE);
+    rd->state = s;
+    rd->sector = sector;
+    rd->nr_sectors = nr_sectors;
+    QLIST_INSERT_HEAD(&s->io_list, rd, list);
+
+    rd->iov.iov_base = rd->buf;
+    rd->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&rd->qiov, &rd->iov, 1);
+
+    s->inflight_reads++;
+    rd->time = qemu_get_clock_ns(rt_clock);
+    rd->aiocb = bdrv_aio_readv(s->src, sector, &rd->qiov, nr_sectors,
+                               blk_copy_read_cb, rd);
+    if (rd->aiocb) {
+        return;
+    }
+
+    /* submission failed */
+    s->error = -ENOMEM;
+    s->inflight_reads--;
+    QLIST_REMOVE(rd, list);
+    qemu_vfree(rd->buf);
+    qemu_free(rd);
+}
+
+/*
+ * Decide whether the remaining dirty data is small enough to finish the
+ * copy synchronously.
+ *
+ * Returns true when nothing is dirty, when no block has been timed yet, or
+ * when the dirty count multiplied by the average per-block transfer time
+ * does not exceed migrate_max_downtime().
+ */
+static bool blkcopy_can_switch(BdrvCopyState *s)
+{
+    int64_t dirty = bdrv_get_dirty_count(s->src);
+    int64_t avg_time;
+
+    if (dirty == 0 || s->blocks == 0) {
+        return true;
+    }
+
+    avg_time = s->total_time / s->blocks;
+
+    return (dirty * avg_time) <= migrate_max_downtime();
+}
+
+/*
+ * Dirty stage: scan forward from s->curr_sector, one dirty chunk at a time,
+ * and submit a read for the first chunk that is dirty and has no copy I/O
+ * in flight. At most one read is submitted per call.
+ *
+ * When the scan runs off the end of the image, s->curr_sector is reset to 0
+ * so that the next call starts over. Always returns 0.
+ */
+static int blk_issue_reads_dirty(BdrvCopyState *s)
+{
+    int64_t sector = s->curr_sector;
+
+    while (sector < s->nr_sectors) {
+        int nr_sectors;
+
+        if (!bdrv_get_dirty(s->src, sector) || aio_inflight(s, sector)) {
+            /* nothing to do for this chunk, move on */
+            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+            s->curr_sector = sector;
+            continue;
+        }
+
+        nr_sectors = MIN(s->nr_sectors - s->curr_sector,
+                         BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+        blk_copy_issue_read(s, sector, nr_sectors);
+        bdrv_reset_dirty(s->src, sector, nr_sectors);
+        set_aio_inflight(s, sector, nr_sectors, 1);
+        break;
+    }
+
+    if (sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+    }
+    return 0;
+}
+
+/*
+ * Bulk stage: submit the read for the next chunk of the source image.
+ *
+ * For an incremental copy (s->shared_base) unallocated sectors are skipped
+ * first; the read then starts at the dirty chunk containing the first
+ * allocated sector. The request is clamped so that it never extends past
+ * the end of the image.
+ *
+ * Returns 1 when the whole image has been submitted (s->curr_sector is then
+ * reset to 0), 0 after a read was issued.
+ */
+static int blk_issue_reads_bulk(BdrvCopyState *s)
+{
+    int nr_sectors;
+    int64_t curr_sector = s->curr_sector;
+
+    if (s->shared_base) {
+        while (curr_sector < s->nr_sectors &&
+               !bdrv_is_allocated(s->src, curr_sector,
+                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
+            if (nr_sectors <= 0) {
+                /* no forward progress reported: copy this chunk instead of
+                 * looping forever */
+                break;
+            }
+            curr_sector += nr_sectors;
+        }
+    }
+
+    if (curr_sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+        return 1;
+    }
+
+    /* round down to the start of the enclosing dirty chunk */
+    curr_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
+    nr_sectors = MIN(s->nr_sectors - curr_sector,
+                     BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+    /* read at the position found above, not at the pre-skip position */
+    blk_copy_issue_read(s, curr_sector, nr_sectors);
+    s->curr_sector = curr_sector + nr_sectors;
+    s->completed_sectors = curr_sector;
+    return 0;
+}
+
+/*
+ * Synchronously copy every chunk that is still dirty from the source to
+ * the destination, clearing the dirty bits on the way. The scan stops as
+ * soon as the dirty count reaches zero.
+ *
+ * On a read or write failure s->error is set to 1 and the copy stops.
+ */
+static void blkcopy_finish(BdrvCopyState *s)
+{
+    uint8_t *bounce = qemu_blockalign(s->src, DIRTY_CHUNK_SIZE);
+    int64_t pos = 0;
+    bool io_failed = false;
+
+    /* FIXME: speed up loop, get_next_dirty_block? */
+    while (pos < s->nr_sectors) {
+        if (bdrv_get_dirty(s->src, pos)) {
+            int count = MIN(s->nr_sectors - pos,
+                            BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            memset(bounce, 0, DIRTY_CHUNK_SIZE);
+            if (bdrv_read(s->src, pos, bounce, count) < 0 ||
+                bdrv_write(s->dst, pos, bounce, count) < 0) {
+                io_failed = true;
+                break;
+            }
+            bdrv_reset_dirty(s->src, pos, count);
+        }
+
+        if (bdrv_get_dirty_count(s->src) == 0) {
+            break;
+        }
+        pos += BDRV_SECTORS_PER_DIRTY_CHUNK;
+    }
+
+    qemu_vfree(bounce);
+    if (io_failed) {
+        s->error = 1;
+    }
+}
+
+/*
+ * Release everything a block copy instance holds except the BdrvCopyState
+ * itself, which stays on block_copy_list so that its final status can still
+ * be queried (blkcopy_free() disposes of it).
+ *
+ * Must only be called once no copy I/O is outstanding.
+ *
+ * The destination BlockDriverState is private to the copy (created in
+ * bdrv_copy_setup()), so it is deleted on every path: after a successful
+ * switch the device has been reopened on the destination file through
+ * s->src, and after a failure or cancel the destination is no longer
+ * needed. Freed pointers are reset so that stale use is caught early.
+ */
+static void blkcopy_cleanup(BdrvCopyState *s)
+{
+    assert(s->inflight_reads == 0);
+    assert(QLIST_EMPTY(&s->io_list));
+
+    unregister_savevm(NULL, "block_copy", s);
+    qemu_del_vm_change_state_handler(s->change_entry);
+    s->change_entry = NULL;
+
+    bdrv_set_dirty_tracking(s->src, 0);
+    drive_put_ref(drive_get_by_blockdev(s->src));
+    bdrv_set_in_use(s->src, 0);
+
+    bdrv_delete(s->dst);
+    s->dst = NULL;
+
+    /* NULL unless the dirty stage was reached */
+    qemu_free(s->aio_bitmap);
+    s->aio_bitmap = NULL;
+
+    qemu_del_timer(s->aio_timer);
+    qemu_free_timer(s->aio_timer);
+    s->aio_timer = NULL;
+}
+
+/*
+ * Unlink @s from block_copy_list and free it. This does not release the
+ * resources held by the instance; that is done by blkcopy_cleanup().
+ */
+static void blkcopy_free(BdrvCopyState *s)
+{
+ QLIST_REMOVE(s, list);
+ qemu_free(s);
+}
+
+/*
+ * Tear down a copy that hit an error. Nothing is done while requests are
+ * still outstanding; once the request list is empty the instance is marked
+ * failed and cleaned up.
+ */
+static void handle_error(BdrvCopyState *s)
+{
+    if (QLIST_EMPTY(&s->io_list)) {
+        s->failed = 1;
+        blkcopy_cleanup(s);
+    }
+}
+
+/*
+ * Finish the dirty stage and start mirroring guest writes.
+ *
+ * All outstanding I/O is flushed, the remaining dirty chunks are copied
+ * synchronously, and the device is reopened as a "blkmirror:" device that
+ * writes to both the destination and the source image. If the mirror
+ * cannot be opened, the device falls back to the plain source image and
+ * the copy is marked failed; QEMU aborts if even the fallback fails.
+ */
+static void blkcopy_switch(BdrvCopyState *s)
+{
+    char src_filename[NAME_LEN], dst_filename[NAME_LEN];
+    /* room for both names plus the "blkmirror::" decoration */
+    char mirror_name[NAME_LEN * 2 + 16];
+    int open_flags, ret;
+
+    assert(s->stage == STAGE_DIRTY);
+
+    /*
+     * Both images are closed below, so take private, always terminated
+     * copies of everything that is needed to reopen them.
+     */
+    snprintf(src_filename, sizeof(src_filename), "%s", s->src->filename);
+    snprintf(dst_filename, sizeof(dst_filename), "%s", s->dst->filename);
+    open_flags = s->src->open_flags;
+
+    /* flush any guest writes, dirty bitmap uptodate after this.
+     * copy AIO also finished.
+     */
+    qemu_aio_flush();
+    assert(QLIST_EMPTY(&s->io_list));
+    if (s->error) {
+        handle_error(s);
+        return;
+    }
+    blkcopy_finish(s);
+    if (s->error) {
+        handle_error(s);
+        return;
+    }
+    assert(bdrv_get_dirty_count(s->src) == 0);
+    /* turn dirty bitmap off */
+    bdrv_set_dirty_tracking(s->src, 0);
+    /* switch to double writes */
+    bdrv_flush_all();
+    bdrv_close(s->src);
+    bdrv_close(s->dst);
+
+    snprintf(mirror_name, sizeof(mirror_name),
+             "blkmirror:%s:%s", dst_filename, src_filename);
+
+    ret = bdrv_open(s->src, mirror_name, open_flags, NULL);
+    if (ret < 0) {
+        error_report("%s: cannot open blkmirror device, err %d",
+                     mirror_name, ret);
+        s->failed = 1;
+        goto err;
+    }
+
+    blkcopy_set_stage(s, STAGE_MIRROR_WRITES);
+    qemu_del_timer(s->aio_timer);
+
+    return;
+
+err:
+    if (bdrv_open(s->src, src_filename, open_flags, NULL) < 0) {
+        error_report("%s: %s: cannot fallback to source image", __func__,
+                     s->src_filename);
+        abort();
+    }
+    blkcopy_cleanup(s);
+}
+
+#define BLKCOPY_INFLIGHT 2
+/* delay before the state machine polls again when it has nothing to submit */
+#define BLKCOPY_RETRY_MS 10
+
+/*
+ * To simplify the implementation, the IO completion callbacks do not
+ * handle stage control or submit IO for further blocks. A timer is used
+ * for such purpose.
+ *
+ * aio_timer() is that timer's callback and drives the copy:
+ *
+ * - STAGE_BULK: keep up to BLKCOPY_INFLIGHT reads in flight until the whole
+ *   image has been submitted.
+ * - STAGE_BULK_FINISHED: once no request is outstanding, enter STAGE_DIRTY.
+ * - STAGE_DIRTY: keep copying dirty chunks until blkcopy_can_switch()
+ *   allows the switch to mirrored writes.
+ *
+ * The callback must never wait for I/O: completions are not processed
+ * while it runs. So when the dirty stage finds nothing it can submit (every
+ * dirty chunk already has copy I/O in flight) and cannot switch yet, it
+ * re-arms the timer and returns instead of looping. The same polling is
+ * used while a failed copy waits for its outstanding requests, because a
+ * successful write completion in the dirty stage does not re-arm the timer.
+ */
+
+static void aio_timer(void *opaque)
+{
+    BdrvCopyState *s = opaque;
+    int idle_passes = 0;
+
+    assert(s->cancelled == 0);
+    assert(s->stage < STAGE_MIRROR_WRITES);
+
+    if (s->error) {
+        goto error;
+    }
+
+    while (s->stage == STAGE_BULK) {
+        if (s->inflight_reads >= BLKCOPY_INFLIGHT) {
+            break;
+        }
+        if (blk_issue_reads_bulk(s)) {
+            blkcopy_set_stage(s, STAGE_BULK_FINISHED);
+        }
+        if (s->error) {
+            /* a read could not be submitted: stop issuing more */
+            goto error;
+        }
+    }
+
+    if (s->stage == STAGE_BULK_FINISHED) {
+        if (QLIST_EMPTY(&s->io_list)) {
+            blkcopy_set_stage(s, STAGE_DIRTY);
+            alloc_aio_bitmap(s);
+        }
+    }
+
+    while (s->stage == STAGE_DIRTY) {
+        int inflight_before = s->inflight_reads;
+
+        if (s->inflight_reads >= BLKCOPY_INFLIGHT) {
+            break;
+        }
+        blk_issue_reads_dirty(s);
+        if (s->error) {
+            goto error;
+        }
+        if (blkcopy_can_switch(s)) {
+            BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_SWITCH_START);
+            blkcopy_switch(s);
+            return;
+        }
+        if (s->inflight_reads != inflight_before) {
+            idle_passes = 0;
+            continue;
+        }
+        /*
+         * Nothing was submitted. One idle pass may just have wrapped the
+         * scan position back to sector 0; a second one has looked at the
+         * whole image, so wait for completions instead of spinning.
+         */
+        if (++idle_passes >= 2) {
+            qemu_mod_timer(s->aio_timer,
+                           qemu_get_clock_ms(rt_clock) + BLKCOPY_RETRY_MS);
+            break;
+        }
+    }
+    return;
+
+error:
+    handle_error(s);
+    if (!s->failed) {
+        /* requests still outstanding: look again once they had time to
+         * complete (the timer is gone once the copy is marked failed) */
+        qemu_mod_timer(s->aio_timer,
+                       qemu_get_clock_ms(rt_clock) + BLKCOPY_RETRY_MS);
+    }
+}
+
+
+/*
+ * Monitor command block_copy_switch: finish a block copy that is mirroring
+ * writes by reopening the device on the destination image only.
+ *
+ * Returns 0 on success, -1 with a QError set otherwise:
+ * - no usable copy instance for "device": QERR_DEVICE_NOT_FOUND (this
+ *   includes an instance that already failed; its destination may be stale)
+ * - copy not yet in the mirrored stage: QERR_IN_PROGRESS
+ * - destination cannot be opened: QERR_OPEN_FILE_FAILED; the device falls
+ *   back to the source image and the copy is marked failed and cleaned up.
+ *   QEMU aborts if the fallback fails as well.
+ */
+int do_bdrv_copy_switch(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    BdrvCopyState *s = NULL;
+    int open_flags;
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (!strcmp(s->device_name, device)) {
+            break;
+        }
+    }
+
+    if (!s || s->failed) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    if (s->stage != STAGE_MIRROR_WRITES) {
+        qerror_report(QERR_IN_PROGRESS, "block copy");
+        return -1;
+    }
+
+    /* saved before the close so the same flags are used for every reopen */
+    open_flags = s->src->open_flags;
+
+    /* switch from mirrored writes to destination only */
+    bdrv_flush_all();
+    bdrv_close(s->src);
+    if (bdrv_open(s->src, s->dst_filename, open_flags, NULL) < 0) {
+        s->failed = 1;
+        goto err;
+    }
+
+    blkcopy_set_stage(s, STAGE_SWITCH_FINISHED);
+    blkcopy_cleanup(s);
+    return 0;
+
+err:
+    qerror_report(QERR_OPEN_FILE_FAILED, s->dst_filename);
+    if (bdrv_open(s->src, s->src_filename, open_flags, NULL) < 0) {
+        error_report("%s: %s: cannot fallback to source image", __func__,
+                     s->src_filename);
+        abort();
+    }
+    /*
+     * The instance stays on the list as "failed", but must not keep its
+     * savevm/vm-change handlers, the drive reference or the in-use mark.
+     */
+    blkcopy_cleanup(s);
+    return -1;
+}
+
+/*
+ * Create a block copy instance that copies @device to the existing image
+ * @filename and add it to block_copy_list.
+ *
+ * @shared_base: incremental copy, see BdrvCopyState.shared_base
+ * @incoming: create the instance without starting it; the state machine
+ * timer is only armed for non-incoming instances
+ *
+ * Returns the new instance, or NULL with a QError reported when the device
+ * does not exist, the destination cannot be opened, a copy of the device is
+ * still in progress, or source and destination differ in size.
+ */
+static BdrvCopyState *bdrv_copy_setup(const char *device, const char *filename,
+ bool shared_base, bool incoming)
+{
+ int64_t sectors;
+ BdrvCopyState *blkcopy, *safe;
+ BlockDriverState *src, *dst;
+
+ src = bdrv_find(device);
+ if (!src) {
+ qerror_report(QERR_DEVICE_NOT_FOUND, device);
+ return NULL;
+ }
+
+ /* the destination gets an anonymous BlockDriverState of its own */
+ dst = bdrv_new("");
+ if (bdrv_open(dst, filename, src->open_flags, NULL) < 0) {
+ bdrv_delete(dst);
+ qerror_report(QERR_OPEN_FILE_FAILED, filename);
+ return NULL;
+ }
+
+ /*
+ * Drop a finished or failed instance left over for this device; refuse
+ * to start while another copy of the device is still running.
+ */
+ QLIST_FOREACH_SAFE(blkcopy, &block_copy_list, list, safe) {
+ if (!strcmp(blkcopy->device_name, src->device_name)) {
+ if (blkcopy->stage == STAGE_SWITCH_FINISHED || blkcopy->failed) {
+ blkcopy_free(blkcopy);
+ } else {
+ qerror_report(QERR_IN_PROGRESS, "block copy");
+ bdrv_delete(dst);
+ return NULL;
+ }
+ }
+ }
+
+ /* sizes are compared in whole sectors */
+ sectors = bdrv_getlength(src) >> BDRV_SECTOR_BITS;
+ if (sectors != bdrv_getlength(dst) >> BDRV_SECTOR_BITS) {
+ qerror_report(QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS);
+ bdrv_delete(dst);
+ return NULL;
+ }
+
+ blkcopy = qemu_mallocz(sizeof(BdrvCopyState));
+ blkcopy->src = src;
+ blkcopy->dst = dst;
+ blkcopy->curr_sector = 0;
+ blkcopy->nr_sectors = sectors;
+ blkcopy_set_stage(blkcopy, STAGE_BULK);
+ blkcopy->aio_timer = qemu_new_timer_ms(rt_clock, aio_timer, blkcopy);
+ blkcopy->shared_base = shared_base;
+ blkcopy->incoming = incoming;
+ /*
+ * The state was zero-filled above and at most size - 1 bytes are copied,
+ * so the name buffers stay NUL terminated.
+ */
+ strncpy(blkcopy->device_name, blkcopy->src->device_name,
+ sizeof(blkcopy->device_name) - 1);
+ strncpy(blkcopy->src_filename, blkcopy->src->filename,
+ sizeof(blkcopy->src_filename) - 1);
+ strncpy(blkcopy->dst_filename, filename,
+ sizeof(blkcopy->dst_filename) - 1);
+
+ /* both are undone by blkcopy_cleanup() */
+ drive_get_ref(drive_get_by_blockdev(src));
+ bdrv_set_in_use(src, 1);
+ if (!incoming) {
+ qemu_mod_timer(blkcopy->aio_timer, qemu_get_clock_ms(rt_clock));
+ }
+
+ QLIST_INSERT_HEAD(&block_copy_list, blkcopy, list);
+ return blkcopy;
+}
+
+/*
+ * VM run state change handler of a block copy instance.
+ *
+ * When the VM stops, all outstanding copy requests are flushed, the state
+ * machine timer is deleted and the instance is marked stopped. When the VM
+ * runs again, a stopped instance is restarted by re-arming the timer.
+ */
+static void block_copy_vmchange(void *opaque, int running, int reason)
+{
+    BdrvCopyState *s = opaque;
+
+    if (running) {
+        if (s->stopped) {
+            s->stopped = 0;
+            qemu_mod_timer(s->aio_timer, qemu_get_clock_ms(rt_clock));
+        }
+        return;
+    }
+
+    /* flush at least once, then until nothing is outstanding */
+    for (;;) {
+        qemu_aio_flush();
+        if (QLIST_EMPTY(&s->io_list)) {
+            break;
+        }
+    }
+    qemu_del_timer(s->aio_timer);
+    s->stopped = 1;
+}
+
+#define BDRV_SAVE_VERSION 1
+
+/*
+ * savevm handler: write the state of one block copy instance to @f.
+ *
+ * Layout: device name length (1 byte) and name, shared_base (1 byte),
+ * curr_sector (be64), stage (1 byte). While the copy has not reached the
+ * mirrored stage, the size of the source's dirty bitmap (be32) and the
+ * bitmap itself follow.
+ */
+static void block_copy_save(QEMUFile *f, void *opaque)
+{
+    BdrvCopyState *s = opaque;
+    uint8_t name_len = strlen(s->device_name);
+    int64_t sectors;
+    int bitmap_size;
+
+    qemu_put_byte(f, name_len);
+    qemu_put_buffer(f, (uint8_t *)s->device_name, name_len);
+
+    qemu_put_byte(f, s->shared_base);
+    qemu_put_be64(f, s->curr_sector);
+    qemu_put_byte(f, s->stage);
+
+    if (s->stage >= STAGE_MIRROR_WRITES) {
+        /* copy finished, no dirty bitmap to transfer */
+        return;
+    }
+
+    sectors = bdrv_getlength(s->src) >> BDRV_SECTOR_BITS;
+    bitmap_size = (sectors + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1) /
+                  (BDRV_SECTORS_PER_DIRTY_CHUNK * 8);
+    qemu_put_be32(f, bitmap_size);
+    qemu_put_buffer(f, (uint8_t *)s->src->dirty_bitmap, bitmap_size);
+}
+
+/*
+ * loadvm handler: restore a block copy instance from @f (layout as written
+ * by block_copy_save()) and start it.
+ *
+ * The section is applied to the instance on block_copy_list whose device
+ * name matches, provided it was created as "incoming" with the same
+ * shared_base setting; otherwise the section is ignored and 0 is returned.
+ *
+ * Returns 0 on success or when the section is ignored, -EINVAL when the
+ * transferred dirty bitmap has an invalid size or does not match the size
+ * expected for the local source image.
+ *
+ * The savevm and vm-change handlers are not registered here: do_bdrv_copy()
+ * already registers both for every instance, incoming ones included, and
+ * registering them again would leave a vm-change handler behind that is
+ * never removed.
+ *
+ * NOTE(review): the transferred stage only decides whether a bitmap
+ * follows; s->stage itself is not restored - confirm this is intended.
+ */
+static int block_copy_load(QEMUFile *f, void *opaque, int version_id)
+{
+    uint8_t len;
+    char device_name[DEV_LEN];
+    bool shared_base;
+    enum BdrvCopyStage stage;
+    BdrvCopyState *s;
+    uint8_t *bitmap = NULL;
+    int bitmap_size = 0;
+    uint64_t curr_sector;
+    int ret = 0;
+
+    len = qemu_get_byte(f);
+    qemu_get_buffer(f, (uint8_t *)device_name, len);
+    device_name[len] = '\0';
+
+    shared_base = qemu_get_byte(f);
+    curr_sector = qemu_get_be64(f);
+    stage = qemu_get_byte(f);
+    if (stage < STAGE_MIRROR_WRITES) {
+        bitmap_size = qemu_get_be32(f);
+        if (bitmap_size < 0) {
+            return -EINVAL;
+        }
+        if (bitmap_size > 0) {
+            bitmap = qemu_mallocz(bitmap_size);
+            qemu_get_buffer(f, bitmap, bitmap_size);
+        }
+    }
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (!strcmp(s->device_name, device_name)) {
+            break;
+        }
+    }
+
+    if (!s || !s->incoming || s->shared_base != shared_base) {
+        goto out;
+    }
+
+    if (stage < STAGE_MIRROR_WRITES) {
+        /*
+         * Same formula block_copy_save() uses for the bitmap it sends.
+         * Assumes bdrv_set_dirty_tracking() sizes dirty_bitmap the same
+         * way - TODO confirm.
+         */
+        int64_t expected = ((bdrv_getlength(s->src) >> BDRV_SECTOR_BITS) +
+                            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1) /
+                           (BDRV_SECTORS_PER_DIRTY_CHUNK * 8);
+
+        if (bitmap_size != expected) {
+            ret = -EINVAL;
+            goto out;
+        }
+    }
+
+    s->incoming = false;
+    s->curr_sector = curr_sector;
+
+    if (stage < STAGE_MIRROR_WRITES) {
+        bdrv_set_dirty_tracking(s->src, 1);
+        if (bitmap) {
+            memcpy(s->src->dirty_bitmap, bitmap, bitmap_size);
+        }
+    }
+    qemu_mod_timer(s->aio_timer, qemu_get_clock_ms(rt_clock));
+
+out:
+    qemu_free(bitmap);
+    return ret;
+}
+
+/*
+ * Monitor command block_copy: start copying "device" to the image
+ * "filename". The optional booleans "incremental" and "incoming" default to
+ * false.
+ *
+ * Dirty tracking on the source is only switched on for non-incoming
+ * instances; the savevm and vm-change handlers are registered for every
+ * instance. Returns 0 on success, -1 if the instance could not be set up.
+ */
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    const char *filename = qdict_get_str(qdict, "filename");
+    bool shared_base = qdict_get_try_bool(qdict, "incremental", 0);
+    bool incoming = qdict_get_try_bool(qdict, "incoming", 0);
+    BdrvCopyState *s = bdrv_copy_setup(device, filename, shared_base,
+                                       incoming);
+
+    if (!s) {
+        return -1;
+    }
+
+    if (!incoming) {
+        bdrv_set_dirty_tracking(s->src, 1);
+    }
+    register_savevm(NULL, "block_copy", -1, BDRV_SAVE_VERSION,
+                    block_copy_save, block_copy_load, s);
+    s->change_entry = qemu_add_vm_change_state_handler(block_copy_vmchange,
+                                                       s);
+    return 0;
+}
+
+/*
+ * Monitor command block_copy_cancel: cancel the block copy of "device".
+ *
+ * Outstanding copy requests are flushed before the instance is cleaned up
+ * and freed. Returns 0 on success; -1 with QERR_DEVICE_NOT_FOUND when the
+ * device has no copy instance or the instance already completed or failed.
+ */
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    BdrvCopyState *s;
+
+    /* s is NULL after the loop when no entry matched */
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (strcmp(s->device_name, device) == 0) {
+            break;
+        }
+    }
+
+    if (!s || s->stage == STAGE_SWITCH_FINISHED || s->failed) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    s->cancelled = 1;
+    for (;;) {
+        qemu_aio_flush();
+        if (QLIST_EMPTY(&s->io_list)) {
+            break;
+        }
+    }
+    blkcopy_cleanup(s);
+    blkcopy_free(s);
+
+    return 0;
+}
+
+/*
+ * Monitor query "block-copy": return in *ret_data a list with one dict per
+ * block copy instance.
+ *
+ * Every dict has 'device' and 'status' ("failed", "stopped", "active",
+ * "mirrored" or "completed"). Active instances also carry an 'info' dict
+ * with 'transferred', 'remaining' and 'total', all in bytes.
+ */
+void do_info_blockcopy(Monitor *mon, QObject **ret_data)
+{
+    QList *result = qlist_new();
+    BdrvCopyState *s;
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        const char *status;
+        bool active = false;
+        QObject *entry;
+
+        if (s->failed) {
+            status = "failed";
+        } else if (s->stopped) {
+            status = "stopped";
+        } else if (s->stage < STAGE_MIRROR_WRITES) {
+            status = "active";
+            active = true;
+        } else if (s->stage < STAGE_SWITCH_FINISHED) {
+            status = "mirrored";
+        } else {
+            status = "completed";
+        }
+
+        entry = qobject_from_jsonf("{ 'device': %s, 'status': %s }",
+                                   s->device_name, status);
+
+        if (active) {
+            QObject *info;
+
+            /* FIXME: add dirty stage progress? */
+            info = qobject_from_jsonf("{ 'transferred': %lld, "
+                                      "'remaining': %lld,"
+                                      "'total': %lld }",
+                                      s->completed_sectors * BDRV_SECTOR_SIZE,
+                                      (s->nr_sectors - s->completed_sectors) *
+                                      BDRV_SECTOR_SIZE,
+                                      (s->nr_sectors * BDRV_SECTOR_SIZE));
+
+            qdict_put_obj(qobject_to_qdict(entry), "info", info);
+        }
+        qlist_append_obj(result, entry);
+    }
+
+    *ret_data = QOBJECT(result);
+}
+
+/*
+ * qlist_iter() callback: print one entry of the list built by
+ * do_info_blockcopy() on the monitor passed in @opaque. Byte counts of the
+ * optional "info" dict are printed in kbytes.
+ */
+static void blockcopy_print_dict(QObject *obj, void *opaque)
+{
+    Monitor *mon = opaque;
+    QDict *entry = qobject_to_qdict(obj);
+
+    monitor_printf(mon, "%s: status=%s\n",
+                   qdict_get_str(entry, "device"),
+                   qdict_get_str(entry, "status"));
+
+    if (qdict_haskey(entry, "info")) {
+        QDict *info = qobject_to_qdict(qdict_get(entry, "info"));
+
+        monitor_printf(mon, "transferred = %" PRIu64 " kbytes\n",
+                       qdict_get_int(info, "transferred") >> 10);
+        monitor_printf(mon, "remaining = %" PRIu64 " kbytes\n",
+                       qdict_get_int(info, "remaining") >> 10);
+        monitor_printf(mon, "total = %" PRIu64 " kbytes\n",
+                       qdict_get_int(info, "total") >> 10);
+    }
+
+    monitor_printf(mon, "\n");
+}
+
+/*
+ * Human monitor printer for "info block-copy": @data is the list produced
+ * by do_info_blockcopy(); each entry is printed by blockcopy_print_dict().
+ */
+void do_info_blockcopy_print(Monitor *mon, const QObject *data)
+{
+    QList *entries = qobject_to_qlist(data);
+
+    qlist_iter(entries, blockcopy_print_dict, mon);
+}
+
+
+/*
+ * Return true if at least one block copy instance is neither failed nor
+ * switched over to its destination yet.
+ */
+bool block_copy_active(void)
+{
+    BdrvCopyState *s;
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (!s->failed && s->stage < STAGE_SWITCH_FINISHED) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
===================================================================
@@ -0,0 +1,26 @@
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_COPY_H
+#define BLOCK_COPY_H
+
+/*
+ * Monitor command handlers: start, cancel and finish (switch to the
+ * destination image) a live block copy. Each returns 0 on success and -1
+ * on failure.
+ */
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_bdrv_copy_switch(Monitor *mon, const QDict *qdict, QObject **ret_data);
+
+/* "info block-copy": build the status list and print it on the monitor */
+void do_info_blockcopy_print(Monitor *mon, const QObject *data);
+void do_info_blockcopy(Monitor *mon, QObject **ret_data);
+
+/* true while any block copy is neither failed nor switched over */
+bool block_copy_active(void);
+
+#endif /* BLOCK_COPY_H */
+
===================================================================
@@ -806,6 +806,68 @@ Set maximum speed to @var{value} (in byt
ETEXI
{
+ .name = "block_copy",
+ .args_type = "device:s,filename:s,incremental:-i,incoming:-m",
+ .params = "device filename [-i] [-m]",
+ .help = "live block copy device to image"
+ "\n\t\t\t -i for incremental copy "
+ "(base image shared between original and destination)\n"
+ "-m for incoming migration (stopped block copy instance)",
+ .user_print = monitor_user_noop,
+ .mhandler.cmd_new = do_bdrv_copy,
+ },
+
+STEXI
+@item block_copy @var{device} @var{filename} [-i] [-m]
+@findex block_copy
+Live copy block device @var{device} to image @var{filename}.
+ -i for incremental copy (base image is shared)
+ -m for incoming migration (stopped block copy instance)
+
+Destination image @var{filename} must be created with qemu-img prior
+to execution of this command, with image size equal to the original
+image size.
+
+Incremental copy allows the destination image @var{filename} to share
+a common base image with the original image. This option skips copying
+blocks which are not allocated in the original image.
+
+To support incoming migration, use -m to specify the device and destination
+image. This instance is stopped, waiting for data from migration to continue.
+ETEXI
+
+ {
+ .name = "block_copy_cancel",
+ .args_type = "device:s",
+ .params = "device",
+ .help = "cancel live block copy",
+ .user_print = monitor_user_noop,
+ .mhandler.cmd_new = do_bdrv_copy_cancel,
+ },
+
+STEXI
+@item block_copy_cancel @var{device}
+@findex block_copy_cancel
+Cancel live block copy on @var{device}.
+ETEXI
+
+ {
+ .name = "block_copy_switch",
+ .args_type = "device:s",
+ .params = "device",
+ .help = "finish live block copy",
+ .user_print = monitor_user_noop,
+ .mhandler.cmd_new = do_bdrv_copy_switch,
+ },
+
+STEXI
+@item block_copy_switch @var{device}
+@findex block_copy_switch
+Finish live block copy on @var{device} by switching
+to destination image.
+ETEXI
+
+ {
.name = "migrate_set_downtime",
.args_type = "value:T",
.params = "value",
@@ -1352,6 +1414,8 @@ show device tree
show qdev device model list
@item info roms
show roms
+@item info block-copy
+show block copy status
@end table
ETEXI
===================================================================
@@ -45,6 +45,7 @@
#include "balloon.h"
#include "qemu-timer.h"
#include "migration.h"
+#include "block-copy.h"
#include "kvm.h"
#include "acl.h"
#include "qint.h"
@@ -3101,6 +3102,14 @@ static const mon_cmd_t info_cmds[] = {
},
#endif
{
+ .name = "block-copy",
+ .args_type = "",
+ .params = "",
+ .help = "show block copy status",
+ .user_print = do_info_blockcopy_print,
+ .mhandler.info_new = do_info_blockcopy,
+ },
+ {
.name = NULL,
},
};
@@ -3242,6 +3251,14 @@ static const mon_cmd_t qmp_query_cmds[]
.mhandler.info_async = do_info_balloon,
.flags = MONITOR_CMD_ASYNC,
},
+ {
+ .name = "block-copy",
+ .args_type = "",
+ .params = "",
+ .help = "show block copy status",
+ .user_print = do_info_blockcopy_print,
+ .mhandler.info_new = do_info_blockcopy,
+ },
{ /* NULL */ },
};
===================================================================
@@ -581,6 +581,100 @@ Example:
EQMP
{
+ .name = "block_copy",
+ .args_type = "device:s,filename:s,incremental:-i,incoming:-m",
+ .params = "device filename [-i] [-m]",
+ .help = "live block copy device to image"
+ "\n\t\t\t -i for incremental copy "
+ "(base image shared between src and destination)\n"
+ "-m for incoming migration (stopped block copy instance)",
+ .user_print = monitor_user_noop,
+ .mhandler.cmd_new = do_bdrv_copy,
+ },
+
+SQMP
+block_copy
+-------
+
+Live block copy.
+
+Arguments:
+
+- "device": device name (json-string)
+- "filename": target image filename (json-string)
+- "incremental": incremental disk copy (json-bool, optional)
+- "incoming": incoming migration (json-bool, optional)
+
+Example:
+
+-> { "execute": "block_copy",
+ "arguments": { "device": "ide0-hd1",
+ "filename": "/mnt/new-disk.img"
+ } }
+
+<- { "return": {} }
+
+Notes:
+
+(1) The 'query-block-copy' command should be used to check block copy progress
+ and final result (this information is provided by the 'status' member)
+(2) Boolean arguments "incremental" and "incoming" default to false
+
+EQMP
+
+ {
+ .name = "block_copy_cancel",
+ .args_type = "device:s",
+ .params = "device",
+ .help = "cancel live block copy",
+ .user_print = monitor_user_noop,
+ .mhandler.cmd_new = do_bdrv_copy_cancel,
+ },
+
+SQMP
+block_copy_cancel
+--------------
+
+Cancel live block copy.
+
+Arguments:
+
+- device: device name (json-string)
+
+Example:
+
+-> { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } }
+<- { "return": {} }
+
+EQMP
+
+ {
+ .name = "block_copy_switch",
+ .args_type = "device:s",
+ .params = "device",
+ .help = "finish live block copy",
+ .user_print = monitor_user_noop,
+ .mhandler.cmd_new = do_bdrv_copy_switch,
+ },
+
+SQMP
+block_copy_switch
+--------------
+
+Finish live block copy, switching device to destination image.
+
+Arguments:
+
+- device: device name (json-string)
+
+Example:
+
+-> { "execute": "block_copy_switch", "arguments": { "device": "ide0-hd1" } }
+<- { "return": {} }
+
+EQMP
+
+ {
.name = "netdev_add",
.args_type = "netdev:O",
.params = "[user|tap|socket],id=str[,prop=value][,...]",
@@ -1744,6 +1838,51 @@ Examples:
EQMP
SQMP
+query-block-copy
+-------------
+
+Live block copy status.
+
+Each block copy instance information is stored in a json-object and the returned
+value is a json-array of all instances.
+
+Each json-object contains the following:
+
+- "device": device name (json-string)
+- "status": block copy status (json-string)
+ - Possible values: "failed", "stopped", "active", "mirrored", "completed", meaning:
+ - failed: block copy failed.
+ - stopped: block copy stopped.
+ - active: block copy active, copying to destination image.
+ - mirrored: block copy active, finished copying to destination
+ image, writes are mirrored.
+ - completed: block copy completed.
+
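+- "info": A json-object with the statistics information, if status is "active":
+ - "transferred": amount copied so far, in bytes (json-int)
+ - "remaining": amount left to copy, in bytes (json-int)
+ - "total": size of the image, in bytes (json-int)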
+
+Example:
+
+Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed:
+
+-> { "execute": "query-block-copy" }
+<- {
+ "return":[
+ {"device":"ide1-hd0",
+ "status":"active",
+ "info":{
+ "transferred":246415360,
+ "remaining":827326464,
+ "total":1073741824
+ }
+ },
+ {"device":"ide1-hd1",
+ "status":"failed"
+ }
+ ]
+ }
+
+EQMP
+
+SQMP
query-balloon
-------------
===================================================================
@@ -98,7 +98,7 @@ common-obj-y += buffered_file.o migratio
common-obj-y += qemu-char.o savevm.o #aio.o
common-obj-y += msmouse.o ps2.o
common-obj-y += qdev.o qdev-properties.o
-common-obj-y += block-migration.o iohandler.o
+common-obj-y += block-migration.o iohandler.o block-copy.o
common-obj-y += pflib.o
common-obj-y += bitmap.o bitops.o
===================================================================
@@ -0,0 +1,50 @@
+Live block copy
+===============
+
+Block copy allows an image to be copied to a destination image, while
+the guest is operating on the source image. The command is:
+
+block_copy {device} {filename} [-i][-m]
+
+Destination image {filename} must be created with qemu-img prior to
+execution of this command, with image size equal to the original image
+size.
+
+Incremental copy allows the destination image to share a common base
+image with the original image. This option skips copying blocks which
+are not allocated in the original image.
+
+
+Command flow
+============
+
+Once the copy initiated with the block_copy command has finished,
+writes by the guest to the source image will be mirrored to the
+destination image.
+
+This state is indicated by the "mirrored" status of a device, visible in
+query-block-copy command output.
+
+Once the device is in the mirrored state, the management application can
+issue the block_copy_switch command to complete the block copy operation.
+
+A pseudo-algorithm follows:
+
+ block_copy(dev, file);
+ do {
+ query_block_copy(block_copy_state);
+ sleep(s)
+ } while (!block_copy_state.dev.mirrored);
+ block_copy_switch(dev);
+
+Migration
+=========
+
+Every block copy instance that is active on the source must also be
+created on the destination VM before migration is performed. Example:
+
+1) start VM in incoming mode.
+2) for each active block copy instance on the source, run:
+ (qemu) block_copy device /path/to/image.dst [-i] -m
+
+
Support live image copy + switch. That is, copy an image backing a guest hard disk to a destination image (destination image must be created separately), and switch to this copy. Command syntax: block_copy device filename [-i] -- live block copy device to image -i for incremental copy (base image shared between src and destination) Please refer to qmp-commands diff for more details. Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>