Message ID | 20110222170115.710717278@redhat.com |
---|---|
State | New |
Headers | show |
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote: > Support live image copy + switch. That is, copy an image backing > a guest hard disk to a destination image (destination image must > be created separately), and switch to this copy. > > Command syntax: > > block_copy device filename [commit_filename] [-i] -- live block copy device to image > optional commit filename > -i for incremental copy (base image shared between src and destination) > > Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com> > > Index: qemu/block-copy.c > =================================================================== > --- /dev/null > +++ qemu/block-copy.c > @@ -0,0 +1,741 @@ > +/* > + * QEMU live block copy > + * > + * Copyright (C) 2010 Red Hat Inc. > + * > + * Authors: Marcelo Tosatti<mtosatti@redhat.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + */ > + > +#include "qemu-common.h" > +#include "block_int.h" > +#include "blockdev.h" > +#include "qemu-queue.h" > +#include "qemu-timer.h" > +#include "monitor.h" > +#include "block-copy.h" > +#include "migration.h" > +#include "sysemu.h" > +#include "qjson.h" > +#include<assert.h> > + > +#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK<< BDRV_SECTOR_BITS) > +#define MAX_IS_ALLOCATED_SEARCH 65536 > + > +/* > + * Stages: > + * > + * STAGE_BULK: bulk reads/writes in progress > + * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress > + * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress > + * STAGE_SWITCH_FINISHED: switched to new image. > + */ > + > +enum BdrvCopyStage { > + STAGE_BULK, > + STAGE_BULK_FINISHED, > + STAGE_DIRTY, > + STAGE_SWITCH_FINISHED, > +}; > + > +typedef struct BdrvCopyState { > + BlockDriverState *src; > + BlockDriverState *dst; > + bool shared_base; > + > + int64_t curr_sector; > + int64_t completed_sectors; > + int64_t nr_sectors; > + > + enum BdrvCopyStage stage; > + int inflight_reads; > + int error; > + int failed; > + int cancelled; > + QLIST_HEAD(, BdrvCopyBlock) io_list; > + unsigned long *aio_bitmap; > + QEMUTimer *aio_timer; > + QLIST_ENTRY(BdrvCopyState) list; > + > + int64_t blocks; > + int64_t total_time; > + > + char src_device_name[32]; > + char dst_filename[1024]; > + int commit_fd; > +} BdrvCopyState; > + > +typedef struct BdrvCopyBlock { > + BdrvCopyState *state; > + uint8_t *buf; > + int64_t sector; > + int64_t nr_sectors; > + struct iovec iov; > + QEMUIOVector qiov; > + BlockDriverAIOCB *aiocb; > + int64_t time; > + QLIST_ENTRY(BdrvCopyBlock) list; > +} BdrvCopyBlock; > + > +static QLIST_HEAD(, BdrvCopyState) block_copy_list = > + QLIST_HEAD_INITIALIZER(block_copy_list); > + > +static void alloc_aio_bitmap(BdrvCopyState *s) > +{ > + BlockDriverState *bs = s->src; > + int64_t bitmap_size; > + > + bitmap_size = (bdrv_getlength(bs)>> BDRV_SECTOR_BITS) + > + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; > + bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; > + > + s->aio_bitmap = qemu_mallocz(bitmap_size); > +} > + > +static bool aio_inflight(BdrvCopyState *s, int64_t sector) > +{ > + int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; > + > + if (s->aio_bitmap&& > + (sector<< BDRV_SECTOR_BITS)< bdrv_getlength(s->src)) { > + return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)]& > + (1UL<< (chunk % (sizeof(unsigned long) * 8)))); > + } else { > + return 0; > + } > +} > + > +static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num, > + int nb_sectors, int set) > +{ > + int64_t start, end; > + unsigned long val, idx, bit; > + > + start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; > + end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; > + > + for (; start<= end; start++) { > + idx = start / (sizeof(unsigned long) * 8); > + bit = start % (sizeof(unsigned long) * 8); > + val = s->aio_bitmap[idx]; > + if (set) { > + if (!(val& (1UL<< bit))) { > + val |= 1UL<< bit; > + } > + } else { > + if (val& (1UL<< bit)) { > + val&= ~(1UL<< bit); > + } > + } > + s->aio_bitmap[idx] = val; > + } > +} > + > +static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage) > +{ > + s->stage = stage; > + > + switch (stage) { > + case STAGE_BULK: > + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK); > + break; > + case STAGE_BULK_FINISHED: > + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED); > + break; > + case STAGE_DIRTY: > + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY); > + break; > + case STAGE_SWITCH_FINISHED: > + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED); > + break; > + default: > + break; > + } > +} > + > +static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret) > +{ > + s->error = ret; > + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock)); > +} > + > +static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time) > +{ > + s->blocks++; > + s->total_time += time; > +} > + > +static void blk_copy_write_cb(void *opaque, int ret) > +{ > + BdrvCopyBlock *blk = opaque; > + BdrvCopyState *s = blk->state; > + > + if (ret< 0) { > + QLIST_REMOVE(blk, list); > + qemu_free(blk->buf); > + qemu_free(blk); > + blk_copy_handle_cb_error(s, ret); > + return; > + } > + > + QLIST_REMOVE(blk, list); > + add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time); > + > + /* schedule switch to STAGE_DIRTY on last bulk write completion */ > + if (blk->state->stage == STAGE_BULK_FINISHED) { > + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock)); > + } > + > + if (blk->state->stage> STAGE_BULK_FINISHED) { > + set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0); > + } > + > + qemu_free(blk->buf); > + qemu_free(blk); > +} > + > +static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk) > +{ > + BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock)); > + blk->state = s; > + blk->sector = read_blk->sector; > + blk->nr_sectors = read_blk->nr_sectors; > + blk->time = read_blk->time; > + blk->buf = read_blk->buf; > + QLIST_INSERT_HEAD(&s->io_list, blk, list); > + > + blk->iov.iov_base = read_blk->buf; > + blk->iov.iov_len = read_blk->iov.iov_len; > + qemu_iovec_init_external(&blk->qiov,&blk->iov, 1); > + > + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE); > + blk->aiocb = bdrv_aio_writev(s->dst, blk->sector,&blk->qiov, > + blk->iov.iov_len / BDRV_SECTOR_SIZE, > + blk_copy_write_cb, blk); > + if (!blk->aiocb) { > + s->error = 1; > + goto error; > + } > + > + return; > + > +error: > + QLIST_REMOVE(blk, list); > + qemu_free(read_blk->buf); > + qemu_free(blk); > +} > + > +static void blk_copy_read_cb(void *opaque, int ret) > +{ > + BdrvCopyBlock *blk = opaque; > + BdrvCopyState *s = blk->state; > + > + s->inflight_reads--; > + if (ret< 0) { > + QLIST_REMOVE(blk, list); > + qemu_free(blk->buf); > + qemu_free(blk); > + blk_copy_handle_cb_error(s, ret); > + return; > + } > + blk_copy_issue_write(s, blk); > + QLIST_REMOVE(blk, list); > + qemu_free(blk); > + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock)); > +} > + > +static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector, > + int nr_sectors) > +{ > + BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock)); > + blk->buf = qemu_mallocz(BLOCK_SIZE); > + blk->state = s; > + blk->sector = sector; > + blk->nr_sectors = nr_sectors; > + QLIST_INSERT_HEAD(&s->io_list, blk, list); > + > + blk->iov.iov_base = blk->buf; > + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; > + qemu_iovec_init_external(&blk->qiov,&blk->iov, 1); > + > + s->inflight_reads++; > + blk->time = qemu_get_clock_ns(rt_clock); > + blk->aiocb = bdrv_aio_readv(s->src, sector,&blk->qiov, nr_sectors, > + blk_copy_read_cb, blk); > + if (!blk->aiocb) { > + s->error = 1; > + goto error; > + } > + > + return; > + > +error: > + s->inflight_reads--; > + QLIST_REMOVE(blk, list); > + qemu_free(blk->buf); > + qemu_free(blk); > +} > + > +static bool blkcopy_can_switch(BdrvCopyState *s) > +{ > + int64_t remaining_dirty; > + int64_t avg_transfer_time; > + > + remaining_dirty = bdrv_get_dirty_count(s->src); > + if (remaining_dirty == 0 || s->blocks == 0) { > + return true; > + } > + > + avg_transfer_time = s->total_time / s->blocks; > + if ((remaining_dirty * avg_transfer_time)<= migrate_max_downtime()) { > + return true; > + } > + return false; > +} > + > +static int blk_issue_reads_dirty(BdrvCopyState *s) > +{ > + int64_t sector; > + > + for (sector = s->curr_sector; sector< s->nr_sectors;) { > + if (bdrv_get_dirty(s->src, sector)&& !aio_inflight(s, sector)) { > + int nr_sectors = MIN(s->nr_sectors - s->curr_sector, > + BDRV_SECTORS_PER_DIRTY_CHUNK); > + > + blk_copy_issue_read(s, sector, nr_sectors); > + bdrv_reset_dirty(s->src, sector, nr_sectors); > + set_aio_inflight(s, sector, nr_sectors, 1); > + break; > + } > + > + sector += BDRV_SECTORS_PER_DIRTY_CHUNK; > + s->curr_sector = sector; > + } > + > + if (sector>= s->nr_sectors) { > + s->curr_sector = 0; > + } > + return 0; > +} > + > +static int blk_issue_reads_bulk(BdrvCopyState *s) > +{ > + int nr_sectors; > + int64_t curr_sector = s->curr_sector; > + > + if (s->shared_base) { > + while (curr_sector< s->nr_sectors&& > + !bdrv_is_allocated(s->src, curr_sector, > + MAX_IS_ALLOCATED_SEARCH,&nr_sectors)) { > + curr_sector += nr_sectors; > + } > + } > + > + if (curr_sector>= s->nr_sectors) { > + s->curr_sector = 0; > + return 1; > + } > + > + curr_sector&= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1); > + nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; > + > + blk_copy_issue_read(s, s->curr_sector, nr_sectors); > + s->curr_sector += nr_sectors; > + s->completed_sectors = curr_sector; > + return 0; > +} > + > +static void blkcopy_finish(BdrvCopyState *s) > +{ > + int64_t sector; > + uint8_t *buf; > + > + buf = qemu_malloc(BLOCK_SIZE); > + > + /* FIXME: speed up loop, get_next_dirty_block? */ > + for (sector = 0; sector< s->nr_sectors; > + sector += BDRV_SECTORS_PER_DIRTY_CHUNK) { > + if (bdrv_get_dirty(s->src, sector)) { > + int nr_sectors = MIN(s->nr_sectors - sector, > + BDRV_SECTORS_PER_DIRTY_CHUNK); > + > + memset(buf, 0, BLOCK_SIZE); > + if (bdrv_read(s->src, sector, buf, nr_sectors)< 0) { > + goto error; > + } > + if (bdrv_write(s->dst, sector, buf, nr_sectors)< 0) { > + goto error; > + } > + bdrv_reset_dirty(s->src, sector, nr_sectors); > + } > + > + if (bdrv_get_dirty_count(s->src) == 0) > + break; > + } > + qemu_free(buf); > + return; > + > +error: > + qemu_free(buf); > + s->error = 1; > +} > + > +static int write_commit_file(BdrvCopyState *s) > +{ > + char commit_msg[1400]; > + const char *buf = commit_msg; > + int len, ret; > + > + sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name, > + s->dst_filename); > + > + len = strlen(commit_msg); > + while (len> 0) { > + ret = write(s->commit_fd, buf, len); > + if (ret == -1&& errno == EINTR) { > + continue; > + } > + if (ret<= 0) { > + return -errno; > + } > + buf += ret; > + len -= ret; > + } > + > + if (fsync(s->commit_fd) == -1) { > + return -errno; > + } > > This is more or less black magic. What is this commit file used for and why aren't we using something like a QMP event? Regards, Anthony Liguori
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote: This patch is large and not properly inlined. Can it be split up into more reviewable chunks? Regards, Anthony Liguori
On Tue, Feb 22, 2011 at 02:50:09PM -0600, Anthony Liguori wrote: > >+static int write_commit_file(BdrvCopyState *s) > >+{ > >+ char commit_msg[1400]; > >+ const char *buf = commit_msg; > >+ int len, ret; > >+ > >+ sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name, > >+ s->dst_filename); > >+ > >+ len = strlen(commit_msg); > >+ while (len> 0) { > >+ ret = write(s->commit_fd, buf, len); > >+ if (ret == -1&& errno == EINTR) { > >+ continue; > >+ } > >+ if (ret<= 0) { > >+ return -errno; > >+ } > >+ buf += ret; > >+ len -= ret; > >+ } > >+ > >+ if (fsync(s->commit_fd) == -1) { > >+ return -errno; > >+ } > > > > This is more or less black magic. What is this commit file used for > and why aren't we using something like a QMP event? The commit file is considered reliable storage for the result of image switch operation. Think of the following scenario: - mgmt app requests live copy of guests ide1-hd0 from /a/image.img to /b/image.img. - mgmt app dies. - guest switches to /b/image.img, /a/image.img is outdated. - guest dies. Notifying the switch via QMP would not be reliable in this case. Will add this as a comment in the code.
On 02/22/2011 03:07 PM, Marcelo Tosatti wrote: > On Tue, Feb 22, 2011 at 02:50:09PM -0600, Anthony Liguori wrote: > >>> +static int write_commit_file(BdrvCopyState *s) >>> +{ >>> + char commit_msg[1400]; >>> + const char *buf = commit_msg; >>> + int len, ret; >>> + >>> + sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name, >>> + s->dst_filename); >>> + >>> + len = strlen(commit_msg); >>> + while (len> 0) { >>> + ret = write(s->commit_fd, buf, len); >>> + if (ret == -1&& errno == EINTR) { >>> + continue; >>> + } >>> + if (ret<= 0) { >>> + return -errno; >>> + } >>> + buf += ret; >>> + len -= ret; >>> + } >>> + >>> + if (fsync(s->commit_fd) == -1) { >>> + return -errno; >>> + } >>> >>> >> This is more or less black magic. What is this commit file used for >> and why aren't we using something like a QMP event? >> > The commit file is considered reliable storage for the result of image > switch operation. Think of the following scenario: > > - mgmt app requests live copy of guests ide1-hd0 > from /a/image.img to /b/image.img. > - mgmt app dies. > - guest switches to /b/image.img, /a/image.img is outdated. > - guest dies. > > Notifying the switch via QMP would not be reliable in this case. > But this is true of any number of operations in QEMU such as hotplug where if a management tool dies after requesting hotplug and then the guest dies before the management tool reconnects, it's never known whether it's truly connected or not. The only way to robustly solve this is for QEMU to maintain a stateful config file. A one-off for this particular command doesn't seem wise to me. Regards, Anthony Liguori > Will add this as a comment in the code. > >
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote: > + > +static void blkcopy_switch(BdrvCopyState *s) > +{ > + char src_filename[1024]; > + int open_flags; > + > + strncpy(src_filename, s->src->filename, sizeof(src_filename)); > + open_flags = s->src->open_flags; > strncpy doesn't leave a NULL terminator if it runs out of space. I don't think that's your desire here. Regards, Anthony Liguori
On Tue, Feb 22, 2011 at 03:11:06PM -0600, Anthony Liguori wrote: > On 02/22/2011 03:07 PM, Marcelo Tosatti wrote: > >On Tue, Feb 22, 2011 at 02:50:09PM -0600, Anthony Liguori wrote: > >>>+static int write_commit_file(BdrvCopyState *s) > >>>+{ > >>>+ char commit_msg[1400]; > >>>+ const char *buf = commit_msg; > >>>+ int len, ret; > >>>+ > >>>+ sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name, > >>>+ s->dst_filename); > >>>+ > >>>+ len = strlen(commit_msg); > >>>+ while (len> 0) { > >>>+ ret = write(s->commit_fd, buf, len); > >>>+ if (ret == -1&& errno == EINTR) { > >>>+ continue; > >>>+ } > >>>+ if (ret<= 0) { > >>>+ return -errno; > >>>+ } > >>>+ buf += ret; > >>>+ len -= ret; > >>>+ } > >>>+ > >>>+ if (fsync(s->commit_fd) == -1) { > >>>+ return -errno; > >>>+ } > >>> > >>This is more or less black magic. What is this commit file used for > >>and why aren't we using something like a QMP event? > >The commit file is considered reliable storage for the result of image > >switch operation. Think of the following scenario: > > > >- mgmt app requests live copy of guests ide1-hd0 > >from /a/image.img to /b/image.img. > >- mgmt app dies. > >- guest switches to /b/image.img, /a/image.img is outdated. > >- guest dies. > > > >Notifying the switch via QMP would not be reliable in this case. > > But this is true of any number of operations in QEMU such as hotplug > where if a management tool dies after requesting hotplug and then > the guest dies before the management tool reconnects, it's never > known whether it's truly connected or not. This is a different case. The mgmt tool should be able to safely store the desired device tree before hotplugging a device. And even if you consider that should be done by QEMU, in a config file, restarting the guest with or without the device is not going to result in data corruption/loss. Image switch is critical because using an outdated image will result in data corruption. > The only way to robustly solve this is for QEMU to maintain a > stateful config file. A one-off for this particular command doesn't > seem wise to me. I don't disagree the commit message could be in a config file, but since that does not exist, a separate file is a workable solution. Also, the separate commit file is not incompatible with future improvements.
On 02/22/2011 05:09 PM, Marcelo Tosatti wrote: > On Tue, Feb 22, 2011 at 03:11:06PM -0600, Anthony Liguori wrote: > >> On 02/22/2011 03:07 PM, Marcelo Tosatti wrote: >> >>> On Tue, Feb 22, 2011 at 02:50:09PM -0600, Anthony Liguori wrote: >>> >>>>> +static int write_commit_file(BdrvCopyState *s) >>>>> +{ >>>>> + char commit_msg[1400]; >>>>> + const char *buf = commit_msg; >>>>> + int len, ret; >>>>> + >>>>> + sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name, >>>>> + s->dst_filename); >>>>> + >>>>> + len = strlen(commit_msg); >>>>> + while (len> 0) { >>>>> + ret = write(s->commit_fd, buf, len); >>>>> + if (ret == -1&& errno == EINTR) { >>>>> + continue; >>>>> + } >>>>> + if (ret<= 0) { >>>>> + return -errno; >>>>> + } >>>>> + buf += ret; >>>>> + len -= ret; >>>>> + } >>>>> + >>>>> + if (fsync(s->commit_fd) == -1) { >>>>> + return -errno; >>>>> + } >>>>> >>>>> >>>> This is more or less black magic. What is this commit file used for >>>> and why aren't we using something like a QMP event? >>>> >>> The commit file is considered reliable storage for the result of image >>> switch operation. Think of the following scenario: >>> >>> - mgmt app requests live copy of guests ide1-hd0 >>> >> >from /a/image.img to /b/image.img. >> >>> - mgmt app dies. >>> - guest switches to /b/image.img, /a/image.img is outdated. >>> - guest dies. >>> >>> Notifying the switch via QMP would not be reliable in this case. >>> >> But this is true of any number of operations in QEMU such as hotplug >> where if a management tool dies after requesting hotplug and then >> the guest dies before the management tool reconnects, it's never >> known whether it's truly connected or not. >> > This is a different case. The mgmt tool should be able to safely store > the desired device tree before hotplugging a device. > > And even if you consider that should be done by QEMU, in a config file, > restarting the guest with or without the device is not going to result > in data corruption/loss. > If a guest started writing to a disk and is storing important data there, and then the guest is restarted and the disk (and data) isn't there, I'd classify that as data loss. >> The only way to robustly solve this is for QEMU to maintain a >> stateful config file. A one-off for this particular command doesn't >> seem wise to me. >> > I don't disagree the commit message could be in a config file, but since > that does not exist, a separate file is a workable solution. I disagree. This is something we'll have to maintain forever and what's to stop us from having additional one-off mechanisms to deal with this same problem. -drive already ties into the qemuopts infrastructure and we have readconfig and writeconfig. I don't think we're missing any major pieces to do this in a more proper fashion. Regards, Anthony Liguori > Also, the > separate commit file is not incompatible with future improvements. > >
On 02/22/2011 11:11 PM, Anthony Liguori wrote: >> The commit file is considered reliable storage for the result of image >> switch operation. Think of the following scenario: >> >> - mgmt app requests live copy of guests ide1-hd0 >> from /a/image.img to /b/image.img. >> - mgmt app dies. >> - guest switches to /b/image.img, /a/image.img is outdated. >> - guest dies. >> >> Notifying the switch via QMP would not be reliable in this case. > > But this is true of any number of operations in QEMU such as hotplug > where if a management tool dies after requesting hotplug and then the > guest dies before the management tool reconnects, it's never known > whether it's truly connected or not. > > The only way to robustly solve this is for QEMU to maintain a stateful > config file. Or, the management tool can remember that it tries to hotplug, reconnect, query qemu (or just retry), and proceed according to the result: - insert record in config for the new device, status = inactive - commit - tell qemu to hotplug <crash> - restart, read database, see inactive device - query qemu - if device is there, mark as active, commit - else, tell qemu to hotplug, when successful, mark as active, commit > A one-off for this particular command doesn't seem wise to me. Strongly agreed.
On 02/23/2011 01:14 AM, Anthony Liguori wrote: > > -drive already ties into the qemuopts infrastructure and we have > readconfig and writeconfig. I don't think we're missing any major > pieces to do this in a more proper fashion. The problem with qemu config files is that it splits the authoritative source of where images are stored into two. Is it in the management tool's database or is it in qemu's config file? For the problem at hand, one solution is to make qemu stop after the copy, and then management can issue an additional command to rearrange the disk and resume the guest. A drawback here is that if management dies, the guest is stopped until it restarts. We also make management latency guest visible, even if it doesn't die at an inconvenient place. An alternative approach is to have the copy be performed by a new layered block format driver: - create a new image, type = live-copy, containing three pieces of information - source image - destination image - copy state (initially nothing is copied) - tell qemu switch to the new image - qemu starts copying, updates copy state as needed - copy finishes, event is emitted; reads and writes still serviced - management receives event, switches qemu to destination image - management removes live-copy image If management dies while this is happening, it can simply query the state of the copy. Similarly, if qemu dies, the copy state is persistent (could be 0/1 or real range of blocks).
On 02/23/2011 07:01 AM, Avi Kivity wrote: > On 02/23/2011 01:14 AM, Anthony Liguori wrote: >> >> -drive already ties into the qemuopts infrastructure and we have >> readconfig and writeconfig. I don't think we're missing any major >> pieces to do this in a more proper fashion. > > The problem with qemu config files is that it splits the authoritative > source of where images are stored into two. Is it in the management > tool's database or is it in qemu's config file? I like to use the phrase "stateful config file". To me, it's just a database for QEMU to persist data about the VM. It's the only way for QEMU to make certain transactions atomic in the face of QEMU crashing. The user visible config file is a totally different concept. A management tool launches QEMU and tells it where to keep it's state database. The management application may prepopulate the state database or it may just use an empty file. QEMU uses the state database to store information that is created dynamically. For instance, devices added through device_add. A device added via -device wouldn't necessary get added to the state database. Practically speaking, it let's you invoke QEMU with a fixed command line, while still using the monitor to make changes that would otherwise require the command line being updated. > For the problem at hand, one solution is to make qemu stop after the > copy, and then management can issue an additional command to rearrange > the disk and resume the guest. A drawback here is that if management > dies, the guest is stopped until it restarts. We also make management > latency guest visible, even if it doesn't die at an inconvenient place. > > An alternative approach is to have the copy be performed by a new > layered block format driver: > > - create a new image, type = live-copy, containing three pieces of > information > - source image > - destination image > - copy state (initially nothing is copied) > - tell qemu switch to the new image > - qemu starts copying, updates copy state as needed > - copy finishes, event is emitted; reads and writes still serviced > - management receives event, switches qemu to destination image > - management removes live-copy image > > If management dies while this is happening, it can simply query the > state of the copy. Similarly, if qemu dies, the copy state is > persistent (could be 0/1 or real range of blocks). This is a more elegant solution to the problem than the commit problem but it's also a one-off. I think we have a generic problem here and we ought to try to solve it generically (within reason). Regards, Anthony Liguori
On 02/23/2011 04:35 PM, Anthony Liguori wrote: > On 02/23/2011 07:01 AM, Avi Kivity wrote: >> On 02/23/2011 01:14 AM, Anthony Liguori wrote: >>> >>> -drive already ties into the qemuopts infrastructure and we have >>> readconfig and writeconfig. I don't think we're missing any major >>> pieces to do this in a more proper fashion. >> >> The problem with qemu config files is that it splits the >> authoritative source of where images are stored into two. Is it in >> the management tool's database or is it in qemu's config file? > > I like to use the phrase "stateful config file". To me, it's just a > database for QEMU to persist data about the VM. It's the only way for > QEMU to make certain transactions atomic in the face of QEMU crashing. > > The user visible config file is a totally different concept. A > management tool launches QEMU and tells it where to keep it's state > database. The management application may prepopulate the state > database or it may just use an empty file. In that case the word 'config' is misleading. To me, it implies that the user configures something, and qemu reads it, not something mostly internal to qemu. Qemu does keep state. Currently only images, but in theory also the on-board NVRAM. > > QEMU uses the state database to store information that is created > dynamically. For instance, devices added through device_add. A > device added via -device wouldn't necessary get added to the state > database. > > Practically speaking, it let's you invoke QEMU with a fixed command > line, while still using the monitor to make changes that would > otherwise require the command line being updated. Then the invoker quickly loses track of what the actual state is. It can't just remember which commands it issued (presumably in response to the user updating user visible state). It has to parse the stateful config file qemu outputs. But at which points should it parse it? I don't think it's reasonable to have three different ways to interact with qemu, all needed: the command line, reading and writing the stateful config file, and the monitor. I'd rather push for starting qemu with a blank guest and assembling (cold-plugging) all the hardware via the monitor before starting the guest. >> For the problem at hand, one solution is to make qemu stop after the >> copy, and then management can issue an additional command to >> rearrange the disk and resume the guest. A drawback here is that if >> management dies, the guest is stopped until it restarts. We also >> make management latency guest visible, even if it doesn't die at an >> inconvenient place. >> >> An alternative approach is to have the copy be performed by a new >> layered block format driver: >> >> - create a new image, type = live-copy, containing three pieces of >> information >> - source image >> - destination image >> - copy state (initially nothing is copied) >> - tell qemu switch to the new image >> - qemu starts copying, updates copy state as needed >> - copy finishes, event is emitted; reads and writes still serviced >> - management receives event, switches qemu to destination image >> - management removes live-copy image >> >> If management dies while this is happening, it can simply query the >> state of the copy. Similarly, if qemu dies, the copy state is >> persistent (could be 0/1 or real range of blocks). > > This is a more elegant solution to the problem than the commit problem > but it's also a one-off. I think we have a generic problem here and > we ought to try to solve it generically (within reason). Can you give more examples? I think I demonstrated that hot-plug can be solved via the existing interfaces.
On 02/23/2011 09:31 AM, Avi Kivity wrote: > On 02/23/2011 04:35 PM, Anthony Liguori wrote: >> On 02/23/2011 07:01 AM, Avi Kivity wrote: >>> On 02/23/2011 01:14 AM, Anthony Liguori wrote: >>>> >>>> -drive already ties into the qemuopts infrastructure and we have >>>> readconfig and writeconfig. I don't think we're missing any major >>>> pieces to do this in a more proper fashion. >>> >>> The problem with qemu config files is that it splits the >>> authoritative source of where images are stored into two. Is it in >>> the management tool's database or is it in qemu's config file? >> >> I like to use the phrase "stateful config file". To me, it's just a >> database for QEMU to persist data about the VM. It's the only way >> for QEMU to make certain transactions atomic in the face of QEMU >> crashing. >> >> The user visible config file is a totally different concept. A >> management tool launches QEMU and tells it where to keep it's state >> database. The management application may prepopulate the state >> database or it may just use an empty file. > > In that case the word 'config' is misleading. To me, it implies that > the user configures something, and qemu reads it, not something mostly > internal to qemu. Understood. > > Qemu does keep state. Currently only images, but in theory also the > on-board NVRAM. Yeah, this is a good example of an area where a "stateful config file" would be useful. I like the idea of storing this sort of thing in a text file with a config structure because a user certainly wants to be able to specify the boot order. Being able to tweak this kind of stuff adds a lot of interesting capabilities. >> >> QEMU uses the state database to store information that is created >> dynamically. For instance, devices added through device_add. A >> device added via -device wouldn't necessary get added to the state >> database. >> >> Practically speaking, it let's you invoke QEMU with a fixed command >> line, while still using the monitor to make changes that would >> otherwise require the command line being updated. > > Then the invoker quickly loses track of what the actual state is. It > can't just remember which commands it issued (presumably in response > to the user updating user visible state). It has to parse the > stateful config file qemu outputs. Well specifically, it has to ask QEMU and QEMU can tell it the current state via a nice structured data format over QMP. It's a hell of a lot easier than the management tool trying to do this outside of QEMU. > But at which points should it parse it? I was thinking that we should post events whenever we change the stateful config. That would let the management tool have a mechanism for determining when settings have been changed. Of course, if the management tool crashes, it should re-read at startup. > I don't think it's reasonable to have three different ways to interact > with qemu, all needed: the command line, reading and writing the > stateful config file, and the monitor. I'd rather push for starting > qemu with a blank guest and assembling (cold-plugging) all the > hardware via the monitor before starting the guest. Yes. I view the command line as optional. To me, this is the ideal interaction: 1) start qemu with an empty stateful config file 2) issue monitor commands to create all devices and backends 3) the stateful config file totally captures the state of all of the issued QMP commands. The management tool can relaunch the guest just by passing the stateful config file to QEMU. 4) when the management tool needs to "extract" a config file, it can read the stateful config (through the monitor) and generate it's own config. 5) the management tool should treat the stateful config file as more or less opaque. It shouldn't be visible to end user. In the non-managed case, users should interact directly with the config file. >>> For the problem at hand, one solution is to make qemu stop after the >>> copy, and then management can issue an additional command to >>> rearrange the disk and resume the guest. A drawback here is that if >>> management dies, the guest is stopped until it restarts. We also >>> make management latency guest visible, even if it doesn't die at an >>> inconvenient place. >>> >>> An alternative approach is to have the copy be performed by a new >>> layered block format driver: >>> >>> - create a new image, type = live-copy, containing three pieces of >>> information >>> - source image >>> - destination image >>> - copy state (initially nothing is copied) >>> - tell qemu switch to the new image >>> - qemu starts copying, updates copy state as needed >>> - copy finishes, event is emitted; reads and writes still serviced >>> - management receives event, switches qemu to destination image >>> - management removes live-copy image >>> >>> If management dies while this is happening, it can simply query the >>> state of the copy. Similarly, if qemu dies, the copy state is >>> persistent (could be 0/1 or real range of blocks). >> >> This is a more elegant solution to the problem than the commit >> problem but it's also a one-off. I think we have a generic problem >> here and we ought to try to solve it generically (within reason). > > Can you give more examples? > > I think I demonstrated that hot-plug can be solved via the existing > interfaces. Sure. CMOS settings right now are not persisted across reboot. Guest initiated activities like IDE or PCI eject are tricky to persist correctly within a management tool. We could add events for all of this things but it's all racy since events are posted. If we have a stateful config file, we can make all of these things non-racy and post an event that the config has changed. If there's a crash, the management tool can read the config on startup to catch up on missed events. I think the nature of a posted event management interface is such that we need a stateful config that persists across QEMU invocations. Regards, Anthony Liguori
On 02/23/2011 06:01 PM, Anthony Liguori wrote: > >> >> Qemu does keep state. Currently only images, but in theory also the >> on-board NVRAM. > > Yeah, this is a good example of an area where a "stateful config file" > would be useful. I like the idea of storing this sort of thing in a > text file with a config structure because a user certainly wants to be > able to specify the boot order. Being able to tweak this kind of > stuff adds a lot of interesting capabilities. My preference would be a binary file (a disk image, in fact), with a tool to play with the known fields. It allows a management tool to reuse its storage stack. > >>> >>> QEMU uses the state database to store information that is created >>> dynamically. For instance, devices added through device_add. A >>> device added via -device wouldn't necessary get added to the state >>> database. >>> >>> Practically speaking, it let's you invoke QEMU with a fixed command >>> line, while still using the monitor to make changes that would >>> otherwise require the command line being updated. >> >> Then the invoker quickly loses track of what the actual state is. It >> can't just remember which commands it issued (presumably in response >> to the user updating user visible state). It has to parse the >> stateful config file qemu outputs. > > Well specifically, it has to ask QEMU and QEMU can tell it the current > state via a nice structured data format over QMP. It's a hell of a > lot easier than the management tool trying to do this outside of QEMU. So, if qemu crashes, the management tool has to start it up to find out what the current state is. > >> But at which points should it parse it? > > I was thinking that we should post events whenever we change the > stateful config. That would let the management tool have a mechanism > for determining when settings have been changed. Of course, if the > management tool crashes, it should re-read at startup. > >> I don't think it's reasonable to have three different ways to >> interact with qemu, all needed: the command line, reading and writing >> the stateful config file, and the monitor. I'd rather push for >> starting qemu with a blank guest and assembling (cold-plugging) all >> the hardware via the monitor before starting the guest. > > Yes. I view the command line as optional. To me, this is the ideal > interaction: > > 1) start qemu with an empty stateful config file > > 2) issue monitor commands to create all devices and backends > > 3) the stateful config file totally captures the state of all of the > issued QMP commands. The management tool can relaunch the guest just > by passing the stateful config file to QEMU. > > 4) when the management tool needs to "extract" a config file, it can > read the stateful config (through the monitor) and generate it's own > config. > > 5) the management tool should treat the stateful config file as more > or less opaque. It shouldn't be visible to end user. > > In the non-managed case, users should interact directly with the > config file. Doesn't the stateful non-config file becomes a failure point? It has to be on shared and redundant storage? To me, it seems a lot easier to require management to replay any commands that hadn't been acknowledged (due to management failure), or to query qemu as to its current state (if it is alive). Management already needs stable, redundant config storage anyway (often a database). >> Can you give more examples? >> >> I think I demonstrated that hot-plug can be solved via the existing >> interfaces. > > Sure. CMOS settings right now are not persisted across reboot. That is handled easily with an NVRAM disk image. > Guest initiated activities like IDE or PCI eject are tricky to persist > correctly within a management tool. > > We could add events for all of this things but it's all racy since > events are posted. If we have a stateful config file, we can make all > of these things non-racy and post an event that the config has > changed. If there's a crash, the management tool can read the config > on startup to catch up on missed events. If qemu crashes, these events are meaningless. If management crashes, it has to query qemu for all state that it wants to keep track of via events. > > I think the nature of a posted event management interface is such that > we need a stateful config that persists across QEMU invocations. I'm not convinced, and I think making qemu manage even more state creates more problems.
On 23 February 2011 16:01, Anthony Liguori <anthony@codemonkey.ws> wrote: > Yes. Â I view the command line as optional. Â To me, this is the ideal > interaction: > > 1) start qemu with an empty stateful config file > > 2) issue monitor commands to create all devices and backends > > 3) the stateful config file totally captures the state of all of the issued > QMP commands. Â The management tool can relaunch the guest just by passing > the stateful config file to QEMU. > > 4) when the management tool needs to "extract" a config file, it can read > the stateful config (through the monitor) and generate it's own config. > > 5) the management tool should treat the stateful config file as more or less > opaque. Â It shouldn't be visible to end user. > > In the non-managed case, users should interact directly with the config > file. For the typical TCG use case, that sounds a whole lot worse than the current situation of "just run qemu with a suitable command line"... -- PMM
On 02/23/2011 10:14 AM, Avi Kivity wrote: > On 02/23/2011 06:01 PM, Anthony Liguori wrote: >> >>> >>> Qemu does keep state. Currently only images, but in theory also the >>> on-board NVRAM. >> >> Yeah, this is a good example of an area where a "stateful config >> file" would be useful. I like the idea of storing this sort of thing >> in a text file with a config structure because a user certainly wants >> to be able to specify the boot order. Being able to tweak this kind >> of stuff adds a lot of interesting capabilities. > > My preference would be a binary file (a disk image, in fact), with a > tool to play with the known fields. It allows a management tool to > reuse its storage stack. I'm not sure yet what I think is best. To be honest, I was planning on tackling this for 0.16 and focusing on the QMP side of the world for 0.15 so I don't have as well formed opinions yet. >> Well specifically, it has to ask QEMU and QEMU can tell it the >> current state via a nice structured data format over QMP. It's a >> hell of a lot easier than the management tool trying to do this >> outside of QEMU. > > So, if qemu crashes, the management tool has to start it up to find > out what the current state is. Depends on how opaque we make the state file. I've been thinking a simple ini syntax with a well supported set of keys. In that case, a management tool can read it without starting QEMU. >> >>> But at which points should it parse it? >> >> I was thinking that we should post events whenever we change the >> stateful config. That would let the management tool have a mechanism >> for determining when settings have been changed. Of course, if the >> management tool crashes, it should re-read at startup. >> >>> I don't think it's reasonable to have three different ways to >>> interact with qemu, all needed: the command line, reading and >>> writing the stateful config file, and the monitor. I'd rather push >>> for starting qemu with a blank guest and assembling (cold-plugging) >>> all the hardware via the monitor before starting the guest. >> >> Yes. I view the command line as optional. To me, this is the ideal >> interaction: >> >> 1) start qemu with an empty stateful config file >> >> 2) issue monitor commands to create all devices and backends >> >> 3) the stateful config file totally captures the state of all of the >> issued QMP commands. The management tool can relaunch the guest just >> by passing the stateful config file to QEMU. >> >> 4) when the management tool needs to "extract" a config file, it can >> read the stateful config (through the monitor) and generate it's own >> config. >> >> 5) the management tool should treat the stateful config file as more >> or less opaque. It shouldn't be visible to end user. >> >> In the non-managed case, users should interact directly with the >> config file. > > Doesn't the stateful non-config file becomes a failure point? It has > to be on shared and redundant storage? It depends on what your availability model is and how frequently your management tool backs up the config. As of right now, we have a pretty glaring reliability hole here so adding a stateful "non-config" can only improve things. > To me, it seems a lot easier to require management to replay any > commands that hadn't been acknowledged (due to management failure), or > to query qemu as to its current state (if it is alive). You still have the race condition around guest initiated events like eject. Unless you have an acknowledged event from a management tool (which we can't do in QMP today) whereas you don't complete the guest initiated eject operation until management ack's it, we need to store that state ourself. I don't like the idea of making a management tool such an integral part of the functional paths. Not having a stateful config file also means that this problem isn't solved in any form without a really sophisticated management stack. I'm a big fan of being robust in the face of not-so sophisticated management tools. > Management already needs stable, redundant config storage anyway > (often a database). > >>> Can you give more examples? >>> >>> I think I demonstrated that hot-plug can be solved via the existing >>> interfaces. >> >> Sure. CMOS settings right now are not persisted across reboot. > > That is handled easily with an NVRAM disk image. > >> Guest initiated activities like IDE or PCI eject are tricky to >> persist correctly within a management tool. >> >> We could add events for all of this things but it's all racy since >> events are posted. If we have a stateful config file, we can make >> all of these things non-racy and post an event that the config has >> changed. If there's a crash, the management tool can read the config >> on startup to catch up on missed events. > > If qemu crashes, these events are meaningless. If management crashes, > it has to query qemu for all state that it wants to keep track of via > events. Think power failure, not qemu crash. In the event of a power failure, any hardware change initiated by the guest ought to be consistent with when the guest has restarted. If you eject the CDROM tray and then lose power, its still ejected after the power comes back on. >> >> I think the nature of a posted event management interface is such >> that we need a stateful config that persists across QEMU invocations. > > I'm not convinced, and I think making qemu manage even more state > creates more problems. Well this patch series is making qemu management more state. The only question is whether we do this as a one-off mechanism or whether we architect a general mechanism to do it. How much state we store can always be up for discussion but I think it's undeniable that we need to store more state than we're storing today (none). Regards, Anthony Liguori
On 02/23/2011 10:17 AM, Peter Maydell wrote: > On 23 February 2011 16:01, Anthony Liguori<anthony@codemonkey.ws> wrote: > >> Yes. I view the command line as optional. To me, this is the ideal >> interaction: >> >> 1) start qemu with an empty stateful config file >> >> 2) issue monitor commands to create all devices and backends >> >> 3) the stateful config file totally captures the state of all of the issued >> QMP commands. The management tool can relaunch the guest just by passing >> the stateful config file to QEMU. >> >> 4) when the management tool needs to "extract" a config file, it can read >> the stateful config (through the monitor) and generate it's own config. >> >> 5) the management tool should treat the stateful config file as more or less >> opaque. It shouldn't be visible to end user. >> >> In the non-managed case, users should interact directly with the config >> file. >> > For the typical TCG use case, that sounds a whole lot worse than > the current situation of "just run qemu with a suitable command > line"... > This is only an option. Running qemu with no stateful config and just using command line parameters will continue to be well supported. Internally, no stateful config is essentially equivalent to using a stateful config in -snapshot mode where all of the changes are just lost. The command line parameters will all eventually be implemented more or less in terms of internal QMP commands. This is another reason for the QAPI refactoring. -device handling (and all iterations of it) just becomes a call to qmp_device_add(). Regards, Anthony Liguori > -- PMM > >
On 02/23/2011 06:28 PM, Anthony Liguori wrote: > >>> Well specifically, it has to ask QEMU and QEMU can tell it the >>> current state via a nice structured data format over QMP. It's a >>> hell of a lot easier than the management tool trying to do this >>> outside of QEMU. >> >> So, if qemu crashes, the management tool has to start it up to find >> out what the current state is. > > Depends on how opaque we make the state file. I've been thinking a > simple ini syntax with a well supported set of keys. In that case, a > management tool can read it without starting QEMU. Then the management stack has to worry about yet another way of interacting via qemu. I'd like to limit it to the monitor. >> >> Doesn't the stateful non-config file becomes a failure point? It has >> to be on shared and redundant storage? > > It depends on what your availability model is and how frequently your > management tool backs up the config. As of right now, we have a > pretty glaring reliability hole here so adding a stateful "non-config" > can only improve things. I think the solutions I pointed out close the hole with the existing interfaces. > >> To me, it seems a lot easier to require management to replay any >> commands that hadn't been acknowledged (due to management failure), >> or to query qemu as to its current state (if it is alive). > > You still have the race condition around guest initiated events like > eject. Unless you have an acknowledged event from a management tool > (which we can't do in QMP today) whereas you don't complete the guest > initiated eject operation until management ack's it, we need to store > that state ourself. I don't see why. If management crashes, it queries the eject state when it reconnects to qemu. If qemu crashes, the eject state is lost, but that is fine. My CD-ROM drive tray pulls itself in when the machine is started. > > I don't like the idea of making a management tool such an integral > part of the functional paths. I agree that we don't want qemu to wait on the management stack any more than necessary. > Not having a stateful config file also means that this problem isn't > solved in any form without a really sophisticated management stack. > I'm a big fan of being robust in the face of not-so sophisticated > management tools. You're introducing the need for additional code in the management layer, the care and feeding for the stateful non-config file. >> If qemu crashes, these events are meaningless. If management >> crashes, it has to query qemu for all state that it wants to keep >> track of via events. > > Think power failure, not qemu crash. In the event of a power failure, > any hardware change initiated by the guest ought to be consistent with > when the guest has restarted. If you eject the CDROM tray and then > lose power, its still ejected after the power comes back on. Not on all machines. Let's list guest state which is independent of power. That would be wither NVRAM of various types, or physical alterations. CD-ROM eject is one. Are there others? > >>> >>> I think the nature of a posted event management interface is such >>> that we need a stateful config that persists across QEMU invocations. >> >> I'm not convinced, and I think making qemu manage even more state >> creates more problems. > > Well this patch series is making qemu management more state. The only > question is whether we do this as a one-off mechanism or whether we > architect a general mechanism to do it. > > How much state we store can always be up for discussion but I think > it's undeniable that we need to store more state than we're storing > today (none). I think my solution (multiplexing block format driver) fits the requirements for live-copy perfectly. In fact it has a name - it's a RAID-1 driver started in degraded mode. It could be useful other use cases.
Avi Kivity <avi@redhat.com> writes: > On 02/23/2011 04:35 PM, Anthony Liguori wrote: >> On 02/23/2011 07:01 AM, Avi Kivity wrote: >>> On 02/23/2011 01:14 AM, Anthony Liguori wrote: >>>> >>>> -drive already ties into the qemuopts infrastructure and we have >>>> readconfig and writeconfig. I don't think we're missing any major >>>> pieces to do this in a more proper fashion. >>> >>> The problem with qemu config files is that it splits the >>> authoritative source of where images are stored into two. Is it in >>> the management tool's database or is it in qemu's config file? >> >> I like to use the phrase "stateful config file". To me, it's just a >> database for QEMU to persist data about the VM. It's the only way >> for QEMU to make certain transactions atomic in the face of QEMU >> crashing. >> >> The user visible config file is a totally different concept. A >> management tool launches QEMU and tells it where to keep it's state >> database. The management application may prepopulate the state >> database or it may just use an empty file. > > In that case the word 'config' is misleading. To me, it implies that > the user configures something, and qemu reads it, not something mostly > internal to qemu. > > Qemu does keep state. Currently only images, but in theory also the > on-board NVRAM. > >> >> QEMU uses the state database to store information that is created >> dynamically. For instance, devices added through device_add. A >> device added via -device wouldn't necessary get added to the state >> database. >> >> Practically speaking, it let's you invoke QEMU with a fixed command >> line, while still using the monitor to make changes that would >> otherwise require the command line being updated. > > Then the invoker quickly loses track of what the actual state is. It > can't just remember which commands it issued (presumably in response > to the user updating user visible state). It has to parse the > stateful config file qemu outputs. But at which points should it > parse it? > > I don't think it's reasonable to have three different ways to interact > with qemu, all needed: the command line, reading and writing the > stateful config file, and the monitor. I'd rather push for starting > qemu with a blank guest and assembling (cold-plugging) all the > hardware via the monitor before starting the guest. Exactly. And qdev has brought his within our reach. [...]
On Wed, Feb 23, 2011 at 03:01:14PM +0200, Avi Kivity wrote: > On 02/23/2011 01:14 AM, Anthony Liguori wrote: > > > >-drive already ties into the qemuopts infrastructure and we have > >readconfig and writeconfig. I don't think we're missing any major > >pieces to do this in a more proper fashion. > > The problem with qemu config files is that it splits the > authoritative source of where images are stored into two. Is it in > the management tool's database or is it in qemu's config file? > > For the problem at hand, one solution is to make qemu stop after the > copy, and then management can issue an additional command to > rearrange the disk and resume the guest. A drawback here is that if > management dies, the guest is stopped until it restarts. We also > make management latency guest visible, even if it doesn't die at an > inconvenient place. > > An alternative approach is to have the copy be performed by a new > layered block format driver: > > - create a new image, type = live-copy, containing three pieces of > information > - source image > - destination image > - copy state (initially nothing is copied) > - tell qemu switch to the new image > - qemu starts copying, updates copy state as needed > - copy finishes, event is emitted; reads and writes still serviced > - management receives event, switches qemu to destination image > - management removes live-copy image > > If management dies while this is happening, it can simply query the > state of the copy. > Similarly, if qemu dies, the copy state is persistent (could be 0/1 or > real range of blocks). You don't know if a given block is uptodate or not without the dirty bitmap. So unless you also keep track of dirty log somehow, this is meaningless. So a commit file as proposed indicates copy state (in 0/1 fashion). The difference in your proposal is that such information is stored inside a special purpose image format?
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote: > Index: qemu/qerror.h > =================================================================== > --- qemu.orig/qerror.h > +++ qemu/qerror.h > @@ -171,4 +171,13 @@ QError *qobject_to_qerror(const QObject > #define QERR_VNC_SERVER_FAILED \ > "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }" > > +#define QERR_BLOCKCOPY_IN_PROGRESS \ > + "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }" > The caller already knows the device name by virtue of issuing the command so this is redundant. I think a better approach would be a QERR_IN_PROGRESS 'data': { 'operation': %s } For block copy, we'd say QERR_IN_PROGRESS("block copy"). > + > +#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \ > + "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }" > + > +#define QERR_MIGRATION_IN_PROGRESS \ > + "{ 'class': 'MigrationInProgress', 'data': {} }" > Then QERR_IN_PROGRESS("live migration") > #endif /* QERROR_H */ > Index: qemu/qmp-commands.hx > =================================================================== > --- qemu.orig/qmp-commands.hx > +++ qemu/qmp-commands.hx > @@ -581,6 +581,75 @@ Example: > EQMP > > { > + .name = "block_copy", > + .args_type = "device:s,filename:s,commit_filename:s?,inc:-i", > + .params = "device filename [commit_filename] [-i]", > + .help = "live block copy device to image" > + "\n\t\t\t optional commit filename " > + "\n\t\t\t -i for incremental copy " > + "(base image shared between src and destination)", > + .user_print = monitor_user_noop, > + .mhandler.cmd_new = do_bdrv_copy, > + }, > + > +SQMP > +block-copy > +------- > + > +Live block copy. > I'm not sure copy really describes what we're doing here. Maybe migrate-block? > +Arguments: > + > +- "device": device name (json-string) > +- "filename": target image filename (json-string) > Is this a created image? Is this an image to create? To future proof for blockdev, we should make this argument optional and if it's not specified throw an error about missing argument. This let's us introduce an optional blockdev argument such that we can use a blockdev name. > +- "commit_filename": target commit filename (json-string, optional) > I think we should drop this. > +- "inc": incremental disk copy (json-bool, optional) > Let's use the full name (incremental) and we need to describe in detail what the semantics of this are. Will it scan the target block device to identify identical blocks? > +Example: > + > +-> { "execute": "block_copy", > + "arguments": { "device": "ide0-hd1", > + "filename": "/mnt/new-disk.img", > + "commit_filename: "/mnt/commit-new-disk.img" > + } } > + > +<- { "return": {} } > + > +Notes: > + > +(1) The 'query-block-copy' command should be used to check block copy progress > + and final result (this information is provided by the 'status' member) > +(2) Boolean argument "inc" defaults to false > We should also document error semantics. What errors are expected and why? > +EQMP > + > + { > + .name = "block_copy_cancel", > + .args_type = "device:s", > + .params = "device", > + .help = "cancel live block copy", > + .user_print = monitor_user_noop, > + .mhandler.cmd_new = do_bdrv_copy_cancel, > + }, > + > +SQMP > +block_copy_cancel > +-------------- > + > +Cancel live block copy. > + > +Arguments: > + > +- device: device name (json-string) > + > +Example: > + > +-> { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } } > +<- { "return": {} } > cancel-block-migration? What happens if: - No block copy is active anymore (it's completed) - A block copy was never started - device refers to a device that no longer exists - device refers to a device with no media present If this command succeeds, what the state of target? If I resume the operation with the incremental flag set, will it Just Work? > + > +EQMP > + > + { > .name = "netdev_add", > .args_type = "netdev:O", > .params = "[user|tap|socket],id=str[,prop=value][,...]", > @@ -1740,6 +1809,44 @@ Examples: > EQMP > > SQMP > +query-block-copy > +------------- > + > +Live block copy status. > + > +Each block copy instance information is stored in a json-object and the returned > +value is a json-array of all instances. > + > +Each json-object contains the following: > + > +- "device": device name (json-string) > +- "status": block copy status (json-string) > + - Possible values: "active", "failed", "completed" > +- "info": A json-object with the statistics information, if status is "active": > + - "percentage": percentage completed (json-int) > So let's fold this into query-block And let's also report the statistics as MigrationStats which is the format: { 'MigrationStats': {'transferred': 'int', 'remaining': 'int', 'total': 'int' } } So I'd propose changing: { 'BlockDeviceStats': {'rd_bytes': 'int', 'wr_bytes': 'int', 'rd_operations': 'int', 'wr_operations': 'int', 'wr_highest_offset': 'int' } } { 'BlockStats': {'*device': 'str', 'stats': 'BlockDeviceStats', '*parent': 'BlockStats'} } [ 'query-blockstats', {}, {}, ['BlockStats'] ] To: { 'BlockMigrationStatus': [ 'active', 'inactive', 'cancelled' ] } { 'BlockMigrationInfo': {'status': 'BlockMigrationStatus', 'stats': 'BlockMigrationStats', '*target_filename': 'str' } } { 'BlockDeviceStats': {'rd_bytes': 'int', 'wr_bytes': 'int', 'rd_operations': 'int', 'wr_operations': 'int', 'wr_highest_offset': 'int', '*migration': 'BlockMigrationInfo' } } { 'BlockStats': {'*device': 'str', 'stats': 'BlockDeviceStats', '*parent': 'BlockStats'} } [ 'query-blockstats', {}, {}, ['BlockStats'] ] Regards, Anthony Liguori > + > +Example: > + > +Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed: > + > +-> { "execute": "query-block-copy" } > +<- { > + "return":[ > + {"device":"ide1-hd0", > + "status":"active", > + "info":{ > + "percentage":23, > + } > + }, > + {"device":"ide1-hd1", > + "status":"failed" > + } > + ] > + } > + > +EQMP > + > +SQMP > query-balloon > ------------- > > Index: qemu/Makefile.objs > =================================================================== > --- qemu.orig/Makefile.objs > +++ qemu/Makefile.objs > @@ -98,7 +98,7 @@ common-obj-y += buffered_file.o migratio > common-obj-y += qemu-char.o savevm.o #aio.o > common-obj-y += msmouse.o ps2.o > common-obj-y += qdev.o qdev-properties.o > -common-obj-y += block-migration.o > +common-obj-y += block-migration.o block-copy.o > common-obj-y += pflib.o > > common-obj-$(CONFIG_BRLAPI) += baum.o > Index: qemu/sysemu.h > =================================================================== > --- qemu.orig/sysemu.h > +++ qemu/sysemu.h > @@ -46,6 +46,7 @@ void qemu_del_vm_change_state_handler(VM > #define VMSTOP_SAVEVM 6 > #define VMSTOP_LOADVM 7 > #define VMSTOP_MIGRATE 8 > +#define VMSTOP_BLOCKCOPY 9 > > void vm_start(void); > void vm_stop(int reason); > > >
On 02/23/2011 11:26 AM, Markus Armbruster wrote: >> I don't think it's reasonable to have three different ways to interact >> with qemu, all needed: the command line, reading and writing the >> stateful config file, and the monitor. I'd rather push for starting >> qemu with a blank guest and assembling (cold-plugging) all the >> hardware via the monitor before starting the guest. >> > Exactly. And qdev has brought his within our reach. > Actually, QMP is the star of the show here, not qdev. The way we get here is by incrementally converting the option handling to be qmp calls. For instance, instead of: case QEMU_OPTION_name: qemu_name = optarg; break; We do: case QEMU_OPTION_name: qmp_set_name(optarg, NULL); break; When we can compile vl.c with nothing more than QMP dependencies, we've achieved the goals here. This will mean adding a lot of new QMP commands. There are some command line options that must be handled before the machine is initialized and QMP is normally run. For 0.16, we can introduce a new QMP mode whereas the event loop gets to run before doing machine init and explicit create_machine() command is needed. This is the final bit that will be needed to realize this goal. A lot of good things come out of this. Quite a lot of these new commands don't strictly need to run before machine init (like -name) which means we'll get the ability to change a lot of parameters without rebooting the guest which couldn't be changed before. And this is all incremental stuff that can be done in parallel of the QAPI work. We just need to do the work of adding the function calls (or function call wrappers where appropriate). Regards, Anthony Liguori > [...] > >
On 02/23/2011 11:18 AM, Avi Kivity wrote: > On 02/23/2011 06:28 PM, Anthony Liguori wrote: >> >>>> Well specifically, it has to ask QEMU and QEMU can tell it the >>>> current state via a nice structured data format over QMP. It's a >>>> hell of a lot easier than the management tool trying to do this >>>> outside of QEMU. >>> >>> So, if qemu crashes, the management tool has to start it up to find >>> out what the current state is. >> >> Depends on how opaque we make the state file. I've been thinking a >> simple ini syntax with a well supported set of keys. In that case, a >> management tool can read it without starting QEMU. > > Then the management stack has to worry about yet another way of > interacting via qemu. { 'StateItem': { 'key': 'str', 'value': 'str' } } { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ 'StateItem' ] } } { 'StateInfo': { 'sections': [ 'StateSection' ] } } { 'query-state', {}, {}, 'StateInfo' } A management tool never need to worry about anything other than this command if it so chooses. If we have the pre-machine init mode for 0.16, then this can even be used to inspect state without running a guest. The fact that the state is visible in the filesystem is an implementation detail. > I'd like to limit it to the monitor. > >>> >>> Doesn't the stateful non-config file becomes a failure point? It >>> has to be on shared and redundant storage? >> >> It depends on what your availability model is and how frequently your >> management tool backs up the config. As of right now, we have a >> pretty glaring reliability hole here so adding a stateful >> "non-config" can only improve things. > > I think the solutions I pointed out close the hole with the existing > interfaces. It doesn't work for eject unless you interpose an acknowledged event. Ultimately, this is a simple problem. If you want reliability, we either need symmetric RPCs so that the device model can call (and wait) to the management layer to acknowledge a change or QEMU can post an event to the management layer, and maintain the state in a reliable fashion. >> >>> To me, it seems a lot easier to require management to replay any >>> commands that hadn't been acknowledged (due to management failure), >>> or to query qemu as to its current state (if it is alive). >> >> You still have the race condition around guest initiated events like >> eject. Unless you have an acknowledged event from a management tool >> (which we can't do in QMP today) whereas you don't complete the guest >> initiated eject operation until management ack's it, we need to store >> that state ourself. > > I don't see why. > > If management crashes, it queries the eject state when it reconnects > to qemu. > If qemu crashes, the eject state is lost, but that is fine. My CD-ROM > drive tray pulls itself in when the machine is started. Pick any of a number of possible events that change the machine's state. We can wave our hands at some things saying they don't matter and do one off solutions for others, or we can just have a robust way of handling this consistently. >> >> I don't like the idea of making a management tool such an integral >> part of the functional paths. > > I agree that we don't want qemu to wait on the management stack any > more than necessary. > >> Not having a stateful config file also means that this problem isn't >> solved in any form without a really sophisticated management stack. >> I'm a big fan of being robust in the face of not-so sophisticated >> management tools. > > You're introducing the need for additional code in the management > layer, the care and feeding for the stateful non-config file. If a management layer ignores the stateful non-config file, as you like to call it, it'll get the same semantics it has today. I think managing a single thing is a whole lot easier than managing an NVRAM file, a block migration layering file, and all of the future things we're going to add once we decide they are important too. >>> If qemu crashes, these events are meaningless. If management >>> crashes, it has to query qemu for all state that it wants to keep >>> track of via events. >> >> Think power failure, not qemu crash. In the event of a power >> failure, any hardware change initiated by the guest ought to be >> consistent with when the guest has restarted. If you eject the CDROM >> tray and then lose power, its still ejected after the power comes >> back on. > > Not on all machines. > > Let's list guest state which is independent of power. That would be > wither NVRAM of various types, or physical alterations. CD-ROM eject > is one. Are there others? Any indirect qemu state. Block migration is an example, but other examples would be VNC server information (like current password), WCE setting (depending on whether we modelled eeprom for the drivers), and persisted device settings (lots of devices have eeprom these days). >> >>>> >>>> I think the nature of a posted event management interface is such >>>> that we need a stateful config that persists across QEMU invocations. >>> >>> I'm not convinced, and I think making qemu manage even more state >>> creates more problems. >> >> Well this patch series is making qemu management more state. The >> only question is whether we do this as a one-off mechanism or whether >> we architect a general mechanism to do it. >> >> How much state we store can always be up for discussion but I think >> it's undeniable that we need to store more state than we're storing >> today (none). > > I think my solution (multiplexing block format driver) fits the > requirements for live-copy perfectly. In fact it has a name - it's a > RAID-1 driver started in degraded mode. It could be useful other use > cases. It feels a bit awkward to me to be honest. Regards, Anthony Liguori
On Wed, Feb 23, 2011 at 02:18:31PM -0600, Anthony Liguori wrote: > On 02/23/2011 11:18 AM, Avi Kivity wrote: > >On 02/23/2011 06:28 PM, Anthony Liguori wrote: > >> > >>>>Well specifically, it has to ask QEMU and QEMU can tell it > >>>>the current state via a nice structured data format over > >>>>QMP. It's a hell of a lot easier than the management tool > >>>>trying to do this outside of QEMU. > >>> > >>>So, if qemu crashes, the management tool has to start it up to > >>>find out what the current state is. > >> > >>Depends on how opaque we make the state file. I've been > >>thinking a simple ini syntax with a well supported set of keys. > >>In that case, a management tool can read it without starting > >>QEMU. > > > >Then the management stack has to worry about yet another way of > >interacting via qemu. > > { 'StateItem': { 'key': 'str', 'value': 'str' } } > { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ > 'StateItem' ] } } > { 'StateInfo': { 'sections': [ 'StateSection' ] } } > > { 'query-state', {}, {}, 'StateInfo' } > > A management tool never need to worry about anything other than this > command if it so chooses. If we have the pre-machine init mode for > 0.16, then this can even be used to inspect state without running a > guest. > > The fact that the state is visible in the filesystem is an > implementation detail. > > > I'd like to limit it to the monitor. > > > >>> > >>>Doesn't the stateful non-config file becomes a failure point? > >>>It has to be on shared and redundant storage? > >> > >>It depends on what your availability model is and how frequently > >>your management tool backs up the config. As of right now, we > >>have a pretty glaring reliability hole here so adding a stateful > >>"non-config" can only improve things. > > > >I think the solutions I pointed out close the hole with the > >existing interfaces. > > It doesn't work for eject unless you interpose an acknowledged > event. Ultimately, this is a simple problem. If you want > reliability, we either need symmetric RPCs so that the device model > can call (and wait) to the management layer to acknowledge a change > or QEMU can post an event to the management layer, and maintain the > state in a reliable fashion. > > >> > >>>To me, it seems a lot easier to require management to replay > >>>any commands that hadn't been acknowledged (due to management > >>>failure), or to query qemu as to its current state (if it is > >>>alive). > >> > >>You still have the race condition around guest initiated events > >>like eject. Unless you have an acknowledged event from a > >>management tool (which we can't do in QMP today) whereas you > >>don't complete the guest initiated eject operation until > >>management ack's it, we need to store that state ourself. > > > >I don't see why. > > > >If management crashes, it queries the eject state when it > >reconnects to qemu. > >If qemu crashes, the eject state is lost, but that is fine. My > >CD-ROM drive tray pulls itself in when the machine is started. > > Pick any of a number of possible events that change the machine's > state. We can wave our hands at some things saying they don't > matter and do one off solutions for others, or we can just have a > robust way of handling this consistently. > > >> > >>I don't like the idea of making a management tool such an > >>integral part of the functional paths. > > > >I agree that we don't want qemu to wait on the management stack > >any more than necessary. > > > >>Not having a stateful config file also means that this problem > >>isn't solved in any form without a really sophisticated > >>management stack. I'm a big fan of being robust in the face of > >>not-so sophisticated management tools. > > > >You're introducing the need for additional code in the management > >layer, the care and feeding for the stateful non-config file. > > If a management layer ignores the stateful non-config file, as you > like to call it, it'll get the same semantics it has today. I think > managing a single thing is a whole lot easier than managing an NVRAM > file, a block migration layering file, and all of the future things > we're going to add once we decide they are important too. > > >>>If qemu crashes, these events are meaningless. If management > >>>crashes, it has to query qemu for all state that it wants to > >>>keep track of via events. > >> > >>Think power failure, not qemu crash. In the event of a power > >>failure, any hardware change initiated by the guest ought to be > >>consistent with when the guest has restarted. If you eject the > >>CDROM tray and then lose power, its still ejected after the > >>power comes back on. > > > >Not on all machines. > > > >Let's list guest state which is independent of power. That would > >be wither NVRAM of various types, or physical alterations. CD-ROM > >eject is one. Are there others? > > Any indirect qemu state. Block migration is an example, but other > examples would be VNC server information (like current password), > WCE setting (depending on whether we modelled eeprom for the > drivers), and persisted device settings (lots of devices have eeprom > these days). Agreed. Why a separate location for this "stateful non-config" section, however? The state in question (backing image property, device presence, VNC info, etc) is already represented in either host or guest configuration, so why not simply expose that?
On 02/23/2011 02:44 PM, Marcelo Tosatti wrote: > >> Any indirect qemu state. Block migration is an example, but other >> examples would be VNC server information (like current password), >> WCE setting (depending on whether we modelled eeprom for the >> drivers), and persisted device settings (lots of devices have eeprom >> these days). >> > Agreed. > > Why a separate location for this "stateful non-config" section, however? > > The state in question (backing image property, device presence, VNC > info, etc) is already represented in either host or guest configuration, > so why not simply expose that? > Hrm, are you suggesting that the "stateful non-config" be hidden directly from the user such that only existing monitor interfaces are used to query it's contents? I'll have to think about that a bit. Obviously, the existing commands would still be authoritative even with a query-state command. I think the only question is whether there's value in making this totally opaque. Regards, Anthony Liguori
Who I can do it?
Anthony Liguori <anthony@codemonkey.ws> writes: > On 02/23/2011 11:18 AM, Avi Kivity wrote: >> On 02/23/2011 06:28 PM, Anthony Liguori wrote: >>> >>>>> Well specifically, it has to ask QEMU and QEMU can tell it the >>>>> current state via a nice structured data format over QMP. It's a >>>>> hell of a lot easier than the management tool trying to do this >>>>> outside of QEMU. >>>> >>>> So, if qemu crashes, the management tool has to start it up to >>>> find out what the current state is. >>> >>> Depends on how opaque we make the state file. I've been thinking a >>> simple ini syntax with a well supported set of keys. In that case, >>> a management tool can read it without starting QEMU. >> >> Then the management stack has to worry about yet another way of >> interacting via qemu. > > { 'StateItem': { 'key': 'str', 'value': 'str' } } > { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ > StateItem' ] } } > { 'StateInfo': { 'sections': [ 'StateSection' ] } } > > { 'query-state', {}, {}, 'StateInfo' } > > A management tool never need to worry about anything other than this > command if it so chooses. If we have the pre-machine init mode for > 0.16, then this can even be used to inspect state without running a > guest. > > The fact that the state is visible in the filesystem is an > implementation detail. > >> I'd like to limit it to the monitor. >> >>>> >>>> Doesn't the stateful non-config file becomes a failure point? It >>>> has to be on shared and redundant storage? >>> >>> It depends on what your availability model is and how frequently >>> your management tool backs up the config. As of right now, we have >>> a pretty glaring reliability hole here so adding a stateful >>> "non-config" can only improve things. >> >> I think the solutions I pointed out close the hole with the existing >> interfaces. > > It doesn't work for eject unless you interpose an acknowledged event. > Ultimately, this is a simple problem. If you want reliability, we > either need symmetric RPCs so that the device model can call (and > wait) to the management layer to acknowledge a change or QEMU can post > an event to the management layer, and maintain the state in a reliable > fashion. > >>> >>>> To me, it seems a lot easier to require management to replay any >>>> commands that hadn't been acknowledged (due to management >>>> failure), or to query qemu as to its current state (if it is >>>> alive). >>> >>> You still have the race condition around guest initiated events >>> like eject. Unless you have an acknowledged event from a >>> management tool (which we can't do in QMP today) whereas you don't >>> complete the guest initiated eject operation until management ack's >>> it, we need to store that state ourself. >> >> I don't see why. >> >> If management crashes, it queries the eject state when it reconnects >> to qemu. >> If qemu crashes, the eject state is lost, but that is fine. My >> CD-ROM drive tray pulls itself in when the machine is started. > > Pick any of a number of possible events that change the machine's > state. We can wave our hands at some things saying they don't matter > and do one off solutions for others, or we can just have a robust way > of handling this consistently. What machine state is to be preserved across crashes? Maybe I'm naive, but I can't think of anything but non-volatile device memory, e.g. disks, RTC CMOS, EEPROM. I can't quite see why we need conceptually different mechanisms for disks and all the rest. [...]
On 02/23/2011 10:18 PM, Anthony Liguori wrote: >> Then the management stack has to worry about yet another way of >> interacting via qemu. > > > { 'StateItem': { 'key': 'str', 'value': 'str' } } > { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ > 'StateItem' ] } } > { 'StateInfo': { 'sections': [ 'StateSection' ] } } > > { 'query-state', {}, {}, 'StateInfo' } > > A management tool never need to worry about anything other than this > command if it so chooses. If we have the pre-machine init mode for > 0.16, then this can even be used to inspect state without running a > guest. So we have yet another information tree. If we store the cd-rom eject state here, then we need to make an association between the device path of the cd-rom, and the StateItem key. Far better to store it in the device itself. For example, we could make a layered block format driver that stores the eject state and a "backing file" containing the actual media. Eject and media change would be recorded in the block format driver's state. You could then hot-unplug a USB cd-writer and hot-plug it back into a different guest, implementing a virtual sneakernet. > > The fact that the state is visible in the filesystem is an > implementation detail. A detail that has to be catered for by the management stack - it has to provide a safe place for it, back it up, etc. > >> I'd like to limit it to the monitor. >> >>>> >>>> Doesn't the stateful non-config file becomes a failure point? It >>>> has to be on shared and redundant storage? >>> >>> It depends on what your availability model is and how frequently >>> your management tool backs up the config. As of right now, we have >>> a pretty glaring reliability hole here so adding a stateful >>> "non-config" can only improve things. >> >> I think the solutions I pointed out close the hole with the existing >> interfaces. > > It doesn't work for eject unless you interpose an acknowledged event. > Ultimately, this is a simple problem. If you want reliability, we > either need symmetric RPCs so that the device model can call (and > wait) to the management layer to acknowledge a change or QEMU can post > an event to the management layer, and maintain the state in a reliable > fashion. I don't see why it doesn't work. Please explain. >>> You still have the race condition around guest initiated events like >>> eject. Unless you have an acknowledged event from a management tool >>> (which we can't do in QMP today) whereas you don't complete the >>> guest initiated eject operation until management ack's it, we need >>> to store that state ourself. >> >> I don't see why. >> >> If management crashes, it queries the eject state when it reconnects >> to qemu. >> If qemu crashes, the eject state is lost, but that is fine. My >> CD-ROM drive tray pulls itself in when the machine is started. > > Pick any of a number of possible events that change the machine's > state. We can wave our hands at some things saying they don't matter > and do one off solutions for others, or we can just have a robust way > of handling this consistently. Both block live copy and cd-rom eject state can be solved with layered block format drivers. I don't think a central place for random data makes sense. State belongs near the device that maintains it, esp. if the device is hot-pluggable, so it's easy to associate the state with the device. >> >> You're introducing the need for additional code in the management >> layer, the care and feeding for the stateful non-config file. > > If a management layer ignores the stateful non-config file, as you > like to call it, it'll get the same semantics it has today. I think > managing a single thing is a whole lot easier than managing an NVRAM > file, a block migration layering file, and all of the future things > we're going to add once we decide they are important too. I disagree. Storing NVRAM as a disk image is a simple extension of existing management tools. Block live-copy and cd-rom eject state also make sense as per-image state if you take hotunplug and hotplug into account. > >>>> If qemu crashes, these events are meaningless. If management >>>> crashes, it has to query qemu for all state that it wants to keep >>>> track of via events. >>> >>> Think power failure, not qemu crash. In the event of a power >>> failure, any hardware change initiated by the guest ought to be >>> consistent with when the guest has restarted. If you eject the >>> CDROM tray and then lose power, its still ejected after the power >>> comes back on. >> >> Not on all machines. >> >> Let's list guest state which is independent of power. That would be >> wither NVRAM of various types, or physical alterations. CD-ROM eject >> is one. Are there others? > > Any indirect qemu state. Block migration is an example, but other > examples would be VNC server information (like current password), WCE > setting (depending on whether we modelled eeprom for the drivers), and > persisted device settings (lots of devices have eeprom these days). Device settings should be stored with the devices, not with qemu. Suppose we take the cold-plug on startup via the monitor approach. So we start with a bare machine, cold plug stuff into it. Now qemu has to reconcile the stateful non-config file with the hardware. What if something has changed? A device moved into a different slot? If a network card has eeprom, we can specify it with -device rtl8139,eeprom=id, where id specifies a disk image for the eeprom. >> I think my solution (multiplexing block format driver) fits the >> requirements for live-copy perfectly. In fact it has a name - it's a >> RAID-1 driver started in degraded mode. It could be useful other use >> cases. > > It feels a bit awkward to me to be honest. > Not to me.
On 02/23/2011 07:49 PM, Marcelo Tosatti wrote: > On Wed, Feb 23, 2011 at 03:01:14PM +0200, Avi Kivity wrote: > > On 02/23/2011 01:14 AM, Anthony Liguori wrote: > > > > > >-drive already ties into the qemuopts infrastructure and we have > > >readconfig and writeconfig. I don't think we're missing any major > > >pieces to do this in a more proper fashion. > > > > The problem with qemu config files is that it splits the > > authoritative source of where images are stored into two. Is it in > > the management tool's database or is it in qemu's config file? > > > > For the problem at hand, one solution is to make qemu stop after the > > copy, and then management can issue an additional command to > > rearrange the disk and resume the guest. A drawback here is that if > > management dies, the guest is stopped until it restarts. We also > > make management latency guest visible, even if it doesn't die at an > > inconvenient place. > > > > An alternative approach is to have the copy be performed by a new > > layered block format driver: > > > > - create a new image, type = live-copy, containing three pieces of > > information > > - source image > > - destination image > > - copy state (initially nothing is copied) > > - tell qemu switch to the new image > > - qemu starts copying, updates copy state as needed > > - copy finishes, event is emitted; reads and writes still serviced > > - management receives event, switches qemu to destination image > > - management removes live-copy image > > > > If management dies while this is happening, it can simply query the > > state of the copy. > > Similarly, if qemu dies, the copy state is persistent (could be 0/1 or > > real range of blocks). > > You don't know if a given block is uptodate or not without the dirty > bitmap. So unless you also keep track of dirty log somehow, this is > meaningless. First, you no longer need the dirty bitmap. Since all writes go through the layered block format driver, you know first-hand what is dirty and what isn't. > So a commit file as proposed indicates copy state (in 0/1 fashion). The > difference in your proposal is that such information is stored inside a > special purpose image format? > It could also store the already synced range. The difference is that the file is self contained. You could hot-unplug the image and hot-plug it later (continuing the copy with qemu-img), or live migrate it. In fact I think a qemu RAID-1 driver removes the restriction that you can't live-migrate and live-copy simultaneously.
On Thu, Feb 24, 2011 at 5:41 AM, James Brown <jbrownfirst@gmail.com> wrote:
> Who I can do it?
Please see http://lists.nongnu.org/mailman/listinfo/qemu-devel for
info on how to unsubscribe.
Stefan
Anthony Liguori <anthony@codemonkey.ws> writes: > On 02/23/2011 11:26 AM, Markus Armbruster wrote: >>> I don't think it's reasonable to have three different ways to interact >>> with qemu, all needed: the command line, reading and writing the >>> stateful config file, and the monitor. I'd rather push for starting >>> qemu with a blank guest and assembling (cold-plugging) all the >>> hardware via the monitor before starting the guest. >>> >> Exactly. And qdev has brought his within our reach. >> > > Actually, QMP is the star of the show here, not qdev. > > The way we get here is by incrementally converting the option handling > to be qmp calls. For instance, instead of: > > case QEMU_OPTION_name: > qemu_name = optarg; > break; > > We do: > > case QEMU_OPTION_name: > qmp_set_name(optarg, NULL); > break; > > When we can compile vl.c with nothing more than QMP dependencies, > we've achieved the goals here. This will mean adding a lot of new QMP > commands. > > There are some command line options that must be handled before the > machine is initialized and QMP is normally run. For 0.16, we can > introduce a new QMP mode whereas the event loop gets to run before > doing machine init and explicit create_machine() command is needed. > This is the final bit that will be needed to realize this goal. > > A lot of good things come out of this. Quite a lot of these new > commands don't strictly need to run before machine init (like -name) > which means we'll get the ability to change a lot of parameters > without rebooting the guest which couldn't be changed before. > > And this is all incremental stuff that can be done in parallel of the > QAPI work. We just need to do the work of adding the function calls > (or function call wrappers where appropriate). Well, I wouldn't put "let's create a sane separation between option parsing and machine configuration" under the "QEMU *Monitor* Protocol" flag, but I certainly don't mind you hoisting whatever flag pleases you.
On Wed, Feb 23, 2011 at 03:41:30PM -0600, Anthony Liguori wrote: > On 02/23/2011 02:44 PM, Marcelo Tosatti wrote: > > > >>Any indirect qemu state. Block migration is an example, but other > >>examples would be VNC server information (like current password), > >>WCE setting (depending on whether we modelled eeprom for the > >>drivers), and persisted device settings (lots of devices have eeprom > >>these days). > >Agreed. > > > >Why a separate location for this "stateful non-config" section, however? > > > >The state in question (backing image property, device presence, VNC > >info, etc) is already represented in either host or guest configuration, > >so why not simply expose that? > > Hrm, are you suggesting that the "stateful non-config" be hidden > directly from the user such that only existing monitor interfaces > are used to query it's contents? Yes. So that the guest can be restarted with the same VM parameters. > I'll have to think about that a bit. Obviously, the existing > commands would still be authoritative even with a query-state > command. I think the only question is whether there's value in > making this totally opaque. Why you need a query-state command at all? > > Regards, > > Anthony Liguori >
On 02/24/2011 02:54 AM, Avi Kivity wrote: > On 02/23/2011 10:18 PM, Anthony Liguori wrote: >>> Then the management stack has to worry about yet another way of >>> interacting via qemu. >> >> >> { 'StateItem': { 'key': 'str', 'value': 'str' } } >> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ >> 'StateItem' ] } } >> { 'StateInfo': { 'sections': [ 'StateSection' ] } } >> >> { 'query-state', {}, {}, 'StateInfo' } >> >> A management tool never need to worry about anything other than this >> command if it so chooses. If we have the pre-machine init mode for >> 0.16, then this can even be used to inspect state without running a >> guest. > > So we have yet another information tree. If we store the cd-rom eject > state here, then we need to make an association between the device > path of the cd-rom, and the StateItem key. And this linkage is key. Let's say I launch QEMU with: qemu -cdrom ~/foo.img And then in the monitor, I do: (qemu) eject ide1-cd0 The question is, what command can I now use to launch the same qemu instance? When I think of stateful config, what I really think of is a way to spit out a command line that essentially becomes, "this is how you now launch QEMU". In this case, it would be: qemu -cdrom ~/foo.img -device ide-disk,id=ide1-cd0,drive= Or, we could think of this in terms of: qemu -cdrom ~/foo.img -readconfig foo.cfg Where foo.cfg contained: [device "ide1-cd0"] driver="ide-disk" drive="" So what I'm really suggesting is that we generate foo.cfg whenever monitor commands do things that change the command line and introduce a new option to reflect this, IOW: qemu -cdrom ~/foo.img -config foo.cfg > Far better to store it in the device itself. For example, we could > make a layered block format driver that stores the eject state and a > "backing file" containing the actual media. Eject and media change > would be recorded in the block format driver's state. You could then > hot-unplug a USB cd-writer and hot-plug it back into a different > guest, implementing a virtual sneakernet. I think you're far too hung up on "store it in the device itself". The recipe to create the device model is not intrinsic to the device model. It's an independent thing that's a combination of the command line arguments and any executed monitor commands. Maybe a better way to think about the stateful config file is a mechanism to replay the monitor history. >> >> The fact that the state is visible in the filesystem is an >> implementation detail. > > A detail that has to be catered for by the management stack - it has > to provide a safe place for it, back it up, etc. If it cares for QEMU to preserve state. Today, this all gets thrown away. >> It doesn't work for eject unless you interpose an acknowledged >> event. Ultimately, this is a simple problem. If you want >> reliability, we either need symmetric RPCs so that the device model >> can call (and wait) to the management layer to acknowledge a change >> or QEMU can post an event to the management layer, and maintain the >> state in a reliable fashion. > > I don't see why it doesn't work. Please explain. 1) guest eject 2) qemu posts eject event 3) qemu acknowledges eject to the guest 4) management tool sees eject event and updates guest config There's a race between 3 & 4. It can only be addressed by interposing 4 between 2 and 3 OR making qemu persist this state between 2 and 3 such that the management tool can reliably query it. >>>> You still have the race condition around guest initiated events >>>> like eject. Unless you have an acknowledged event from a >>>> management tool (which we can't do in QMP today) whereas you don't >>>> complete the guest initiated eject operation until management ack's >>>> it, we need to store that state ourself. >>> >>> I don't see why. >>> >>> If management crashes, it queries the eject state when it reconnects >>> to qemu. >>> If qemu crashes, the eject state is lost, but that is fine. My >>> CD-ROM drive tray pulls itself in when the machine is started. >> >> Pick any of a number of possible events that change the machine's >> state. We can wave our hands at some things saying they don't matter >> and do one off solutions for others, or we can just have a robust way >> of handling this consistently. > > Both block live copy and cd-rom eject state can be solved with layered > block format drivers. I don't think a central place for random data > makes sense. State belongs near the device that maintains it, esp. if > the device is hot-pluggable, so it's easy to associate the state with > the device. > >>> >>> You're introducing the need for additional code in the management >>> layer, the care and feeding for the stateful non-config file. >> >> If a management layer ignores the stateful non-config file, as you >> like to call it, it'll get the same semantics it has today. I think >> managing a single thing is a whole lot easier than managing an NVRAM >> file, a block migration layering file, and all of the future things >> we're going to add once we decide they are important too. > > I disagree. Storing NVRAM as a disk image is a simple extension of > existing management tools. Block live-copy and cd-rom eject state > also make sense as per-image state if you take hotunplug and hotplug > into account. Everything can be stored in a block driver but when the data is highly structured, isn't it nice to expose it in a structured, human readable way? I know I'd personally prefer a text representation of CMOS than a binary blob. >> >>>>> If qemu crashes, these events are meaningless. If management >>>>> crashes, it has to query qemu for all state that it wants to keep >>>>> track of via events. >>>> >>>> Think power failure, not qemu crash. In the event of a power >>>> failure, any hardware change initiated by the guest ought to be >>>> consistent with when the guest has restarted. If you eject the >>>> CDROM tray and then lose power, its still ejected after the power >>>> comes back on. >>> >>> Not on all machines. >>> >>> Let's list guest state which is independent of power. That would be >>> wither NVRAM of various types, or physical alterations. CD-ROM >>> eject is one. Are there others? >> >> Any indirect qemu state. Block migration is an example, but other >> examples would be VNC server information (like current password), WCE >> setting (depending on whether we modelled eeprom for the drivers), >> and persisted device settings (lots of devices have eeprom these days). > > Device settings should be stored with the devices, not with qemu. > > Suppose we take the cold-plug on startup via the monitor approach. So > we start with a bare machine, cold plug stuff into it. Now qemu has > to reconcile the stateful non-config file with the hardware. What if > something has changed? A device moved into a different slot? Sorry, I'm confused. Is there anything in the stateful config file when we start up? If so, the act of starting up will add a bunch of hardware. > If a network card has eeprom, we can specify it with -device > rtl8139,eeprom=id, where id specifies a disk image for the eeprom. We could, but then we'll end up with a bunch of little block devices. That seems less than ideal to me. Technically, mac address is stored on eeprom and we store that as a device property today. We can't persist device properties even though you can change the mac address of a network card and it does persist across reboots. Are you advocating that we introduce an eeprom for every network card (all in a slightly different format) and have special tools to manipulate the eeprom to store and view the mac address? Regards, Anthony Liguori >>> I think my solution (multiplexing block format driver) fits the >>> requirements for live-copy perfectly. In fact it has a name - it's >>> a RAID-1 driver started in degraded mode. It could be useful other >>> use cases. >> >> It feels a bit awkward to me to be honest. >> > > Not to me. >
On Thu, Feb 24, 2011 at 10:58:10AM +0200, Avi Kivity wrote: > On 02/23/2011 07:49 PM, Marcelo Tosatti wrote: > >On Wed, Feb 23, 2011 at 03:01:14PM +0200, Avi Kivity wrote: > >> On 02/23/2011 01:14 AM, Anthony Liguori wrote: > >> > > >> >-drive already ties into the qemuopts infrastructure and we have > >> >readconfig and writeconfig. I don't think we're missing any major > >> >pieces to do this in a more proper fashion. > >> > >> The problem with qemu config files is that it splits the > >> authoritative source of where images are stored into two. Is it in > >> the management tool's database or is it in qemu's config file? > >> > >> For the problem at hand, one solution is to make qemu stop after the > >> copy, and then management can issue an additional command to > >> rearrange the disk and resume the guest. A drawback here is that if > >> management dies, the guest is stopped until it restarts. We also > >> make management latency guest visible, even if it doesn't die at an > >> inconvenient place. > >> > >> An alternative approach is to have the copy be performed by a new > >> layered block format driver: > >> > >> - create a new image, type = live-copy, containing three pieces of > >> information > >> - source image > >> - destination image > >> - copy state (initially nothing is copied) > >> - tell qemu switch to the new image There is a similar situation with atomicity here. Mgmt app requests a switch and dies immediately, before receiving the command reply. Qemu crashes. Which image is the uptodate one, source or live-copy? > >> - qemu starts copying, updates copy state as needed > >> - copy finishes, event is emitted; reads and writes still serviced > >> - management receives event, switches qemu to destination image > >> - management removes live-copy image > >> > >> If management dies while this is happening, it can simply query the > >> state of the copy. > >> Similarly, if qemu dies, the copy state is persistent (could be 0/1 or > >> real range of blocks). > > > >You don't know if a given block is uptodate or not without the dirty > >bitmap. So unless you also keep track of dirty log somehow, this is > >meaningless. > > First, you no longer need the dirty bitmap. Since all writes go > through the layered block format driver, you know first-hand what is > dirty and what isn't. > > >So a commit file as proposed indicates copy state (in 0/1 fashion). The > >difference in your proposal is that such information is stored inside a > >special purpose image format? > > > > It could also store the already synced range. > > The difference is that the file is self contained. > You could hot-unplug the image and hot-plug it later (continuing the > copy with qemu-img), Then there's no need for live copy. qemu-img does that already. > or live migrate it. You can live migrate (but not live migrate with live block migration) with live copy in progress, its just that its not supported yet. > In fact I think a qemu RAID-1 driver > removes the restriction that you can't live-migrate and live-copy > simultaneously. As mentioned its just an implementation detail.
On 02/24/2011 05:00 PM, Anthony Liguori wrote: > On 02/24/2011 02:54 AM, Avi Kivity wrote: >> On 02/23/2011 10:18 PM, Anthony Liguori wrote: >>>> Then the management stack has to worry about yet another way of >>>> interacting via qemu. >>> >>> >>> { 'StateItem': { 'key': 'str', 'value': 'str' } } >>> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ >>> 'StateItem' ] } } >>> { 'StateInfo': { 'sections': [ 'StateSection' ] } } >>> >>> { 'query-state', {}, {}, 'StateInfo' } >>> >>> A management tool never need to worry about anything other than this >>> command if it so chooses. If we have the pre-machine init mode for >>> 0.16, then this can even be used to inspect state without running a >>> guest. >> >> So we have yet another information tree. If we store the cd-rom >> eject state here, then we need to make an association between the >> device path of the cd-rom, and the StateItem key. > > And this linkage is key. > > Let's say I launch QEMU with: > > qemu -cdrom ~/foo.img > > And then in the monitor, I do: > > (qemu) eject ide1-cd0 > > The question is, what command can I now use to launch the same qemu > instance? > > When I think of stateful config, what I really think of is a way to > spit out a command line that essentially becomes, "this is how you now > launch QEMU". > > In this case, it would be: > > qemu -cdrom ~/foo.img -device ide-disk,id=ide1-cd0,drive= > > Or, we could think of this in terms of: > > qemu -cdrom ~/foo.img -readconfig foo.cfg > > Where foo.cfg contained: > > [device "ide1-cd0"] > driver="ide-disk" > drive="" > > So what I'm really suggesting is that we generate foo.cfg whenever > monitor commands do things that change the command line and introduce > a new option to reflect this, IOW: > > qemu -cdrom ~/foo.img -config foo.cfg If you move the cdrom to a different IDE channel, you have to update the stateful non-config file. Whereas if you do $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img $ qemu -cdrom ~/foo-media-tray.img the cd-rom tray state will be tracked in the image file. > >> Far better to store it in the device itself. For example, we could >> make a layered block format driver that stores the eject state and a >> "backing file" containing the actual media. Eject and media change >> would be recorded in the block format driver's state. You could then >> hot-unplug a USB cd-writer and hot-plug it back into a different >> guest, implementing a virtual sneakernet. > > I think you're far too hung up on "store it in the device itself". > The recipe to create the device model is not intrinsic to the device > model. It's an independent thing that's a combination of the command > line arguments and any executed monitor commands. > > Maybe a better way to think about the stateful config file is a > mechanism to replay the monitor history. Again the question is who is the authoritative source of the configuration. Is it the management tool or is it qemu? The management tool already has to keep track of (the optional parts of) the guest device tree. It cannot start reading the stateful non-config file at random points in time. So all that is left is the guest controlled portions of the device tree, which are pretty rare, and random events like live-copy migration. I think that introducing a new authoritative source of information will create a lot of problems. >>> >>> The fact that the state is visible in the filesystem is an >>> implementation detail. >> >> A detail that has to be catered for by the management stack - it has >> to provide a safe place for it, back it up, etc. > > If it cares for QEMU to preserve state. Today, this all gets thrown > away. Right, but we should make it easy, not hard. > >>> It doesn't work for eject unless you interpose an acknowledged >>> event. Ultimately, this is a simple problem. If you want >>> reliability, we either need symmetric RPCs so that the device model >>> can call (and wait) to the management layer to acknowledge a change >>> or QEMU can post an event to the management layer, and maintain the >>> state in a reliable fashion. >> >> I don't see why it doesn't work. Please explain. > > 1) guest eject > 2) qemu posts eject event > 3) qemu acknowledges eject to the guest > 4) management tool sees eject event and updates guest config > > There's a race between 3 & 4. It can only be addressed by interposing > 4 between 2 and 3 OR making qemu persist this state between 2 and 3 > such that the management tool can reliably query it. If "it" is my cd-rom tray block format driver, it works. It's really the same in action as the stateful non-config, except it's part of the device/image, not a central location. >> I disagree. Storing NVRAM as a disk image is a simple extension of >> existing management tools. Block live-copy and cd-rom eject state >> also make sense as per-image state if you take hotunplug and hotplug >> into account. > > Everything can be stored in a block driver but when the data is highly > structured, isn't it nice to expose it in a structured, human readable > way? I know I'd personally prefer a text representation of CMOS than > a binary blob. Have a tool expose it. Part of the range is unspecified anyway. Using a block format driver means that we don't have to care about a crash during a write, that we can snapshot it, etc. >> Device settings should be stored with the devices, not with qemu. >> >> Suppose we take the cold-plug on startup via the monitor approach. >> So we start with a bare machine, cold plug stuff into it. Now qemu >> has to reconcile the stateful non-config file with the hardware. >> What if something has changed? A device moved into a different slot? > > Sorry, I'm confused. Is there anything in the stateful config file > when we start up? If so, the act of starting up will add a bunch of > hardware. Suppose it has information about ide1-cd0's media tray. Now we restart qemu and cold-plug the cdrom into ide0-cd0. What happens to the information? Or do we store both the cdrom and media tray information in there? If so the management stack has to edit the file to make changes (perhaps via the monitor). > >> If a network card has eeprom, we can specify it with -device >> rtl8139,eeprom=id, where id specifies a disk image for the eeprom. > > We could, but then we'll end up with a bunch of little block devices. > That seems less than ideal to me. > > Technically, mac address is stored on eeprom and we store that as a > device property today. We can't persist device properties even though > you can change the mac address of a network card and it does persist > across reboots. Are you advocating that we introduce an eeprom for > every network card (all in a slightly different format) and have > special tools to manipulate the eeprom to store and view the mac address? Yes -- if we really want to support it. Obviously we have to store the entire eeprom, not just the portion containing the MAC address, so it's not just a key/value store. A card may even have its firmware in flash.
On 02/24/2011 05:14 PM, Marcelo Tosatti wrote: > > >> The problem with qemu config files is that it splits the > > >> authoritative source of where images are stored into two. Is it in > > >> the management tool's database or is it in qemu's config file? > > >> > > >> For the problem at hand, one solution is to make qemu stop after the > > >> copy, and then management can issue an additional command to > > >> rearrange the disk and resume the guest. A drawback here is that if > > >> management dies, the guest is stopped until it restarts. We also > > >> make management latency guest visible, even if it doesn't die at an > > >> inconvenient place. > > >> > > >> An alternative approach is to have the copy be performed by a new > > >> layered block format driver: > > >> > > >> - create a new image, type = live-copy, containing three pieces of > > >> information > > >> - source image > > >> - destination image > > >> - copy state (initially nothing is copied) > > >> - tell qemu switch to the new image > > There is a similar situation with atomicity here. Mgmt app requests a > switch and dies immediately, before receiving the command reply. Qemu > crashes. Which image is the uptodate one, source or live-copy? live-copy (or it's new name, RAID-1). Once you've created it it is equivalent to source. Once it switches to state=synced you can switch back to either source or destination (I guess by telling qemu to detach the one you don't want first, so it falls back to state=degraded). > > > You could hot-unplug the image and hot-plug it later (continuing the > > copy with qemu-img), > > Then there's no need for live copy. qemu-img does that already. It will start from the beginning. > > or live migrate it. > > You can live migrate (but not live migrate with live block migration) > with live copy in progress, its just that its not supported yet. A RAID-1 driver will work with block live migration too. > > In fact I think a qemu RAID-1 driver > > removes the restriction that you can't live-migrate and live-copy > > simultaneously. > > As mentioned its just an implementation detail. I think it's an important one. It moves the code from the generic layer to a driver. It allows generic RAID-1 functionality (for high availablity).
On Thu, Feb 24, 2011 at 05:28:20PM +0200, Avi Kivity wrote: > On 02/24/2011 05:14 PM, Marcelo Tosatti wrote: > >> >> The problem with qemu config files is that it splits the > >> >> authoritative source of where images are stored into two. Is it in > >> >> the management tool's database or is it in qemu's config file? > >> >> > >> >> For the problem at hand, one solution is to make qemu stop after the > >> >> copy, and then management can issue an additional command to > >> >> rearrange the disk and resume the guest. A drawback here is that if > >> >> management dies, the guest is stopped until it restarts. We also > >> >> make management latency guest visible, even if it doesn't die at an > >> >> inconvenient place. > >> >> > >> >> An alternative approach is to have the copy be performed by a new > >> >> layered block format driver: > >> >> > >> >> - create a new image, type = live-copy, containing three pieces of > >> >> information > >> >> - source image > >> >> - destination image > >> >> - copy state (initially nothing is copied) > >> >> - tell qemu switch to the new image > > > >There is a similar situation with atomicity here. Mgmt app requests a > >switch and dies immediately, before receiving the command reply. Qemu > >crashes. Which image is the uptodate one, source or live-copy? > > live-copy (or it's new name, RAID-1). Once you've created it it is > equivalent to source. Once it switches to state=synced you can > switch back to either source or destination (I guess by telling qemu > to detach the one you don't want first, so it falls back to > state=degraded). > > > > >> You could hot-unplug the image and hot-plug it later (continuing the > >> copy with qemu-img), > > > >Then there's no need for live copy. qemu-img does that already. > > It will start from the beginning. > > >> or live migrate it. > > > >You can live migrate (but not live migrate with live block migration) > >with live copy in progress, its just that its not supported yet. > > A RAID-1 driver will work with block live migration too. Nobody cares about that one (block copy and block live migration in progress). In fact, i doubt anybody cares about parallel block migration and block copy either (mgmt can easily cope with that limitation, stop live copy if migration is needed). > >> In fact I think a qemu RAID-1 driver > >> removes the restriction that you can't live-migrate and live-copy > >> simultaneously. > > > >As mentioned its just an implementation detail. I meant the restriction of live-migrate and live-copy in parallel. > I think it's an important one. It moves the code from the generic > layer to a driver. Well it is a nice idea, but devil is in the details: - Guest writes must invalidate in progress live copy reads and live copy writes, so you have to maintain a queue for live copy AIO. - Live copy writes must be aware of in progress guest AIO writes, so you have to maintain a queue for guest AIO. - Guest writes must be mirrored to source and destination. - qemu-img must handle this new format. So my view ATM is that this is overengineering. > It allows generic RAID-1 functionality (for high > availablity). Isnt HA responsability of the host filesystem?
On 02/24/2011 06:39 PM, Marcelo Tosatti wrote: > > > > > >You can live migrate (but not live migrate with live block migration) > > >with live copy in progress, its just that its not supported yet. > > > > A RAID-1 driver will work with block live migration too. > > Nobody cares about that one (block copy and block live migration in > progress). In fact, i doubt anybody cares about parallel block migration > and block copy either (mgmt can easily cope with that limitation, stop > live copy if migration is needed). It's a lot better to have an orthogonal feature set where everything works with everything. These sort of constraints aren't very good, especially as they aren't documented. The code that deals with live migration and live copy may be in different places and now we force the management tool authors to tangle it up. > > >> In fact I think a qemu RAID-1 driver > > >> removes the restriction that you can't live-migrate and live-copy > > >> simultaneously. > > > > > >As mentioned its just an implementation detail. > > I meant the restriction of live-migrate and live-copy in parallel. > > > I think it's an important one. It moves the code from the generic > > layer to a driver. > > Well it is a nice idea, but devil is in the details: > > - Guest writes must invalidate in progress live copy reads > and live copy writes, so you have to maintain a queue for live > copy AIO. You just add the locations to a queue and fire them off again when the live copy write returns. > - Live copy writes must be aware of in progress guest AIO writes, so > you have to maintain a queue for guest AIO. > - Guest writes must be mirrored to source and destination. Don't you need to do this for live-copy? Ah, you do it asynchronously from the dirty log. You can do the same with the RAID-1 driver by invalidating the destination block instead of dual-issuing the write. > - qemu-img must handle this new format. > > So my view ATM is that this is overengineering. I believe it will actually be smaller since there's no interfacing with the dirty log. > > It allows generic RAID-1 functionality (for high > > availablity). > > Isnt HA responsability of the host filesystem? So is sparseness, snapshots, etc. Yet we do them in qemu. In a perfect world we'd have just raw files (and use RAID over loop or something for live-copy).
On 02/24/2011 10:39 AM, Marcelo Tosatti wrote: > On Thu, Feb 24, 2011 at 05:28:20PM +0200, Avi Kivity wrote: > >> On 02/24/2011 05:14 PM, Marcelo Tosatti wrote: >> >>>> >> The problem with qemu config files is that it splits the >>>> >> authoritative source of where images are stored into two. Is it in >>>> >> the management tool's database or is it in qemu's config file? >>>> >> >>>> >> For the problem at hand, one solution is to make qemu stop after the >>>> >> copy, and then management can issue an additional command to >>>> >> rearrange the disk and resume the guest. A drawback here is that if >>>> >> management dies, the guest is stopped until it restarts. We also >>>> >> make management latency guest visible, even if it doesn't die at an >>>> >> inconvenient place. >>>> >> >>>> >> An alternative approach is to have the copy be performed by a new >>>> >> layered block format driver: >>>> >> >>>> >> - create a new image, type = live-copy, containing three pieces of >>>> >> information >>>> >> - source image >>>> >> - destination image >>>> >> - copy state (initially nothing is copied) >>>> >> - tell qemu switch to the new image >>>> >>> There is a similar situation with atomicity here. Mgmt app requests a >>> switch and dies immediately, before receiving the command reply. Qemu >>> crashes. Which image is the uptodate one, source or live-copy? >>> >> live-copy (or it's new name, RAID-1). Once you've created it it is >> equivalent to source. Once it switches to state=synced you can >> switch back to either source or destination (I guess by telling qemu >> to detach the one you don't want first, so it falls back to >> state=degraded). >> >> >>> >>>> You could hot-unplug the image and hot-plug it later (continuing the >>>> copy with qemu-img), >>>> >>> Then there's no need for live copy. qemu-img does that already. >>> >> It will start from the beginning. >> >> >>>> or live migrate it. >>>> >>> You can live migrate (but not live migrate with live block migration) >>> with live copy in progress, its just that its not supported yet. >>> >> A RAID-1 driver will work with block live migration too. >> > Nobody cares about that one (block copy and block live migration in > progress). In fact, i doubt anybody cares about parallel block migration > and block copy either (mgmt can easily cope with that limitation, stop > live copy if migration is needed). > > >>>> In fact I think a qemu RAID-1 driver >>>> removes the restriction that you can't live-migrate and live-copy >>>> simultaneously. >>>> >>> As mentioned its just an implementation detail. >>> > I meant the restriction of live-migrate and live-copy in parallel. > > >> I think it's an important one. It moves the code from the generic >> layer to a driver. >> > Well it is a nice idea, but devil is in the details: > > - Guest writes must invalidate in progress live copy reads > and live copy writes, so you have to maintain a queue for live > copy AIO. > - Live copy writes must be aware of in progress guest AIO writes, so > you have to maintain a queue for guest AIO. > - Guest writes must be mirrored to source and destination. > - qemu-img must handle this new format. > > So my view ATM is that this is overengineering. > Yeah, it feels like we're introducing QEMU level RAID. At what point are we going to add RAID5 support and not just RAID1. And it certainly begs the question of whether this use-case can be satisfied by just using Linux's RAID stack leaving one drive degraded and enabling the other drive when the time comes to fail over. Regards, Anthony Liguori
On 02/24/2011 09:22 AM, Avi Kivity wrote: > On 02/24/2011 05:00 PM, Anthony Liguori wrote: >> On 02/24/2011 02:54 AM, Avi Kivity wrote: >>> On 02/23/2011 10:18 PM, Anthony Liguori wrote: >>>>> Then the management stack has to worry about yet another way of >>>>> interacting via qemu. >>>> >>>> >>>> { 'StateItem': { 'key': 'str', 'value': 'str' } } >>>> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ >>>> 'StateItem' ] } } >>>> { 'StateInfo': { 'sections': [ 'StateSection' ] } } >>>> >>>> { 'query-state', {}, {}, 'StateInfo' } >>>> >>>> A management tool never need to worry about anything other than >>>> this command if it so chooses. If we have the pre-machine init >>>> mode for 0.16, then this can even be used to inspect state without >>>> running a guest. >>> >>> So we have yet another information tree. If we store the cd-rom >>> eject state here, then we need to make an association between the >>> device path of the cd-rom, and the StateItem key. >> >> And this linkage is key. >> >> Let's say I launch QEMU with: >> >> qemu -cdrom ~/foo.img >> >> And then in the monitor, I do: >> >> (qemu) eject ide1-cd0 >> >> The question is, what command can I now use to launch the same qemu >> instance? >> >> When I think of stateful config, what I really think of is a way to >> spit out a command line that essentially becomes, "this is how you >> now launch QEMU". >> >> In this case, it would be: >> >> qemu -cdrom ~/foo.img -device ide-disk,id=ide1-cd0,drive= >> >> Or, we could think of this in terms of: >> >> qemu -cdrom ~/foo.img -readconfig foo.cfg >> >> Where foo.cfg contained: >> >> [device "ide1-cd0"] >> driver="ide-disk" >> drive="" >> >> So what I'm really suggesting is that we generate foo.cfg whenever >> monitor commands do things that change the command line and introduce >> a new option to reflect this, IOW: >> >> qemu -cdrom ~/foo.img -config foo.cfg > > If you move the cdrom to a different IDE channel, you have to update > the stateful non-config file. > > Whereas if you do > > $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img > $ qemu -cdrom ~/foo-media-tray.img > > the cd-rom tray state will be tracked in the image file. Yeah, but how do you move it? If you do a remove/add through QMP, then the config file will reflect things just fine. If you want to do it outside of QEMU, then you can just ignore the config file and manage all of the state yourself. But it's never going to work as well (it will be racy) and you're pushing a tremendous amount of knowledge that ultimately belongs in QEMU (what state needs to persist) to something that isn't QEMU which means it's probably not going to be done correctly. I know you're a big fan of the omnipotent management tool but my experience has been that we need to help the management tooling folks more by expecting less from them. >>> Far better to store it in the device itself. For example, we could >>> make a layered block format driver that stores the eject state and a >>> "backing file" containing the actual media. Eject and media change >>> would be recorded in the block format driver's state. You could >>> then hot-unplug a USB cd-writer and hot-plug it back into a >>> different guest, implementing a virtual sneakernet. >> >> I think you're far too hung up on "store it in the device itself". >> The recipe to create the device model is not intrinsic to the device >> model. It's an independent thing that's a combination of the command >> line arguments and any executed monitor commands. >> >> Maybe a better way to think about the stateful config file is a >> mechanism to replay the monitor history. > > Again the question is who is the authoritative source of the > configuration. Is it the management tool or is it qemu? QEMU. No question about it. At any point in time, we are the authoritative source of what the guest's configuration is. There's no doubt about it. A management tool can try to keep up with us, but ultimately we are the only ones that know for sure. We have all of this information internally. Just persisting it is not a major architectural change. It's something we should have been doing (arguably) from the very beginning. > The management tool already has to keep track of (the optional parts > of) the guest device tree. It cannot start reading the stateful > non-config file at random points in time. So all that is left is the > guest controlled portions of the device tree, which are pretty rare, > and random events like live-copy migration. I think that introducing > a new authoritative source of information will create a lot of problems. QEMU has always been the authoritative source. Nothing new has been introduced. We never persisted the machine's configuration which meant management tools had to try to aggressively keep up with us which is intrinsically error prone. Fixing this will only improve existing management tools. > >>>> >>>> The fact that the state is visible in the filesystem is an >>>> implementation detail. >>> >>> A detail that has to be catered for by the management stack - it has >>> to provide a safe place for it, back it up, etc. >> >> If it cares for QEMU to preserve state. Today, this all gets thrown >> away. > > Right, but we should make it easy, not hard. Yeah, I fail to see how this makes it hard. We conveniently are saying, hey, this is all the state that needs to be persisted. We'll persist it for you if you want, otherwise, we'll expose it in a central location. If the tool wants to ignore it and guess based on various combinations of other commands, more power to it. >> >>>> It doesn't work for eject unless you interpose an acknowledged >>>> event. Ultimately, this is a simple problem. If you want >>>> reliability, we either need symmetric RPCs so that the device model >>>> can call (and wait) to the management layer to acknowledge a change >>>> or QEMU can post an event to the management layer, and maintain the >>>> state in a reliable fashion. >>> >>> I don't see why it doesn't work. Please explain. >> >> 1) guest eject >> 2) qemu posts eject event >> 3) qemu acknowledges eject to the guest >> 4) management tool sees eject event and updates guest config >> >> There's a race between 3 & 4. It can only be addressed by >> interposing 4 between 2 and 3 OR making qemu persist this state >> between 2 and 3 such that the management tool can reliably query it. > > If "it" is my cd-rom tray block format driver, it works. It's really > the same in action as the stateful non-config, except it's part of the > device/image, not a central location. Because you've introduced a one-off. Having a bunch of one-offs (especially being a bunch of new block formats!) is not going to make things simpler for management tools. >>> I disagree. Storing NVRAM as a disk image is a simple extension of >>> existing management tools. Block live-copy and cd-rom eject state >>> also make sense as per-image state if you take hotunplug and hotplug >>> into account. >> >> Everything can be stored in a block driver but when the data is >> highly structured, isn't it nice to expose it in a structured, human >> readable way? I know I'd personally prefer a text representation of >> CMOS than a binary blob. > > Have a tool expose it. Part of the range is unspecified anyway. I guess we need to agree to disagree then. > Using a block format driver means that we don't have to care about a > crash during a write, that we can snapshot it, etc. Why? We always need to care about a crash during write. What I've been thinking for a config file is the class approach of using a ~ and .# file to make sure that we write out the new file and then atomically rename it to get the new contents. Yeah, it's a bit heavy weight but this shouldn't be a very common thing to update. >>> Device settings should be stored with the devices, not with qemu. >>> >>> Suppose we take the cold-plug on startup via the monitor approach. >>> So we start with a bare machine, cold plug stuff into it. Now qemu >>> has to reconcile the stateful non-config file with the hardware. >>> What if something has changed? A device moved into a different slot? >> >> Sorry, I'm confused. Is there anything in the stateful config file >> when we start up? If so, the act of starting up will add a bunch of >> hardware. > > Suppose it has information about ide1-cd0's media tray. Now we > restart qemu and cold-plug the cdrom into ide0-cd0. What happens to > the information? Whether media is present is not a property of a blockdev, it's a property of a device. What does it even mean to use your media-tray format with something like a CMOS device? >> Technically, mac address is stored on eeprom and we store that as a >> device property today. We can't persist device properties even >> though you can change the mac address of a network card and it does >> persist across reboots. Are you advocating that we introduce an >> eeprom for every network card (all in a slightly different format) >> and have special tools to manipulate the eeprom to store and view the >> mac address? > > > Yes -- if we really want to support it. Obviously we have to store > the entire eeprom, not just the portion containing the MAC address, so > it's not just a key/value store. A card may even have its firmware in > flash. I think that's overengineering. I think we can go very far by just persisting small amounts of information in a central location. We're not building a cycle-accurate simulator here afterall. Regards, Anthony Liguori
On Wed, Feb 23, 2011 at 3:31 PM, Avi Kivity <avi@redhat.com> wrote: > On 02/23/2011 04:35 PM, Anthony Liguori wrote: >> >> QEMU uses the state database to store information that is created >> dynamically. Â For instance, devices added through device_add. Â A device >> added via -device wouldn't necessary get added to the state database. >> >> Practically speaking, it let's you invoke QEMU with a fixed command line, >> while still using the monitor to make changes that would otherwise require >> the command line being updated. > > Then the invoker quickly loses track of what the actual state is. Â It can't > just remember which commands it issued (presumably in response to the user > updating user visible state). Â It has to parse the stateful config file qemu > outputs. Â But at which points should it parse it? > > I don't think it's reasonable to have three different ways to interact with > qemu, all needed: the command line, reading and writing the stateful config > file, and the monitor. Â I'd rather push for starting qemu with a blank guest > and assembling (cold-plugging) all the hardware via the monitor before > starting the guest. This is why I asked about unifying the stateful config with qdev or QemuOpts in the machine init mailing list thread. If we have n different ways of toggling things inside QEMU including command-line, HMP, QMP, and stateful config, then a large chunk of QEMU is just going to be parsing and formatting these external interfaces. Stefan
On Wed, Feb 23, 2011 at 01:06:46PM -0600, Anthony Liguori wrote: > On 02/22/2011 11:00 AM, Marcelo Tosatti wrote: > >Index: qemu/qerror.h > >=================================================================== > >--- qemu.orig/qerror.h > >+++ qemu/qerror.h > >@@ -171,4 +171,13 @@ QError *qobject_to_qerror(const QObject > > #define QERR_VNC_SERVER_FAILED \ > > "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }" > > > >+#define QERR_BLOCKCOPY_IN_PROGRESS \ > >+ "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }" > > The caller already knows the device name by virtue of issuing the > command so this is redundant. > > I think a better approach would be a QERR_IN_PROGRESS 'data': { > 'operation': %s } > > For block copy, we'd say QERR_IN_PROGRESS("block copy"). > > >+ > >+#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \ > >+ "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }" > >+ > >+#define QERR_MIGRATION_IN_PROGRESS \ > >+ "{ 'class': 'MigrationInProgress', 'data': {} }" > > Then QERR_IN_PROGRESS("live migration") Can the error format change like that? What about applications that make use of it? If it can change, sure. (libvirt.git does not seem to be aware of MigrationInProgress). > > #endif /* QERROR_H */ > >Index: qemu/qmp-commands.hx > >=================================================================== > >--- qemu.orig/qmp-commands.hx > >+++ qemu/qmp-commands.hx > >@@ -581,6 +581,75 @@ Example: > > EQMP > > > > { > >+ .name = "block_copy", > >+ .args_type = "device:s,filename:s,commit_filename:s?,inc:-i", > >+ .params = "device filename [commit_filename] [-i]", > >+ .help = "live block copy device to image" > >+ "\n\t\t\t optional commit filename " > >+ "\n\t\t\t -i for incremental copy " > >+ "(base image shared between src and destination)", > >+ .user_print = monitor_user_noop, > >+ .mhandler.cmd_new = do_bdrv_copy, > >+ }, > >+ > >+SQMP > >+block-copy > >+------- > >+ > >+Live block copy. > > I'm not sure copy really describes what we're doing here. Maybe > migrate-block? Problem its easy to confuse "migrate-block" with "block migration". I could not come up with a better, non-confusing name than "live block copy". > >+Arguments: > >+ > >+- "device": device name (json-string) > >+- "filename": target image filename (json-string) > > Is this a created image? Is this an image to create? A previously created image. > To future proof for blockdev, we should make this argument optional > and if it's not specified throw an error about missing argument. > This let's us introduce an optional blockdev argument such that we > can use a blockdev name. What you mean "blockdev"? > > >+- "commit_filename": target commit filename (json-string, optional) > > I think we should drop this. Why? Sorry but this can't wait for non-config persistent storage. This mistake was made in the past with irqchip for example, lets not repeat it. Its OK to deprecate "commit_filename" in favour of its location in non-config persistent storage. Its not the end of the world for a mgmt app to handle change (not saying its not a good principle) such as this. > >+- "inc": incremental disk copy (json-bool, optional) > > Let's use the full name (incremental) and we need to describe in > detail what the semantics of this are. Will it scan the target > block device to identify identical blocks? No, it does not attempt to identify identical blocks, yet. You are right, i'll write down a document to describe these details. > > >+Example: > >+ > >+-> { "execute": "block_copy", > >+ "arguments": { "device": "ide0-hd1", > >+ "filename": "/mnt/new-disk.img", > >+ "commit_filename: "/mnt/commit-new-disk.img" > >+ } } > >+ > >+<- { "return": {} } > >+ > >+Notes: > >+ > >+(1) The 'query-block-copy' command should be used to check block copy progress > >+ and final result (this information is provided by the 'status' member) > >+(2) Boolean argument "inc" defaults to false > > We should also document error semantics. What errors are expected and why? Fair. > >+EQMP > >+ > >+ { > >+ .name = "block_copy_cancel", > >+ .args_type = "device:s", > >+ .params = "device", > >+ .help = "cancel live block copy", > >+ .user_print = monitor_user_noop, > >+ .mhandler.cmd_new = do_bdrv_copy_cancel, > >+ }, > >+ > >+SQMP > >+block_copy_cancel > >+-------------- > >+ > >+Cancel live block copy. > >+ > >+Arguments: > >+ > >+- device: device name (json-string) > >+ > >+Example: > >+ > >+-> { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } } > >+<- { "return": {} } > > cancel-block-migration? Again, conflicts with "block migration" from live migration. > What happens if: > - No block copy is active anymore (it's completed) cancel succeeds. > - A block copy was never started qerror_report(QERR_DEVICE_NOT_FOUND, device); > - device refers to a device that no longer exists qerror_report(QERR_DEVICE_NOT_FOUND, device); > - device refers to a device with no media present Right, this should be dealt with. > If this command succeeds, what the state of target? It will be used as the image backing the guest block device. But probably i don't understand your question. > If I resume the > operation with the incremental flag set, will it Just Work? As in: - block_copy ide0-hd1 /mnt/aa.img - block_copy ide0-hd1 /mnt/aa.img -i ? The second command will fail with QERR_BLOCKCOPY_IN_PROGRESS. > >+------------- > >+ > >+Live block copy status. > >+ > >+Each block copy instance information is stored in a json-object and the returned > >+value is a json-array of all instances. > >+ > >+Each json-object contains the following: > >+ > >+- "device": device name (json-string) > >+- "status": block copy status (json-string) > >+ - Possible values: "active", "failed", "completed" > >+- "info": A json-object with the statistics information, if status is "active": > >+ - "percentage": percentage completed (json-int) > > So let's fold this into query-block > > And let's also report the statistics as MigrationStats which is the format: > > { 'MigrationStats': {'transferred': 'int', 'remaining': 'int', > 'total': 'int' } } > > So I'd propose changing: > > { 'BlockDeviceStats': {'rd_bytes': 'int', 'wr_bytes': 'int', > 'rd_operations': 'int', 'wr_operations': 'int', > 'wr_highest_offset': 'int' } } > { 'BlockStats': {'*device': 'str', 'stats': 'BlockDeviceStats', > '*parent': 'BlockStats'} } > > [ 'query-blockstats', {}, {}, ['BlockStats'] ] > > To: > > { 'BlockMigrationStatus': [ 'active', 'inactive', 'cancelled' ] } > > { 'BlockMigrationInfo': {'status': 'BlockMigrationStatus', 'stats': > 'BlockMigrationStats', '*target_filename': 'str' } } > > { 'BlockDeviceStats': {'rd_bytes': 'int', 'wr_bytes': 'int', > 'rd_operations': 'int', 'wr_operations': 'int', > 'wr_highest_offset': 'int', '*migration': > 'BlockMigrationInfo' } } > { 'BlockStats': {'*device': 'str', 'stats': 'BlockDeviceStats', > '*parent': 'BlockStats'} } > > [ 'query-blockstats', {}, {}, ['BlockStats'] ] That makes sense, other than the name conflict already mentioned. Thanks
On 02/25/2011 06:02 PM, Marcelo Tosatti wrote: > On Wed, Feb 23, 2011 at 01:06:46PM -0600, Anthony Liguori wrote: > >> On 02/22/2011 11:00 AM, Marcelo Tosatti wrote: >> >>> Index: qemu/qerror.h >>> =================================================================== >>> --- qemu.orig/qerror.h >>> +++ qemu/qerror.h >>> @@ -171,4 +171,13 @@ QError *qobject_to_qerror(const QObject >>> #define QERR_VNC_SERVER_FAILED \ >>> "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }" >>> >>> +#define QERR_BLOCKCOPY_IN_PROGRESS \ >>> + "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }" >>> >> The caller already knows the device name by virtue of issuing the >> command so this is redundant. >> >> I think a better approach would be a QERR_IN_PROGRESS 'data': { >> 'operation': %s } >> >> For block copy, we'd say QERR_IN_PROGRESS("block copy"). >> >> >>> + >>> +#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \ >>> + "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }" >>> + >>> +#define QERR_MIGRATION_IN_PROGRESS \ >>> + "{ 'class': 'MigrationInProgress', 'data': {} }" >>> >> Then QERR_IN_PROGRESS("live migration") >> > Can the error format change like that? What about applications that make > use of it? If it can change, sure. (libvirt.git does not seem to be > aware of MigrationInProgress). > You're adding MigrationInProgress in this patch so I don't see what could be using it. > >>> #endif /* QERROR_H */ >>> Index: qemu/qmp-commands.hx >>> =================================================================== >>> --- qemu.orig/qmp-commands.hx >>> +++ qemu/qmp-commands.hx >>> @@ -581,6 +581,75 @@ Example: >>> EQMP >>> >>> { >>> + .name = "block_copy", >>> + .args_type = "device:s,filename:s,commit_filename:s?,inc:-i", >>> + .params = "device filename [commit_filename] [-i]", >>> + .help = "live block copy device to image" >>> + "\n\t\t\t optional commit filename " >>> + "\n\t\t\t -i for incremental copy " >>> + "(base image shared between src and destination)", >>> + .user_print = monitor_user_noop, >>> + .mhandler.cmd_new = do_bdrv_copy, >>> + }, >>> + >>> +SQMP >>> +block-copy >>> +------- >>> + >>> +Live block copy. >>> >> I'm not sure copy really describes what we're doing here. Maybe >> migrate-block? >> > Problem its easy to confuse "migrate-block" with "block migration". I > could not come up with a better, non-confusing name than "live block > copy". > Yeah, I don't think I like my suggested names any better to be honest. >>> +Arguments: >>> + >>> +- "device": device name (json-string) >>> +- "filename": target image filename (json-string) >>> >> Is this a created image? Is this an image to create? >> > A previously created image. > > >> To future proof for blockdev, we should make this argument optional >> and if it's not specified throw an error about missing argument. >> This let's us introduce an optional blockdev argument such that we >> can use a blockdev name. >> > What you mean "blockdev"? > -drive file=image.qcow2,if=none,id=foo 'foo' is a named blockdev. We don't have a way to add these through the monitor (yet) but we will for 0.15. >>> +- "commit_filename": target commit filename (json-string, optional) >>> >> I think we should drop this. >> > Why? Sorry but this can't wait for non-config persistent storage. This > mistake was made in the past with irqchip for example, lets not repeat > it. > > Its OK to deprecate "commit_filename" in favour of its location in > non-config persistent storage. > > Its not the end of the world for a mgmt app to handle change (not saying > its not a good principle) such as this. > Even as a one off, it's not a very good solution to the problem. We'd be way better of just having nothing here than using the commit file. What are the semantics of a half written file? How does a management tool detect a half written file? >> What happens if: >> - No block copy is active anymore (it's completed) >> > cancel succeeds. > I think it would be better to fail. >> - device refers to a device with no media present >> > Right, this should be dealt with. > > >> If this command succeeds, what the state of target? >> > It will be used as the image backing the guest block device. But > probably i don't understand your question. > If I execute block-copy, then I do block-copy-cancel, if it succeeds, am I guaranteed that we're still using the old block device? (The answer is no because we don't error out if the migration has completed already). >> If I resume the >> operation with the incremental flag set, will it Just Work? >> > As in: > > - block_copy ide0-hd1 /mnt/aa.img > - block_copy ide0-hd1 /mnt/aa.img -i > > ? > > The second command will fail with QERR_BLOCKCOPY_IN_PROGRESS. > No, as in: block_copy ide0-hd1 /mnt/aa.img block_copy_cancel ide0-h1 block_copy ide0-h1 /mnt/aa.img -i Does it pick up where it left off or does it start all over again? Regards, Anthony Liguori
On 02/25/2011 06:02 PM, Marcelo Tosatti wrote: >>> +- "commit_filename": target commit filename (json-string, optional) >>> >> I think we should drop this. >> > Why? Sorry but this can't wait for non-config persistent storage. This > mistake was made in the past with irqchip for example, lets not repeat > it. > > Its OK to deprecate "commit_filename" in favour of its location in > non-config persistent storage. > > Its not the end of the world for a mgmt app to handle change (not saying > its not a good principle) such as this If you're looking for a stop-gap, I'd suggest adding an optional "action" parameter. This parameter would take a string with valid values of "stop", "none". If action=stop, then when block copy finishes, the guest will be stopped and an event will be sent. This lets a management tool get notified, update it's config, and then continue the guest. If the management tool crashes, it can do a query-block-copy to see that the copy finished along with a query-status to fix the guest. This is more in line with how our other commands work. Regards, Anthony Liguori
On 02/24/2011 07:58 PM, Anthony Liguori wrote: >> If you move the cdrom to a different IDE channel, you have to update >> the stateful non-config file. >> >> Whereas if you do >> >> $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img >> $ qemu -cdrom ~/foo-media-tray.img >> >> the cd-rom tray state will be tracked in the image file. > > > Yeah, but how do you move it? There is no need to move the file at all. Simply point the new drive at the media tray. > If you do a remove/add through QMP, then the config file will reflect > things just fine. If all access to the state file is through QMP then it becomes more palatable. A bit on that later. > > If you want to do it outside of QEMU, then you can just ignore the > config file and manage all of the state yourself. But it's never > going to work as well (it will be racy) and you're pushing a > tremendous amount of knowledge that ultimately belongs in QEMU (what > state needs to persist) to something that isn't QEMU which means it's > probably not going to be done correctly. > > I know you're a big fan of the omnipotent management tool but my > experience has been that we need to help the management tooling folks > more by expecting less from them. I thought that's what I'm doing by separating the state out. It's easy for management to assemble configuration from their database and convert it into a centralized representation (like a qemu command line). It's a lot harder to disassemble a central state representation and move it back to the database. Using QMP is better than directly accessing the state file since qemu does the disassembly for you (provided the command references the device using its normal path, not some random key). The file just becomes a way to survive a crash, and all management needs to know about is to make it available and back it up. But it means that everything must be done via QMP, including assembly of the machine, otherwise the state file can become stale. Separating the state out to the device is even easier, since management is already expected to take care of disk images. All that's needed is to create the media tray image once, then you can forget about it completely. >> >> Again the question is who is the authoritative source of the >> configuration. Is it the management tool or is it qemu? > > QEMU. No question about it. At any point in time, we are the > authoritative source of what the guest's configuration is. There's no > doubt about it. A management tool can try to keep up with us, but > ultimately we are the only ones that know for sure. > > We have all of this information internally. Just persisting it is not > a major architectural change. It's something we should have been > doing (arguably) from the very beginning. That's a huge divergence from how management tools are written. Currently they contain the required guest configuration, a representation of what's the current live configuration, and they issue monitor commands to move the live configuration towards the required configuration (or just generate a qemu command line). What you're describing is completely different, I'm not even sure what it is. > >> The management tool already has to keep track of (the optional parts >> of) the guest device tree. It cannot start reading the stateful >> non-config file at random points in time. So all that is left is the >> guest controlled portions of the device tree, which are pretty rare, >> and random events like live-copy migration. I think that introducing >> a new authoritative source of information will create a lot of problems. > > QEMU has always been the authoritative source. Nothing new has been > introduced. We never persisted the machine's configuration which > meant management tools had to try to aggressively keep up with us > which is intrinsically error prone. Fixing this will only improve > existing management tools. If you look at management tools, they believe they are the authoritative source of configuration information (not guest state, which is more or less ignored). >> >> Right, but we should make it easy, not hard. > > Yeah, I fail to see how this makes it hard. We conveniently are > saying, hey, this is all the state that needs to be persisted. We'll > persist it for you if you want, otherwise, we'll expose it in a > central location. The state-in-a-file is just a blob. Don't expect the tool to parse it and reassociate the various bits to its own representation. Exposing it via QMP commands is a lot better though. > If the tool wants to ignore it and guess based on various combinations > of other commands, more power to it. >>>> I don't see why it doesn't work. Please explain. >>> >>> 1) guest eject >>> 2) qemu posts eject event >>> 3) qemu acknowledges eject to the guest >>> 4) management tool sees eject event and updates guest config >>> >>> There's a race between 3 & 4. It can only be addressed by >>> interposing 4 between 2 and 3 OR making qemu persist this state >>> between 2 and 3 such that the management tool can reliably query it. >> >> If "it" is my cd-rom tray block format driver, it works. It's really >> the same in action as the stateful non-config, except it's part of >> the device/image, not a central location. > > Because you've introduced a one-off. Having a bunch of one-offs > (especially being a bunch of new block formats!) is not going to make > things simpler for management tools. I don't see it as a one-off. We have some state, we store it in a state file. Like we store each image in its own file. >> >> Have a tool expose it. Part of the range is unspecified anyway. > > I guess we need to agree to disagree then. We can just extend our existing disagreement agreement. > >> Using a block format driver means that we don't have to care about a >> crash during a write, that we can snapshot it, etc. > > Why? Because block format drivers are already prepared for it. > We always need to care about a crash during write. What I've been > thinking for a config file is the class approach of using a ~ and .# > file to make sure that we write out the new file and then atomically > rename it to get the new contents. Yeah, it's a bit heavy weight but > this shouldn't be a very common thing to update. > >> >> Suppose it has information about ide1-cd0's media tray. Now we >> restart qemu and cold-plug the cdrom into ide0-cd0. What happens to >> the information? > > Whether media is present is not a property of a blockdev, it's a > property of a device. You're making it the property of a device path (I guess you could make it the property of a device's id). > What does it even mean to use your media-tray format with something > like a CMOS device? Nothing. > >>> Technically, mac address is stored on eeprom and we store that as a >>> device property today. We can't persist device properties even >>> though you can change the mac address of a network card and it does >>> persist across reboots. Are you advocating that we introduce an >>> eeprom for every network card (all in a slightly different format) >>> and have special tools to manipulate the eeprom to store and view >>> the mac address? >> >> >> Yes -- if we really want to support it. Obviously we have to store >> the entire eeprom, not just the portion containing the MAC address, >> so it's not just a key/value store. A card may even have its >> firmware in flash. > > I think that's overengineering. I think we can go very far by just > persisting small amounts of information in a central location. We're > not building a cycle-accurate simulator here afterall. Well, the whole thing is rather theoretical.
On 02/24/2011 07:45 PM, Anthony Liguori wrote: > > Yeah, it feels like we're introducing QEMU level RAID. We already did, with sheepdog (well the RAID code is really outside qemu itself). > At what point are we going to add RAID5 support and not just RAID1. > And it certainly begs the question of whether this use-case can be > satisfied by just using Linux's RAID stack leaving one drive degraded > and enabling the other drive when the time comes to fail over. The magic word is btrfs, we already have atomic live copy outside qemu ('cp --reflink'). If you also want an atomic switch you have to move it inside qemu so you can switch to the new file. Using one of the Linux RAID-1 implementations (md, dm, or drbd) is also possible but these are really designed for block devices, not files.
On 02/27/2011 11:10 AM, Avi Kivity wrote: > On 02/24/2011 07:58 PM, Anthony Liguori wrote: >>> If you move the cdrom to a different IDE channel, you have to update >>> the stateful non-config file. >>> >>> Whereas if you do >>> >>> $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img >>> $ qemu -cdrom ~/foo-media-tray.img >>> >>> the cd-rom tray state will be tracked in the image file. >> >> >> Yeah, but how do you move it? > > There is no need to move the file at all. Simply point the new drive at > the media tray. > >> If you do a remove/add through QMP, then the config file will reflect >> things just fine. > > If all access to the state file is through QMP then it becomes more > palatable. A bit on that later. > >> >> If you want to do it outside of QEMU, then you can just ignore the >> config file and manage all of the state yourself. But it's never going >> to work as well (it will be racy) and you're pushing a tremendous >> amount of knowledge that ultimately belongs in QEMU (what state needs >> to persist) to something that isn't QEMU which means it's probably not >> going to be done correctly. >> >> I know you're a big fan of the omnipotent management tool but my >> experience has been that we need to help the management tooling folks >> more by expecting less from them. > > I thought that's what I'm doing by separating the state out. It's easy > for management to assemble configuration from their database and convert > it into a centralized representation (like a qemu command line). It's a > lot harder to disassemble a central state representation and move it > back to the database. > > Using QMP is better than directly accessing the state file since qemu > does the disassembly for you (provided the command references the device > using its normal path, not some random key). The file just becomes a way > to survive a crash, and all management needs to know about is to make it > available and back it up. But it means that everything must be done via > QMP, including assembly of the machine, otherwise the state file can > become stale. > > Separating the state out to the device is even easier, since management > is already expected to take care of disk images. All that's needed is to > create the media tray image once, then you can forget about it completely. > >>> >>> Again the question is who is the authoritative source of the >>> configuration. Is it the management tool or is it qemu? >> >> QEMU. No question about it. At any point in time, we are the >> authoritative source of what the guest's configuration is. There's no >> doubt about it. A management tool can try to keep up with us, but >> ultimately we are the only ones that know for sure. >> >> We have all of this information internally. Just persisting it is not >> a major architectural change. It's something we should have been doing >> (arguably) from the very beginning. > > That's a huge divergence from how management tools are written. > Currently they contain the required guest configuration, a > representation of what's the current live configuration, and they issue > monitor commands to move the live configuration towards the required > configuration (or just generate a qemu command line). What you're > describing is completely different, I'm not even sure what it is. > >> >>> The management tool already has to keep track of (the optional parts >>> of) the guest device tree. It cannot start reading the stateful >>> non-config file at random points in time. So all that is left is the >>> guest controlled portions of the device tree, which are pretty rare, >>> and random events like live-copy migration. I think that introducing >>> a new authoritative source of information will create a lot of problems. >> >> QEMU has always been the authoritative source. Nothing new has been >> introduced. We never persisted the machine's configuration which meant >> management tools had to try to aggressively keep up with us which is >> intrinsically error prone. Fixing this will only improve existing >> management tools. > > If you look at management tools, they believe they are the authoritative > source of configuration information (not guest state, which is more or > less ignored). > >>> >>> Right, but we should make it easy, not hard. >> >> Yeah, I fail to see how this makes it hard. We conveniently are >> saying, hey, this is all the state that needs to be persisted. We'll >> persist it for you if you want, otherwise, we'll expose it in a >> central location. > > The state-in-a-file is just a blob. Don't expect the tool to parse it > and reassociate the various bits to its own representation. Exposing it > via QMP commands is a lot better though. > >> If the tool wants to ignore it and guess based on various combinations >> of other commands, more power to it. >>>>> I don't see why it doesn't work. Please explain. >>>> >>>> 1) guest eject >>>> 2) qemu posts eject event >>>> 3) qemu acknowledges eject to the guest >>>> 4) management tool sees eject event and updates guest config >>>> >>>> There's a race between 3 & 4. It can only be addressed by >>>> interposing 4 between 2 and 3 OR making qemu persist this state >>>> between 2 and 3 such that the management tool can reliably query it. >>> >>> If "it" is my cd-rom tray block format driver, it works. It's really >>> the same in action as the stateful non-config, except it's part of >>> the device/image, not a central location. >> >> Because you've introduced a one-off. Having a bunch of one-offs >> (especially being a bunch of new block formats!) is not going to make >> things simpler for management tools. > > I don't see it as a one-off. We have some state, we store it in a state > file. Like we store each image in its own file. > >>> >>> Have a tool expose it. Part of the range is unspecified anyway. >> >> I guess we need to agree to disagree then. > > We can just extend our existing disagreement agreement. > >> >>> Using a block format driver means that we don't have to care about a >>> crash during a write, that we can snapshot it, etc. >> >> Why? > > Because block format drivers are already prepared for it. > >> We always need to care about a crash during write. What I've been >> thinking for a config file is the class approach of using a ~ and .# >> file to make sure that we write out the new file and then atomically >> rename it to get the new contents. Yeah, it's a bit heavy weight but >> this shouldn't be a very common thing to update. > >> >>> >>> Suppose it has information about ide1-cd0's media tray. Now we >>> restart qemu and cold-plug the cdrom into ide0-cd0. What happens to >>> the information? >> >> Whether media is present is not a property of a blockdev, it's a >> property of a device. > > You're making it the property of a device path (I guess you could make > it the property of a device's id). > >> What does it even mean to use your media-tray format with something >> like a CMOS device? > > Nothing. > >> >>>> Technically, mac address is stored on eeprom and we store that as a >>>> device property today. We can't persist device properties even >>>> though you can change the mac address of a network card and it does >>>> persist across reboots. Are you advocating that we introduce an >>>> eeprom for every network card (all in a slightly different format) >>>> and have special tools to manipulate the eeprom to store and view >>>> the mac address? >>> >>> >>> Yes -- if we really want to support it. Obviously we have to store >>> the entire eeprom, not just the portion containing the MAC address, >>> so it's not just a key/value store. A card may even have its firmware >>> in flash. >> >> I think that's overengineering. I think we can go very far by just >> persisting small amounts of information in a central location. We're >> not building a cycle-accurate simulator here afterall. > > Well, the whole thing is rather theoretical. What about a simpler approach were QMP events will be written to a event-log-file (or even named pipe). This way apps will have the option of reading the log event log post mgmt crash and will be able to realize the correct guest state. This can solve the NVRAM changes and also the block live copy. It also keeps the management model the same and there is no need to mange these files since the persistent info is kep by the management else where.
On 02/27/2011 03:55 AM, Dor Laor wrote: > What about a simpler approach were QMP events will be written to a > event-log-file (or even named pipe). The management tool can just use a small daemon that does nothing other than write QMP events to a log. There's no real need for this code to live in QEMU. Since events are posted, even if we wrote it in QEMU, the event wouldn't be guaranteed on disk by the time the event invocation returns. Regards, Anthony Liguori
On 02/27/2011 03:10 AM, Avi Kivity wrote: > On 02/24/2011 07:58 PM, Anthony Liguori wrote: >>> If you move the cdrom to a different IDE channel, you have to update >>> the stateful non-config file. >>> >>> Whereas if you do >>> >>> $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img >>> $ qemu -cdrom ~/foo-media-tray.img >>> >>> the cd-rom tray state will be tracked in the image file. >> >> >> Yeah, but how do you move it? > > There is no need to move the file at all. Simply point the new drive > at the media tray. No, I was asking, how do you move the cdrom to a different IDE channel. Are you using QMP? Are you changing the command line arguments? > >> If you do a remove/add through QMP, then the config file will reflect >> things just fine. > > If all access to the state file is through QMP then it becomes more > palatable. A bit on that later. As I think I've mentioned before, I hadn't really thought about an opaque state file but I'm not necessary opposed to it. I don't see an obvious advantage to making it opaque but I agree it should be accessible via QMP. >> If you want to do it outside of QEMU, then you can just ignore the >> config file and manage all of the state yourself. But it's never >> going to work as well (it will be racy) and you're pushing a >> tremendous amount of knowledge that ultimately belongs in QEMU (what >> state needs to persist) to something that isn't QEMU which means it's >> probably not going to be done correctly. >> >> I know you're a big fan of the omnipotent management tool but my >> experience has been that we need to help the management tooling folks >> more by expecting less from them. > > I thought that's what I'm doing by separating the state out. It's > easy for management to assemble configuration from their database and > convert it into a centralized representation (like a qemu command > line). It's a lot harder to disassemble a central state > representation and move it back to the database. > > Using QMP is better than directly accessing the state file since qemu > does the disassembly for you (provided the command references the > device using its normal path, not some random key). The file just > becomes a way to survive a crash, and all management needs to know > about is to make it available and back it up. But it means that > everything must be done via QMP, including assembly of the machine, > otherwise the state file can become stale. > > Separating the state out to the device is even easier, since > management is already expected to take care of disk images. All > that's needed is to create the media tray image once, then you can > forget about it completely. Except that instead of having one state file, we might have a dozen additional "device state" files. >>> >>> Again the question is who is the authoritative source of the >>> configuration. Is it the management tool or is it qemu? >> >> QEMU. No question about it. At any point in time, we are the >> authoritative source of what the guest's configuration is. There's >> no doubt about it. A management tool can try to keep up with us, but >> ultimately we are the only ones that know for sure. >> >> We have all of this information internally. Just persisting it is >> not a major architectural change. It's something we should have been >> doing (arguably) from the very beginning. > > That's a huge divergence from how management tools are written. This is one of the reasons why management tooling around QEMU needs quite a bit of improving. There is simply no way a management tool can do a good job of being an authoritative source of configuration. The races we're discussion is a good example of why. But beyond those races, QEMU is the only entity that knows with certainty what bits of information are important to persist in order to preserve a guest across shutdown/restart. The fact that we've punted this problem for so long has only ensured that management tools are either intrinsically broken or only support the most minimal subset of functionality we actually support. > Currently they contain the required guest configuration, a > representation of what's the current live configuration, and they > issue monitor commands to move the live configuration towards the > required configuration (or just generate a qemu command line). What > you're describing is completely different, I'm not even sure what it is. Management tools shouldn't have to think about how the monitor commands they issue impact the invocation options of QEMU. > >> >>> The management tool already has to keep track of (the optional parts >>> of) the guest device tree. It cannot start reading the stateful >>> non-config file at random points in time. So all that is left is >>> the guest controlled portions of the device tree, which are pretty >>> rare, and random events like live-copy migration. I think that >>> introducing a new authoritative source of information will create a >>> lot of problems. >> >> QEMU has always been the authoritative source. Nothing new has been >> introduced. We never persisted the machine's configuration which >> meant management tools had to try to aggressively keep up with us >> which is intrinsically error prone. Fixing this will only improve >> existing management tools. > > If you look at management tools, they believe they are the > authoritative source of configuration information (not guest state, > which is more or less ignored). It's because we've given them no other option. >>> >>> Right, but we should make it easy, not hard. >> >> Yeah, I fail to see how this makes it hard. We conveniently are >> saying, hey, this is all the state that needs to be persisted. We'll >> persist it for you if you want, otherwise, we'll expose it in a >> central location. > > The state-in-a-file is just a blob. Don't expect the tool to parse it > and reassociate the various bits to its own representation. Exposing > it via QMP commands is a lot better though. I don't really see this as being a major issue. There's no such thing as a "blob". If someone wants to manipulate the state, they will. We need to keep compatibility to support migrating from version-to-version. I agree that we want to provide QMP interfaces to work with the state file. But I don't think we should be hostile to manual manipulation. Regards, Anthony Liguori
On 02/27/2011 04:00 PM, Anthony Liguori wrote: > On 02/27/2011 03:10 AM, Avi Kivity wrote: >> On 02/24/2011 07:58 PM, Anthony Liguori wrote: >>>> If you move the cdrom to a different IDE channel, you have to >>>> update the stateful non-config file. >>>> >>>> Whereas if you do >>>> >>>> $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img >>>> $ qemu -cdrom ~/foo-media-tray.img >>>> >>>> the cd-rom tray state will be tracked in the image file. >>> >>> >>> Yeah, but how do you move it? >> >> There is no need to move the file at all. Simply point the new drive >> at the media tray. > > No, I was asking, how do you move the cdrom to a different IDE > channel. Are you using QMP? Are you changing the command line > arguments? Yes. If we're doing hot-move (not really relevant to ide-cd) then you'd use QMP. If you're editing a virtual machine that is down, or scheduling a change for the next reboot, then you're using command line arguments (or cold-plugging into a stopped guest). Requiring management to remember the old configuration and issue delta commands to move the device for the cold-plug case is increased complexity IMO. > >> >>> If you do a remove/add through QMP, then the config file will >>> reflect things just fine. >> >> If all access to the state file is through QMP then it becomes more >> palatable. A bit on that later. > > As I think I've mentioned before, I hadn't really thought about an > opaque state file but I'm not necessary opposed to it. I don't see an > obvious advantage to making it opaque but I agree it should be > accessible via QMP. The advantage is that we keep the management tool talking to one interface (I don't think we should prevent users from interpreting it, just make it unnecessary). >> >> I thought that's what I'm doing by separating the state out. It's >> easy for management to assemble configuration from their database and >> convert it into a centralized representation (like a qemu command >> line). It's a lot harder to disassemble a central state >> representation and move it back to the database. >> >> Using QMP is better than directly accessing the state file since qemu >> does the disassembly for you (provided the command references the >> device using its normal path, not some random key). The file just >> becomes a way to survive a crash, and all management needs to know >> about is to make it available and back it up. But it means that >> everything must be done via QMP, including assembly of the machine, >> otherwise the state file can become stale. >> >> Separating the state out to the device is even easier, since >> management is already expected to take care of disk images. All >> that's needed is to create the media tray image once, then you can >> forget about it completely. > > Except that instead of having one state file, we might have a dozen > additional "device state" files. That is fine. We already have one state file per block device. >>> QEMU. No question about it. At any point in time, we are the >>> authoritative source of what the guest's configuration is. There's >>> no doubt about it. A management tool can try to keep up with us, >>> but ultimately we are the only ones that know for sure. >>> >>> We have all of this information internally. Just persisting it is >>> not a major architectural change. It's something we should have >>> been doing (arguably) from the very beginning. >> >> That's a huge divergence from how management tools are written. > > This is one of the reasons why management tooling around QEMU needs > quite a bit of improving. > > There is simply no way a management tool can do a good job of being an > authoritative source of configuration. The races we're discussion is > a good example of why. What we're discussing is not configuration. It is non-volatile state. Configuration comes from the user; state comes from the guest (the management tool may edit state; but the guest cannot edit the configuration). I agree 100% the management tool cannot be the authoritative source of state. My position is: - the management tool should be 100% in control of configuration (how the guest is put together from its components) - qemu should be 100% in control of state (memory, disk state, NVRAM in various components, cd-rom eject state, explosive bolts for payload separation, self-destruct mechanism, etc.) - the management tool should have access to state using the same identifiers it used to create the devices that contain the state - it is preferable to store state "in" the device so that when the configuration changes, state is maintained (like hot-unplug of a network card with NVRAM followed by hot-plug of the same card). - the angular momentum of the planet we (presumably) are on won't change, whatever we do [1] > > But beyond those races, QEMU is the only entity that knows with > certainty what bits of information are important to persist in order > to preserve a guest across shutdown/restart. The fact that we've > punted this problem for so long has only ensured that management tools > are either intrinsically broken or only support the most minimal > subset of functionality we actually support. I'm not arguing about that. I just want to stress again the difference between state and configuration. Qemu has no authority, in my mind, as to configuration. Only state. >> Currently they contain the required guest configuration, a >> representation of what's the current live configuration, and they >> issue monitor commands to move the live configuration towards the >> required configuration (or just generate a qemu command line). What >> you're describing is completely different, I'm not even sure what it is. > > Management tools shouldn't have to think about how the monitor > commands they issue impact the invocation options of QEMU. They have to, when creating a guest from scratch. But I admit, this throws a new light (for me) on things. What's the implications? - must have a qemu instance running when editing configuration, even when the guest is down - cannot add additional information to configuration; must store it in an external database and cross-reference it with the qemu data using the device ID - when editing non-hotpluggable configuration for the next boot, must maintain old config somewhere, so we can issue delta commands later (might be needed for current way of doing things) - no transactions/queries/etc except on non-authoritative source - issues with shared-nothing design (well, can store the configuration file using DRBD). >> >> If you look at management tools, they believe they are the >> authoritative source of configuration information (not guest state, >> which is more or less ignored). > > It's because we've given them no other option. It's the natural way of doing it. You have a web interface that talks to a database. When you want to list all VMs that have network cards on the production subnet, you issue a database query and get a recordset. How do you do that when the authoritative source of information is spread across a cluster? > >>>> >>>> Right, but we should make it easy, not hard. >>> >>> Yeah, I fail to see how this makes it hard. We conveniently are >>> saying, hey, this is all the state that needs to be persisted. >>> We'll persist it for you if you want, otherwise, we'll expose it in >>> a central location. >> >> The state-in-a-file is just a blob. Don't expect the tool to parse >> it and reassociate the various bits to its own representation. >> Exposing it via QMP commands is a lot better though. > > I don't really see this as being a major issue. There's no such thing > as a "blob". If someone wants to manipulate the state, they will. > We need to keep compatibility to support migrating from > version-to-version. > > I agree that we want to provide QMP interfaces to work with the state > file. But I don't think we should be hostile to manual manipulation. No, not hostile. We should make QMP commands sufficient to deal with it, that's all. [1] in fact, it does change, due to tidal effects.
On 02/27/2011 03:49 PM, Anthony Liguori wrote: > On 02/27/2011 03:55 AM, Dor Laor wrote: >> What about a simpler approach were QMP events will be written to a >> event-log-file (or even named pipe). > > The management tool can just use a small daemon that does nothing other > than write QMP events to a log. There's no real need for this code to > live in QEMU. > IIUC in case the management daemon will run qemu using named pipe for qmp it will happen automatically. If you agree to this approach it will simplify the more complex config file option (although it is nice to have as independent option for single hosts managed by simpler mgmts) > Since events are posted, even if we wrote it in QEMU, the event wouldn't > be guaranteed on disk by the time the event invocation returns. > > Regards, > > Anthony Liguori >
On 02/27/2011 10:02 AM, Dor Laor wrote: > On 02/27/2011 03:49 PM, Anthony Liguori wrote: >> On 02/27/2011 03:55 AM, Dor Laor wrote: >>> What about a simpler approach were QMP events will be written to a >>> event-log-file (or even named pipe). >> >> The management tool can just use a small daemon that does nothing other >> than write QMP events to a log. There's no real need for this code to >> live in QEMU. >> > > IIUC in case the management daemon will run qemu using named pipe for > qmp it will happen automatically. No, the event model is changing (it was always intended to change though). Events will need explicit registration so it's necessary to have bidirectional communication. So you can't just do -qmp foo -chardev file:event.log,id=foo. You won't actually see most of the events. Regards, Anthony Liguori > If you agree to this approach it will simplify the more complex config > file option (although it is nice to have as independent option for > single hosts managed by simpler mgmts) > >> Since events are posted, even if we wrote it in QEMU, the event wouldn't >> be guaranteed on disk by the time the event invocation returns. >> >> Regards, >> >> Anthony Liguori >> > >
On 02/27/2011 09:31 AM, Avi Kivity wrote: > On 02/27/2011 04:00 PM, Anthony Liguori wrote: >> On 02/27/2011 03:10 AM, Avi Kivity wrote: >>> On 02/24/2011 07:58 PM, Anthony Liguori wrote: >>>>> If you move the cdrom to a different IDE channel, you have to >>>>> update the stateful non-config file. >>>>> >>>>> Whereas if you do >>>>> >>>>> $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img >>>>> $ qemu -cdrom ~/foo-media-tray.img >>>>> >>>>> the cd-rom tray state will be tracked in the image file. >>>> >>>> >>>> Yeah, but how do you move it? >>> >>> There is no need to move the file at all. Simply point the new >>> drive at the media tray. >> >> No, I was asking, how do you move the cdrom to a different IDE >> channel. Are you using QMP? Are you changing the command line >> arguments? > > Yes. > > If we're doing hot-move (not really relevant to ide-cd) then you'd use > QMP. If you're editing a virtual machine that is down, or scheduling > a change for the next reboot, then you're using command line arguments > (or cold-plugging into a stopped guest). > > Requiring management to remember the old configuration and issue delta > commands to move the device for the cold-plug case is increased > complexity IMO. > >> >>> >>>> If you do a remove/add through QMP, then the config file will >>>> reflect things just fine. >>> >>> If all access to the state file is through QMP then it becomes more >>> palatable. A bit on that later. >> >> As I think I've mentioned before, I hadn't really thought about an >> opaque state file but I'm not necessary opposed to it. I don't see >> an obvious advantage to making it opaque but I agree it should be >> accessible via QMP. > > The advantage is that we keep the management tool talking to one > interface (I don't think we should prevent users from interpreting it, > just make it unnecessary). > >>> >>> I thought that's what I'm doing by separating the state out. It's >>> easy for management to assemble configuration from their database >>> and convert it into a centralized representation (like a qemu >>> command line). It's a lot harder to disassemble a central state >>> representation and move it back to the database. >>> >>> Using QMP is better than directly accessing the state file since >>> qemu does the disassembly for you (provided the command references >>> the device using its normal path, not some random key). The file >>> just becomes a way to survive a crash, and all management needs to >>> know about is to make it available and back it up. But it means >>> that everything must be done via QMP, including assembly of the >>> machine, otherwise the state file can become stale. >>> >>> Separating the state out to the device is even easier, since >>> management is already expected to take care of disk images. All >>> that's needed is to create the media tray image once, then you can >>> forget about it completely. >> >> Except that instead of having one state file, we might have a dozen >> additional "device state" files. > > That is fine. We already have one state file per block device. > >>>> QEMU. No question about it. At any point in time, we are the >>>> authoritative source of what the guest's configuration is. There's >>>> no doubt about it. A management tool can try to keep up with us, >>>> but ultimately we are the only ones that know for sure. >>>> >>>> We have all of this information internally. Just persisting it is >>>> not a major architectural change. It's something we should have >>>> been doing (arguably) from the very beginning. >>> >>> That's a huge divergence from how management tools are written. >> >> This is one of the reasons why management tooling around QEMU needs >> quite a bit of improving. >> >> There is simply no way a management tool can do a good job of being >> an authoritative source of configuration. The races we're discussion >> is a good example of why. > > What we're discussing is not configuration. It is non-volatile > state. Configuration comes from the user; state comes from the guest > (the management tool may edit state; but the guest cannot edit the > configuration). > > I agree 100% the management tool cannot be the authoritative source of > state. > > My position is: > - the management tool should be 100% in control of configuration (how > the guest is put together from its components) > - qemu should be 100% in control of state (memory, disk state, NVRAM > in various components, cd-rom eject state, explosive bolts for payload > separation, self-destruct mechanism, etc.) There simply is not such a clean separation between the two because things that the guest does affects the configuration of the guest. Hot plug, removable media eject, persistent device settings (whether it's CMOS or EEPROM) all disrupt this model. If you really wanted to have this separation, you'd have to be very strict about making all guest settings not be specified in config. You would need to do: qemu-img create -f e1000-eprom -o macaddr=12:23:45:67:78:90 e1000.0.rom qemu-img create -f e1000-eprom -o macaddr=12:23:45:67:78:91 e1000.1.rom qemu -device e1000,id=e1000.0,eeprom=e1000.0.rom -device e1000,id=e1000.1,eeprom=e1000.1.rom And now I need a tool that lets me modify e1000-eprom images if I want to change the mac address dynamically (say I'm trying to clone a VM). This type of model can be workable but as I said earlier, I think it's overengineering the problem. We don't separate configuration from guest state today. Instead of setting ourselves up for failure by setting an unrealistic standard that we try to achieve and never do, let's embrace the system that is working for us today. We are authoritative for everything and guest state is intimately tied to the virtual machine configuration. >> >> But beyond those races, QEMU is the only entity that knows with >> certainty what bits of information are important to persist in order >> to preserve a guest across shutdown/restart. The fact that we've >> punted this problem for so long has only ensured that management >> tools are either intrinsically broken or only support the most >> minimal subset of functionality we actually support. > > I'm not arguing about that. I just want to stress again the > difference between state and configuration. Qemu has no authority, in > my mind, as to configuration. Only state. Being the one that creates a guest based on configuration, I would say that we most certainly do. >>> Currently they contain the required guest configuration, a >>> representation of what's the current live configuration, and they >>> issue monitor commands to move the live configuration towards the >>> required configuration (or just generate a qemu command line). What >>> you're describing is completely different, I'm not even sure what it >>> is. >> >> Management tools shouldn't have to think about how the monitor >> commands they issue impact the invocation options of QEMU. > > They have to, when creating a guest from scratch. > > But I admit, this throws a new light (for me) on things. What's the > implications? > - must have a qemu instance running when editing configuration, even > when the guest is down QMP is an API. Whether a qemu instance is launched is an implementation detail. This could all be hidden completely with libqmp. > - cannot add additional information to configuration; must store it in > an external database and cross-reference it with the qemu data using > the device ID Don't confuse a management tool's notion of configuration with QEMU's configuration. A management tools config is used to initially create and then manipulate an existing guest. If the management tool supports out-of-band manipulation of a configuration file, then it needs to determine how the configuration file changed and execute the appropriate commands. > - when editing non-hotpluggable configuration for the next boot, must > maintain old config somewhere, so we can issue delta commands later > (might be needed for current way of doing things) Yes, it is. libvirt kind of cheats here and just deletes the old VM and creates a new one when editing the XML IIUC. > - no transactions/queries/etc except on non-authoritative source > - issues with shared-nothing design (well, can store the configuration > file using DRBD). In both cases, today a management tool races with QEMU so both of these points are currently true. >>> If you look at management tools, they believe they are the >>> authoritative source of configuration information (not guest state, >>> which is more or less ignored). >> >> It's because we've given them no other option. > > It's the natural way of doing it. You have a web interface that talks > to a database. When you want to list all VMs that have network cards > on the production subnet, you issue a database query and get a > recordset. How do you do that when the authoritative source of > information is spread across a cluster? This problem still exists today. A guest can eject a network card on it's own (without the management tool issuing a device_del command). QEMU will delete the NIC when this happens. The same is true with CDROM eject. Management tools are simply not authoritative today. Regards, Anthony Liguori
On 02/27/2011 07:41 PM, Anthony Liguori wrote: >> >> I agree 100% the management tool cannot be the authoritative source >> of state. >> >> My position is: >> - the management tool should be 100% in control of configuration (how >> the guest is put together from its components) >> - qemu should be 100% in control of state (memory, disk state, NVRAM >> in various components, cd-rom eject state, explosive bolts for >> payload separation, self-destruct mechanism, etc.) > > > There simply is not such a clean separation between the two because > things that the guest does affects the configuration of the guest. > > Hot plug, I don't think hotunplug works this way. When the guest ejects the pci or usb device, it simply stops working with the device and disconnects the power. There is nothing non-volatile going on, no spring-loaded lever that pushes the device out. If the server reboots immediately after hotunplug, but before the user physically removes the device, then the server will see the device when it boots up. > removable media eject, Here, we do have a single bit of non-volatile storage. > persistent device settings (whether it's CMOS or EEPROM) all disrupt > this model. These are just arrays of bits, most of them with no standard interpretation. So a block device fits them perfectly. > > If you really wanted to have this separation, you'd have to be very > strict about making all guest settings not be specified in config. > You would need to do: > > qemu-img create -f e1000-eprom -o macaddr=12:23:45:67:78:90 e1000.0.rom > qemu-img create -f e1000-eprom -o macaddr=12:23:45:67:78:91 e1000.1.rom > > qemu -device e1000,id=e1000.0,eeprom=e1000.0.rom -device > e1000,id=e1000.1,eeprom=e1000.1.rom > > And now I need a tool that lets me modify e1000-eprom images if I want > to change the mac address dynamically (say I'm trying to clone a VM). > > This type of model can be workable but as I said earlier, I think it's > overengineering the problem. In fact I don't think anyone wants this. Usually management wants the assigned MAC to be used without the guest playing games with it. So it's more or less pointless however it's implemented. > > We don't separate configuration from guest state today. Instead of > setting ourselves up for failure by setting an unrealistic standard > that we try to achieve and never do, let's embrace the system that is > working for us today. We are authoritative for everything and guest > state is intimately tied to the virtual machine configuration. "we are authoritative for everything" is a clean break from everything that's being done today. It's also a clean break from the model of central management plus database. We can't force it on people. Non-volatile state is not intimately tied to configuration. We store block device state completely outside the configuration. What's left is the CD-ROM tray, CMOS memory, and network card EEPROM. We could argue back and forth about where exactly they belong, but they aren't really worth the conversation since they are meaningless for real-life use. > >>> >>> But beyond those races, QEMU is the only entity that knows with >>> certainty what bits of information are important to persist in order >>> to preserve a guest across shutdown/restart. The fact that we've >>> punted this problem for so long has only ensured that management >>> tools are either intrinsically broken or only support the most >>> minimal subset of functionality we actually support. >> >> I'm not arguing about that. I just want to stress again the >> difference between state and configuration. Qemu has no authority, >> in my mind, as to configuration. Only state. > > Being the one that creates a guest based on configuration, I would say > that we most certainly do. That is not what being authoritative means. In a virt-manager deployment, libvirt is the authoritative source of guest configuration. In a RHEV-M deployment, the RHEV-M database is the authoritative source of guest configuration. You can completely replace the host machine and your guest will recreate just fine as long as the authoritative source is intact. > >>>> Currently they contain the required guest configuration, a >>>> representation of what's the current live configuration, and they >>>> issue monitor commands to move the live configuration towards the >>>> required configuration (or just generate a qemu command line). >>>> What you're describing is completely different, I'm not even sure >>>> what it is. >>> >>> Management tools shouldn't have to think about how the monitor >>> commands they issue impact the invocation options of QEMU. >> >> They have to, when creating a guest from scratch. >> >> But I admit, this throws a new light (for me) on things. What's the >> implications? >> - must have a qemu instance running when editing configuration, even >> when the guest is down > > QMP is an API. Whether a qemu instance is launched is an > implementation detail. This could all be hidden completely with libqmp. QMP is first and foremost a protocol. > >> - cannot add additional information to configuration; must store it >> in an external database and cross-reference it with the qemu data >> using the device ID > > Don't confuse a management tool's notion of configuration with QEMU's > configuration. > > A management tools config is used to initially create and then > manipulate an existing guest. If the management tool supports > out-of-band manipulation of a configuration file, then it needs to > determine how the configuration file changed and execute the > appropriate commands. I wasn't talking about that. I was talking about data that is meaningful to a user but not meaningful to qemu. That sort of data doesn't store well if qemu is the authoritative source. > Yes, it is. libvirt kind of cheats here and just deletes the old VM > and creates a new one when editing the XML IIUC. > >> - no transactions/queries/etc except on non-authoritative source >> - issues with shared-nothing design (well, can store the >> configuration file using DRBD). > > In both cases, today a management tool races with QEMU so both of > these points are currently true. No, it doesn't. If the guest ejects a network card, the network card is still there. Queries against the database still return correct results. > >>>> If you look at management tools, they believe they are the >>>> authoritative source of configuration information (not guest state, >>>> which is more or less ignored). >>> >>> It's because we've given them no other option. >> >> It's the natural way of doing it. You have a web interface that >> talks to a database. When you want to list all VMs that have network >> cards on the production subnet, you issue a database query and get a >> recordset. How do you do that when the authoritative source of >> information is spread across a cluster? > > This problem still exists today. A guest can eject a network card on > it's own (without the management tool issuing a device_del command). > QEMU will delete the NIC when this happens. I think that's a bug. > The same is true with CDROM eject. CDROM tray position is state, not configuration. > > Management tools are simply not authoritative today. > > Regards, > > Anthony Liguori
On 02/27/2011 07:25 PM, Anthony Liguori wrote: > On 02/27/2011 10:02 AM, Dor Laor wrote: >> On 02/27/2011 03:49 PM, Anthony Liguori wrote: >>> On 02/27/2011 03:55 AM, Dor Laor wrote: >>>> What about a simpler approach were QMP events will be written to a >>>> event-log-file (or even named pipe). >>> >>> The management tool can just use a small daemon that does nothing other >>> than write QMP events to a log. There's no real need for this code to >>> live in QEMU. >>> >> >> IIUC in case the management daemon will run qemu using named pipe for >> qmp it will happen automatically. > > No, the event model is changing (it was always intended to change > though). Events will need explicit registration so it's necessary to > have bidirectional communication. But the mgmt->qemu direction is only about event registration, it's not valuable info. It's a simpler approach than relaying on another config file and it is safer than having a separate mgmt daemon to log these events. Cross posting to libvirt-list to hear their view. > > So you can't just do -qmp foo -chardev file:event.log,id=foo. You won't > actually see most of the events. > > Regards, > > Anthony Liguori > >> If you agree to this approach it will simplify the more complex config >> file option (although it is nice to have as independent option for >> single hosts managed by simpler mgmts) >> >>> Since events are posted, even if we wrote it in QEMU, the event wouldn't >>> be guaranteed on disk by the time the event invocation returns. >>> >>> Regards, >>> >>> Anthony Liguori >>> >> >> > >
On 02/28/2011 02:38 AM, Avi Kivity wrote: >> >> We don't separate configuration from guest state today. Instead of >> setting ourselves up for failure by setting an unrealistic standard >> that we try to achieve and never do, let's embrace the system that is >> working for us today. We are authoritative for everything and guest >> state is intimately tied to the virtual machine configuration. > > "we are authoritative for everything" is a clean break from everything > that's being done today. It's also a clean break from the model of > central management plus database. We can't force it on people. No, it isn't. This has been the way QEMU has always been. QEMU has always and will always continue to be useful in the absence of a management layer. That means that it will mix modifications to configuration with guest actions. To avoid races, a management tool needs to either poll QEMU or listen for events to know when QEMU initiates a configuration change. This is how tools are written today. The only thing being discussed is how to handle a very small and rare race condition whereas QEMU may send an notification to a crashed management tool *and* QEMU crashes before the management tool restarts. The only two options to solve this problem are synchronous notifications or a QEMU global state file. Since the former is a radical change to our existing API, the later is a much more reasonable option. If a management tool doesn't care about this race, it can simply not use the global state file. Regards, Anthony Liguori
On 02/28/2011 02:45 PM, Anthony Liguori wrote: > On 02/28/2011 02:38 AM, Avi Kivity wrote: >>> >>> We don't separate configuration from guest state today. Instead of >>> setting ourselves up for failure by setting an unrealistic standard >>> that we try to achieve and never do, let's embrace the system that >>> is working for us today. We are authoritative for everything and >>> guest state is intimately tied to the virtual machine configuration. >> >> "we are authoritative for everything" is a clean break from >> everything that's being done today. It's also a clean break from the >> model of central management plus database. We can't force it on people. > > No, it isn't. This has been the way QEMU has always been. > > QEMU has always and will always continue to be useful in the absence > of a management layer. That means that it will mix modifications to > configuration with guest actions. > > To avoid races, a management tool needs to either poll QEMU or listen > for events to know when QEMU initiates a configuration change. This > is how tools are written today. > > The only thing being discussed is how to handle a very small and rare > race condition whereas QEMU may send an notification to a crashed > management tool *and* QEMU crashes before the management tool restarts. > > The only two options to solve this problem are synchronous > notifications or a QEMU global state file. Since the former is a > radical change to our existing API, the later is a much more > reasonable option. > > If a management tool doesn't care about this race, it can simply not > use the global state file. You're just ignoring what I've written.
On Feb 28, 2011 7:21 AM, "Avi Kivity" <avi@redhat.com> wrote: > > On 02/28/2011 02:45 PM, Anthony Liguori wrote: >> >> On 02/28/2011 02:38 AM, Avi Kivity wrote: >>>> >>>> >>>> We don't separate configuration from guest state today. Instead of setting ourselves up for failure by setting an unrealistic standard that we try to achieve and never do, let's embrace the system that is working for us today. We are authoritative for everything and guest state is intimately tied to the virtual machine configuration. >>> >>> >>> "we are authoritative for everything" is a clean break from everything that's being done today. It's also a clean break from the model of central management plus database. We can't force it on people. >> >> >> No, it isn't. This has been the way QEMU has always been. >> >> QEMU has always and will always continue to be useful in the absence of a management layer. That means that it will mix modifications to configuration with guest actions. >> >> To avoid races, a management tool needs to either poll QEMU or listen for events to know when QEMU initiates a configuration change. This is how tools are written today. >> >> The only thing being discussed is how to handle a very small and rare race condition whereas QEMU may send an notification to a crashed management tool *and* QEMU crashes before the management tool restarts. >> >> The only two options to solve this problem are synchronous notifications or a QEMU global state file. Since the former is a radical change to our existing API, the later is a much more reasonable option. >> >> If a management tool doesn't care about this race, it can simply not use the global state file. > > > You're just ignoring what I've written. No, you're just impervious to my subtle attempt to refocus the discussion on solving a practical problem. There's a lot of good, reasonably straight forward changes we can make that have a high return on investment. The only suggestion I'm making beyond Marcelo's original patch is that we use a structured format and that we make it possible to use the same file to solve this problem in multiple places. I don't think this creates a fundamental break in how management tools interact with QEMU. I don't think introducing RAID support in the block layer is a reasonable alternative. Regards, Anthony Liguori > > > -- > error compiling committee.c: too many arguments to function >
On 02/28/2011 07:33 PM, Anthony Liguori wrote: > > > > > You're just ignoring what I've written. > > No, you're just impervious to my subtle attempt to refocus the > discussion on solving a practical problem. > > There's a lot of good, reasonably straight forward changes we can make > that have a high return on investment. > Is making qemu the authoritative source of configuration information a straightforward change? Is the return on it high? Is the investment low? "No" to all three (ignoring for the moment whether it is good or not, which we were debating). > The only suggestion I'm making beyond Marcelo's original patch is that > we use a structured format and that we make it possible to use the > same file to solve this problem in multiple places. > No, you're suggesting a lot more than that. > I don't think this creates a fundamental break in how management tools > interact with QEMU. I don't think introducing RAID support in the > block layer is a reasonable alternative. > > Why not? Something that avoids the whole state thing altogether: - instead of atomically switching when live copy is done, keep on issuing writes to both the origin and the live copy - issue a notification to management - management receives the notification, and issues an atomic blockdev switch command this is really the RAID-1 solution but without the state file (credit Dor). An advantage is that there is no additional latency when trying to catch up to the dirty bitmap.
On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com> wrote: > > On 02/28/2011 07:33 PM, Anthony Liguori wrote: >> >> >> > >> > You're just ignoring what I've written. >> >> No, you're just impervious to my subtle attempt to refocus the discussion on solving a practical problem. >> >> There's a lot of good, reasonably straight forward changes we can make that have a high return on investment. >> > > Is making qemu the authoritative source of configuration information a straightforward change? Is the return on it high? Is the investment low? I think this is where we fundamentally disagree. My position is that QEMU is already the authoritative source. Having a state file doesn't change anything. Do a hot unplug of a network device with upstream libvirt with acpiphp unloaded, consult libvirt and then consult the monitor to see who has the right view of the guests config. To me, that's the definition of authoritative. > "No" to all three (ignoring for the moment whether it is good or not, which we were debating). > > >> The only suggestion I'm making beyond Marcelo's original patch is that we use a structured format and that we make it possible to use the same file to solve this problem in multiple places. >> > > No, you're suggesting a lot more than that. That's exactly what I'm suggesting from a technical perspective. >> I don't think this creates a fundamental break in how management tools interact with QEMU. I don't think introducing RAID support in the block layer is a reasonable alternative. >> >> > > Why not? Because its a lot of complexity and code that can go wrong while only solving the race for one specific case. Not to mention that we double the iop rate. > Something that avoids the whole state thing altogether: > > - instead of atomically switching when live copy is done, keep on issuing writes to both the origin and the live copy > - issue a notification to management > - management receives the notification, and issues an atomic blockdev switch command > this is really the RAID-1 solution but without the state file (credit Dor). An advantage is that there is no additional latency when trying to catch up to the dirty bitmap. It still suffers from the two generals problem. You cannot solve this without making one node reliable and that takes us back to it being either QEMU (posted event and state file) or the management tool (sync event). Regards, Anthony Liguori > > -- > error compiling committee.c: too many arguments to function >
On Mon, Feb 28, 2011 at 07:47:13PM +0200, Avi Kivity wrote: > On 02/28/2011 07:33 PM, Anthony Liguori wrote: > > > >> > >> You're just ignoring what I've written. > > > >No, you're just impervious to my subtle attempt to refocus the > >discussion on solving a practical problem. > > > >There's a lot of good, reasonably straight forward changes we can > >make that have a high return on investment. > > > > Is making qemu the authoritative source of configuration information > a straightforward change? Is the return on it high? Is the > investment low? > > "No" to all three (ignoring for the moment whether it is good or > not, which we were debating). > > >The only suggestion I'm making beyond Marcelo's original patch is > >that we use a structured format and that we make it possible to > >use the same file to solve this problem in multiple places. > > > > No, you're suggesting a lot more than that. > > >I don't think this creates a fundamental break in how management > >tools interact with QEMU. I don't think introducing RAID support > >in the block layer is a reasonable alternative. > > > > > > Why not? > > Something that avoids the whole state thing altogether: > > - instead of atomically switching when live copy is done, keep on > issuing writes to both the origin and the live copy > - issue a notification to management > - management receives the notification, and issues an atomic > blockdev switch command How do you know if QEMU performed the switch or not? That is, how can the switch command be atomic? > this is really the RAID-1 solution but without the state file > (credit Dor). An advantage is that there is no additional latency > when trying to catch up to the dirty bitmap. So you're implementing a virtual block driver not visible to the user as an image format. The image format would allow storage of persistent copy progress, etc. so you lose all that. All of that to avoid introducing a commit file which is not part of global qemu state.
On Sat, Feb 26, 2011 at 07:45:44AM -0600, Anthony Liguori wrote: > >>>+- "commit_filename": target commit filename (json-string, optional) > >>I think we should drop this. > >Why? Sorry but this can't wait for non-config persistent storage. This > >mistake was made in the past with irqchip for example, lets not repeat > >it. > > > >Its OK to deprecate "commit_filename" in favour of its location in > >non-config persistent storage. > > > >Its not the end of the world for a mgmt app to handle change (not saying > >its not a good principle) such as this. > > Even as a one off, it's not a very good solution to the problem. > We'd be way better of just having nothing here than using the commit > file. What are the semantics of a half written file? How does a > management tool detect a half written file? If the commit file contains the full commit message, it can be considered valid. Otherwise, it should be considered invalid. Stopping the guest and waiting for mgmt to issue a continue command is a solution, but it has drawbacks introduced by reliance on mgmt app (what if mgmt app crashes, latency, etc). But it seems that it is preferred over a commit file. Addressing the other comments in the meantime, thanks for input.
On Sat, Feb 26, 2011 at 07:45:44AM -0600, Anthony Liguori wrote: > >>>+- "filename": target image filename (json-string) > >>Is this a created image? Is this an image to create? > >A previously created image. > > > >>To future proof for blockdev, we should make this argument optional > >>and if it's not specified throw an error about missing argument. > >>This let's us introduce an optional blockdev argument such that we > >>can use a blockdev name. > >What you mean "blockdev"? > > -drive file=image.qcow2,if=none,id=foo > > 'foo' is a named blockdev. We don't have a way to add these through > the monitor (yet) but we will for 0.15. Fail to see the point of having a named blockdev to specify the target? > >>If I resume the > >>operation with the incremental flag set, will it Just Work? > >As in: > > > >- block_copy ide0-hd1 /mnt/aa.img > >- block_copy ide0-hd1 /mnt/aa.img -i > > > >? > > > >The second command will fail with QERR_BLOCKCOPY_IN_PROGRESS. > > No, as in: > > block_copy ide0-hd1 /mnt/aa.img > block_copy_cancel ide0-h1 > block_copy ide0-h1 /mnt/aa.img -i > > Does it pick up where it left off or does it start all over again? It starts all over again.
On 02/28/2011 08:12 PM, Anthony Liguori wrote: > > On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com > <mailto:avi@redhat.com>> wrote: > > > > On 02/28/2011 07:33 PM, Anthony Liguori wrote: > >> > >> > >> > > >> > You're just ignoring what I've written. > >> > >> No, you're just impervious to my subtle attempt to refocus the > discussion on solving a practical problem. > >> > >> There's a lot of good, reasonably straight forward changes we can > make that have a high return on investment. > >> > > > > Is making qemu the authoritative source of configuration information > a straightforward change? Is the return on it high? Is the investment low? > > I think this is where we fundamentally disagree. My position is that > QEMU is already the authoritative source. Having a state file doesn't > change anything. > > Do a hot unplug of a network device with upstream libvirt with acpiphp > unloaded, consult libvirt and then consult the monitor to see who has > the right view of the guests config. > > To me, that's the definition of authoritative. > > > "No" to all three (ignoring for the moment whether it is good or not, > which we were debating). > > > > > >> The only suggestion I'm making beyond Marcelo's original patch is > that we use a structured format and that we make it possible to use the > same file to solve this problem in multiple places. > >> > > > > No, you're suggesting a lot more than that. > > That's exactly what I'm suggesting from a technical perspective. > > >> I don't think this creates a fundamental break in how management > tools interact with QEMU. I don't think introducing RAID support in the > block layer is a reasonable alternative. > >> > >> > > > > Why not? > > Because its a lot of complexity and code that can go wrong while only > solving the race for one specific case. Not to mention that we double > the iop rate. > > > Something that avoids the whole state thing altogether: > > > > - instead of atomically switching when live copy is done, keep on > issuing writes to both the origin and the live copy > > - issue a notification to management > > - management receives the notification, and issues an atomic blockdev > switch command > > > this is really the RAID-1 solution but without the state file (credit > Dor). An advantage is that there is no additional latency when trying > to catch up to the dirty bitmap. > > It still suffers from the two generals problem. You cannot solve this > without making one node reliable and that takes us back to it being > either QEMU (posted event and state file) or the management tool (sync > event). It is safe w/o a state file by changing the basic live copy algorithm: 1. Live copy in progress stage Once live copy command is issued, a dirty bit map is created for tracking. There is a single pass over the entire image where we copy blocks from the src to the dst. Write commands for blocks that were already copied will be done twice for the src and dst. Once the full copy single pass ends, we trigger a QMP event that this stage can end. The live copy stage keeps running till the management issue a switch command. When it will happen, the switch is immediate and no need to copy additional blocks (but flush pending IOs). 2. Management sends a switch command. Qemu stops the doubling the IO and switches to the destination. End. Now let's see the error case: - qemu failure over stage #1 No matter what happens, the management will start qemu with the source image. The destination will be erased, no matter how much we copied. - management failure over stage #1 The new mgmt daemon needs to query qemu's status. Management can continue as before. - qemu+mgmt failure at stage #1 The management should just run qemu with the source image. - mgmt failure post sending stage #2 command. The mgmt DB states that we switched, just need to connect to qemu. - qemu failure before/after getting the stage #2 event. Management will need just to execute new qemu with the dst image - Failure of both qemu & mgmt in stage #2 The same as above. Pros: - Fast switch over time, minimal latency - No external storage/config needed - No need to wait for mgmt Thanks, Dor > > Regards, > > Anthony Liguori > > > > > -- > > error compiling committee.c: too many arguments to function > > >
On 02/28/2011 08:12 PM, Anthony Liguori wrote: > > > On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com > <mailto:avi@redhat.com>> wrote: > > > > On 02/28/2011 07:33 PM, Anthony Liguori wrote: > >> > >> > >> > > >> > You're just ignoring what I've written. > >> > >> No, you're just impervious to my subtle attempt to refocus the > discussion on solving a practical problem. > >> > >> There's a lot of good, reasonably straight forward changes we can > make that have a high return on investment. > >> > > > > Is making qemu the authoritative source of configuration information > a straightforward change? Is the return on it high? Is the > investment low? > > I think this is where we fundamentally disagree. My position is that > QEMU is already the authoritative source. Having a state file doesn't > change anything. > > Do a hot unplug of a network device with upstream libvirt with acpiphp > unloaded, consult libvirt and then consult the monitor to see who has > the right view of the guests config. > libvirt is right and the monitor is wrong. On real hardware, calling _EJ0 doesn't affect the configuration one little bit (if I understand it correctly). It just turns off power to the slot. If you power-cycle, the card will be there. In the real world, the authoritative source of configuration is a human with a screwdriver. The virtualized equivalent is the management tool. > To me, that's the definition of authoritative. > > > "No" to all three (ignoring for the moment whether it is good or > not, which we were debating). > > > > > >> The only suggestion I'm making beyond Marcelo's original patch is > that we use a structured format and that we make it possible to use > the same file to solve this problem in multiple places. > >> > > > > No, you're suggesting a lot more than that. > > That's exactly what I'm suggesting from a technical perspective. > Unless I'm hallucinating, you're suggesting quite a bit more. A revolution in how qemu is to be managed. > >> I don't think this creates a fundamental break in how management > tools interact with QEMU. I don't think introducing RAID support in > the block layer is a reasonable alternative. > >> > >> > > > > Why not? > > Because its a lot of complexity and code that can go wrong while only > solving the race for one specific case. Not to mention that we double > the iop rate. > IMO it's of similar complexity. The number of I/Os don't change (reads stay the same, and any write that has already been mirrored needs to be re-mirrored in both cases. We do gain lower latency switchover and we package the code as a block format driver instead of core block code. We decouple the dependencies from live migration. > > Something that avoids the whole state thing altogether: > > > > - instead of atomically switching when live copy is done, keep on > issuing writes to both the origin and the live copy > > - issue a notification to management > > - management receives the notification, and issues an atomic > blockdev switch command > > > this is really the RAID-1 solution but without the state file > (credit Dor). An advantage is that there is no additional latency > when trying to catch up to the dirty bitmap. > > It still suffers from the two generals problem. You cannot solve this > without making one node reliable and that takes us back to it being > either QEMU (posted event and state file) or the management tool (sync > event). > > It works without either. If qemu fails, you simply re-mirror everything. If the management tool fails, it re-subscribes to the mirror-complete event, queries whether it already happened in its absence, and if it did, requests the switchover.
On 02/28/2011 08:56 PM, Marcelo Tosatti wrote: > > > > Something that avoids the whole state thing altogether: > > > > - instead of atomically switching when live copy is done, keep on > > issuing writes to both the origin and the live copy > > - issue a notification to management > > - management receives the notification, and issues an atomic > > blockdev switch command > > How do you know if QEMU performed the switch or not? That is, > how can the switch command be atomic? If you fail while qemu processes the command, you query it to see what it's current state. You can simply re-issue the command; it's idempotent. If qemu fails before you issue the switch you discard the new copy. If it fails after the switch (whether it acked or not) you discard the original. > > this is really the RAID-1 solution but without the state file > > (credit Dor). An advantage is that there is no additional latency > > when trying to catch up to the dirty bitmap. > > So you're implementing a virtual block driver not visible to the user as > an image format. The image format would allow storage of persistent copy > progress, etc. so you lose all that. > > All of that to avoid introducing a commit file which is not part of > global qemu state. I think it has other advantages as well - code isolation, live migration compatibility, reduced switchover times. Really the bad thing about the commit file is its un-documented-ness and ad-hoc nature which means the management tool is unlikely to get things right. Qemu maintaining state is fine. It already does - in block devices, and should do more. We just have to take the right approach to do it (and be careful not to make it manage configuration).
On 03/01/2011 04:39 AM, Avi Kivity wrote: > On 02/28/2011 08:12 PM, Anthony Liguori wrote: >> >> >> On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com >> <mailto:avi@redhat.com>> wrote: >> > >> > On 02/28/2011 07:33 PM, Anthony Liguori wrote: >> >> >> >> >> >> > >> >> > You're just ignoring what I've written. >> >> >> >> No, you're just impervious to my subtle attempt to refocus the >> discussion on solving a practical problem. >> >> >> >> There's a lot of good, reasonably straight forward changes we can >> make that have a high return on investment. >> >> >> > >> > Is making qemu the authoritative source of configuration >> information a straightforward change? Is the return on it high? Is >> the investment low? >> >> I think this is where we fundamentally disagree. My position is that >> QEMU is already the authoritative source. Having a state file >> doesn't change anything. >> >> Do a hot unplug of a network device with upstream libvirt with >> acpiphp unloaded, consult libvirt and then consult the monitor to see >> who has the right view of the guests config. >> > > libvirt is right and the monitor is wrong. > > On real hardware, calling _EJ0 doesn't affect the configuration one > little bit (if I understand it correctly). It just turns off power to > the slot. If you power-cycle, the card will be there. It's up to the hardware vendor. Since it's ACPI, it can result in any number of operations. Usually, there's some logic to flip on an LED or something. There's nothing that prevents a vendor from ejecting the card. My point is that there aren't cleanly separated lines in the real world. >> To me, that's the definition of authoritative. >> >> > "No" to all three (ignoring for the moment whether it is good or >> not, which we were debating). >> > >> > >> >> The only suggestion I'm making beyond Marcelo's original patch is >> that we use a structured format and that we make it possible to use >> the same file to solve this problem in multiple places. >> >> >> > >> > No, you're suggesting a lot more than that. >> >> That's exactly what I'm suggesting from a technical perspective. >> > > Unless I'm hallucinating, you're suggesting quite a bit more. A > revolution in how qemu is to be managed. Let me take another route to see if I can't persuade you. First, let's clarify your proposal. You want to introduce a new block format that references to block devices. It may also store a dirty bitmap to keep track of which blocks are out of sync. Hopefully, it goes without saying that the dirty bitmap is strictly optional (it's a performance optimization) so let's ignore it. Your format, as a text file, looks like: [raid1] primary=diska.img secondary=diskb.img active=primary To use it, here's the sequence: 0) qemu uses disk A for a block device 1) create a raid1 block device pointing to disk A and disk B. 2) management tool asks qemu to us the new raid1 block device. 3) qemu acks (2) 4) at some point, the mirror completes, writes are going to both disks 5) qemu sends out an event indicating that the disks are in sync 6) management tool then sends a command to fail over to disk B 7) qemu acks (6) We're making the management tool the "authoritative" source of how to launch QEMU. That means that the management tool ultimately determines which command line to relaunch QEMU with. Here are the races: A) If QEMU crashes between (2) and (3), it may have issues a write to the new raid1 block device before the management tool sees (3). If this happens, when the management tool restarts QEMU with disk A, we're left with a dangling raid1 block device. Not a critical failure, but not ideal. B) If QEMU crashes between (6) and (7), QEMU may have started writing to disk B before the management tool sees (7). This means that the management tool will create the guest with the raid1 block device which no longer is the correct disk. This could fail in subtly bad ways. Depending on how read is implemented (if you try to do striping for instance), bad data could be returned. You could try to implement a policy of always reading from B if the block has been copied but this gets harry really quickly. It's definitely not RAID1 anymore. You may observe that the problem is not the RAID1 mechanism, but changing from using a normal device and the RAID1 mechanism. It would then be wise to say, let's always use this image format. Since that eliminates the race, we don't really need the copy bitmap anymore. Now we're left with a simple format that just refers to two filenames. However, block devices are more than just a filename. It needs a format, cache settings, etc. So let's put this all in the RAID1 block format. We also need a way to indicate which block device is selected. Let's make it a text file for purposes of discussion. It will look something like: [primary] filename=diska.img cache=none format=raw [secondary] filename=diskb.img cache=writethrough format=qcow2 [global] active=primary Since we might want to mirror multiple drives at once, we should probablyn support having multiple drives configured which means we need to not just have a single active entry, but an entry associated with a particular device. [drive "diskA"] filename=diska.img cache=none format=raw [drive "diskB"] filename=diskb.img cache=writethrough format=qcow2 [device "vda"] drive=diskB And this is exactly what I'm proposing. It's really the natural generalization of what you're proposing. So basically, the only differences are: 1) always use the new RAID1 format 2) drop the progress bitmap 3) support multiple devices per file 4) let drive properties be specified beyond filename All reasonable things to do. Regards, Anthony Liguori
On 03/01/2011 05:51 PM, Anthony Liguori wrote: > On 03/01/2011 04:39 AM, Avi Kivity wrote: >> On 02/28/2011 08:12 PM, Anthony Liguori wrote: >>> >>> >>> On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com >>> <mailto:avi@redhat.com>> wrote: >>> > >>> > On 02/28/2011 07:33 PM, Anthony Liguori wrote: >>> >> >>> >> >>> >> > >>> >> > You're just ignoring what I've written. >>> >> >>> >> No, you're just impervious to my subtle attempt to refocus the >>> discussion on solving a practical problem. >>> >> >>> >> There's a lot of good, reasonably straight forward changes we can >>> make that have a high return on investment. >>> >> >>> > >>> > Is making qemu the authoritative source of configuration >>> information a straightforward change? Is the return on it high? Is >>> the investment low? >>> >>> I think this is where we fundamentally disagree. My position is that >>> QEMU is already the authoritative source. Having a state file doesn't >>> change anything. >>> >>> Do a hot unplug of a network device with upstream libvirt with >>> acpiphp unloaded, consult libvirt and then consult the monitor to see >>> who has the right view of the guests config. >>> >> >> libvirt is right and the monitor is wrong. >> >> On real hardware, calling _EJ0 doesn't affect the configuration one >> little bit (if I understand it correctly). It just turns off power to >> the slot. If you power-cycle, the card will be there. > > It's up to the hardware vendor. Since it's ACPI, it can result in any > number of operations. Usually, there's some logic to flip on an LED or > something. > > There's nothing that prevents a vendor from ejecting the card. My point > is that there aren't cleanly separated lines in the real world. > >>> To me, that's the definition of authoritative. >>> >>> > "No" to all three (ignoring for the moment whether it is good or >>> not, which we were debating). >>> > >>> > >>> >> The only suggestion I'm making beyond Marcelo's original patch is >>> that we use a structured format and that we make it possible to use >>> the same file to solve this problem in multiple places. >>> >> >>> > >>> > No, you're suggesting a lot more than that. >>> >>> That's exactly what I'm suggesting from a technical perspective. >>> >> >> Unless I'm hallucinating, you're suggesting quite a bit more. A >> revolution in how qemu is to be managed. > > Let me take another route to see if I can't persuade you. > > First, let's clarify your proposal. You want to introduce a new block > format No. That was Avi's initial proposal, after we talked we realized that it is not needed and we can use plain files w/o any new configuration. Pretty much similar to what you're proposing below, just w/o the configuration files. > that references to block devices. It may also store a dirty bitmap to keep > track of which blocks are out of sync. Hopefully, it goes without saying > that the dirty bitmap is strictly optional (it's a performance > optimization) so > let's ignore it. > > Your format, as a text file, looks like: > > [raid1] > primary=diska.img > secondary=diskb.img > active=primary > > To use it, here's the sequence: > > 0) qemu uses disk A for a block device > > 1) create a raid1 block device pointing to disk A and disk B. > > 2) management tool asks qemu to us the new raid1 block device. > > 3) qemu acks (2) > > 4) at some point, the mirror completes, writes are going to both disks > > 5) qemu sends out an event indicating that the disks are in sync > > 6) management tool then sends a command to fail over to disk B > > 7) qemu acks (6) 7) is not a must when there is no raid. > > We're making the management tool the "authoritative" source of how to > launch > QEMU. That means that the management tool ultimately determines which > command > line to relaunch QEMU with. This is what we have today regardless of live copy. How else would you track many hot plug/unplug operations and live migration afterwards? For enterprise usage, that's the best case. It's also true for a single host w/ libvirt and virt-manager. > > Here are the races: > > A) If QEMU crashes between (2) and (3), it may have issues a write to > the new > raid1 block device before the management tool sees (3). If this happens, > when the management tool restarts QEMU with disk A, we're left with a > dangling raid1 block device. Not a critical failure, but not ideal. Once there is no raid there is no race. > > B) If QEMU crashes between (6) and (7), QEMU may have started writing to > disk > B before the management tool sees (7). This means that the management tool > will create the guest with the raid1 block device which no longer is the > correct disk. This could fail in subtly bad ways. Depending on how read > is implemented (if you try to do striping for instance), bad data could be > returned. You could try to implement a policy of always reading from B if > the block has been copied but this gets harry really quickly. It's > definitely not RAID1 anymore. Exactly! Drop the raid and always read from B post #6. This is what I was suggesting before. > > You may observe that the problem is not the RAID1 mechanism, but > changing from > using a normal device and the RAID1 mechanism. It would then be wise to > say, > let's always use this image format. Since that eliminates the race, we > don't > really need the copy bitmap anymore. > > Now we're left with a simple format that just refers to two filenames. Ok, looks good. A management app won't need the files below since it manages everything by its own. > However, > block devices are more than just a filename. It needs a format, cache > settings, etc. So let's put this all in the RAID1 block format. We also > need > a way to indicate which block device is selected. > > Let's make it a text file for purposes of discussion. It will look > something > like: > > [primary] > filename=diska.img > cache=none > format=raw > > [secondary] > filename=diskb.img > cache=writethrough > format=qcow2 > > [global] > active=primary > > Since we might want to mirror multiple drives at once, we should probablyn > support having multiple drives configured which means we need to not > just have > a single active entry, but an entry associated with a particular device. > > [drive "diskA"] > filename=diska.img > cache=none > format=raw > > [drive "diskB"] > filename=diskb.img > cache=writethrough > format=qcow2 > > [device "vda"] > drive=diskB > > And this is exactly what I'm proposing. It's really the natural > generalization > of what you're proposing. > > So basically, the only differences are: > > 1) always use the new RAID1 format > 2) drop the progress bitmap > 3) support multiple devices per file > 4) let drive properties be specified beyond filename > > All reasonable things to do. > > Regards, > > Anthony Liguori >
On 03/01/2011 03:59 AM, Dor Laor wrote: > On 02/28/2011 08:12 PM, Anthony Liguori wrote: >> >> On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com >> <mailto:avi@redhat.com>> wrote: >> > >> > On 02/28/2011 07:33 PM, Anthony Liguori wrote: >> >> >> >> >> >> > >> >> > You're just ignoring what I've written. >> >> >> >> No, you're just impervious to my subtle attempt to refocus the >> discussion on solving a practical problem. >> >> >> >> There's a lot of good, reasonably straight forward changes we can >> make that have a high return on investment. >> >> >> > >> > Is making qemu the authoritative source of configuration information >> a straightforward change? Is the return on it high? Is the >> investment low? >> >> I think this is where we fundamentally disagree. My position is that >> QEMU is already the authoritative source. Having a state file doesn't >> change anything. >> >> Do a hot unplug of a network device with upstream libvirt with acpiphp >> unloaded, consult libvirt and then consult the monitor to see who has >> the right view of the guests config. >> >> To me, that's the definition of authoritative. >> >> > "No" to all three (ignoring for the moment whether it is good or not, >> which we were debating). >> > >> > >> >> The only suggestion I'm making beyond Marcelo's original patch is >> that we use a structured format and that we make it possible to use the >> same file to solve this problem in multiple places. >> >> >> > >> > No, you're suggesting a lot more than that. >> >> That's exactly what I'm suggesting from a technical perspective. >> >> >> I don't think this creates a fundamental break in how management >> tools interact with QEMU. I don't think introducing RAID support in the >> block layer is a reasonable alternative. >> >> >> >> >> > >> > Why not? >> >> Because its a lot of complexity and code that can go wrong while only >> solving the race for one specific case. Not to mention that we double >> the iop rate. >> >> > Something that avoids the whole state thing altogether: >> > >> > - instead of atomically switching when live copy is done, keep on >> issuing writes to both the origin and the live copy >> > - issue a notification to management >> > - management receives the notification, and issues an atomic blockdev >> switch command >> >> > this is really the RAID-1 solution but without the state file (credit >> Dor). An advantage is that there is no additional latency when trying >> to catch up to the dirty bitmap. >> >> It still suffers from the two generals problem. You cannot solve this >> without making one node reliable and that takes us back to it being >> either QEMU (posted event and state file) or the management tool (sync >> event). > > It is safe w/o a state file by changing the basic live copy algorithm: > > 1. Live copy in progress stage > Once live copy command is issued, a dirty bit map is created for > tracking. There is a single pass over the entire image where we copy > blocks from the src to the dst. > > Write commands for blocks that were already copied will be done > twice for the src and dst. > > Once the full copy single pass ends, we trigger a QMP event that > this stage can end. > > The live copy stage keeps running till the management issue a switch > command. When it will happen, the switch is immediate and no need to > copy additional blocks (but flush pending IOs). > > 2. Management sends a switch command. > Qemu stops the doubling the IO and switches to the destination. > End. Here is where your race is: 2. Management sends a switch command 3. QEMU receives switch command 4. QEMU stops doubling IO and switches to the destination 5. QEMU sends acknowledgement of switch command 6. Management receives acknowledge of switch command 7. Management changes internal state definition to reflect the new destination If QEMU or the management tool crashes after step 4 and before step 6, when the management tool restarts QEMU with the source image, data loss will have occurred (and potentially corruption if a flush had happened). This all boils down to the Two Generals Problem[1]. It's simply not fixable without making one end reliable and that means that someone needs to fsync() something *after* the switchover happens but before the first write happens. That can be QEMU (Avi's RAID proposal and my state file proposal) or it can be the management tool (if we introduce synchronous events). [1] http://en.wikipedia.org/wiki/Two_Generals%27_Problem Regards, Anthony Liguori
On 03/02/2011 02:39 PM, Anthony Liguori wrote: > > Here is where your race is: > > 2. Management sends a switch command > > 3. QEMU receives switch command > > 4. QEMU stops doubling IO and switches to the destination > > 5. QEMU sends acknowledgement of switch command > > 6. Management receives acknowledge of switch command > > 7. Management changes internal state definition to reflect the new > destination > > If QEMU or the management tool crashes after step 4 and before step 6, > when the management tool restarts QEMU with the source image, data > loss will have occurred (and potentially corruption if a flush had > happened). No. After step 2, any qemu restart will be with the destination image. If the management tool restarts, it can query the state (or just re-issue the switch command, which is idempotent). > > This all boils down to the Two Generals Problem[1]. It's simply not > fixable without making one end reliable and that means that someone > needs to fsync() something *after* the switchover happens but before > the first write happens. That can be QEMU (Avi's RAID proposal and my > state file proposal) or it can be the management tool (if we introduce > synchronous events). The two problems are not equivalent. Once the management tool receives acknowledgement that the switch occurred, the protocol terminates.
On 03/02/2011 08:00 AM, Avi Kivity wrote: > On 03/02/2011 02:39 PM, Anthony Liguori wrote: >> >> Here is where your race is: >> >> 2. Management sends a switch command >> >> 3. QEMU receives switch command >> >> 4. QEMU stops doubling IO and switches to the destination >> >> 5. QEMU sends acknowledgement of switch command >> >> 6. Management receives acknowledge of switch command >> >> 7. Management changes internal state definition to reflect the new >> destination >> >> If QEMU or the management tool crashes after step 4 and before step >> 6, when the management tool restarts QEMU with the source image, data >> loss will have occurred (and potentially corruption if a flush had >> happened). > > No. After step 2, any qemu restart will be with the destination > image. If the management tool restarts, it can query the state (or > just re-issue the switch command, which is idempotent). Yeah, I think you're right, although I need to think through it a bit more. Regards, Anthony Liguori
On 03/01/2011 05:51 PM, Anthony Liguori wrote: >>> Do a hot unplug of a network device with upstream libvirt with >>> acpiphp unloaded, consult libvirt and then consult the monitor to >>> see who has the right view of the guests config. >>> >> >> libvirt is right and the monitor is wrong. >> >> On real hardware, calling _EJ0 doesn't affect the configuration one >> little bit (if I understand it correctly). It just turns off power >> to the slot. If you power-cycle, the card will be there. > > It's up to the hardware vendor. Since it's ACPI, it can result in any > number of operations. Usually, there's some logic to flip on an LED > or something. > > There's nothing that prevents a vendor from ejecting the card. My > point is that there aren't cleanly separated lines in the real world. We can implement out virtual hardware like real hardware, or we can do some new stuff, and break our management model in the process. >>> >> >> Unless I'm hallucinating, you're suggesting quite a bit more. A >> revolution in how qemu is to be managed. > > Let me take another route to see if I can't persuade you. > > First, let's clarify your proposal. You want to introduce a new block > format > that references to block devices. It may also store a dirty bitmap to > keep > track of which blocks are out of sync. Hopefully, it goes without saying > that the dirty bitmap is strictly optional (it's a performance > optimization) so > let's ignore it. (as was related elsewhere, the state is also optional) > > Your format, as a text file, looks like: > > [raid1] > primary=diska.img > secondary=diskb.img > active=primary > > To use it, here's the sequence: > > 0) qemu uses disk A for a block device > > 1) create a raid1 block device pointing to disk A and disk B. > > 2) management tool asks qemu to us the new raid1 block device. > > 3) qemu acks (2) > > 4) at some point, the mirror completes, writes are going to both disks > > 5) qemu sends out an event indicating that the disks are in sync > > 6) management tool then sends a command to fail over to disk B > > 7) qemu acks (6) > > We're making the management tool the "authoritative" source of how to > launch > QEMU. That means that the management tool ultimately determines which > command > line to relaunch QEMU with. > > Here are the races: > > A) If QEMU crashes between (2) and (3), it may have issues a write to > the new > raid1 block device before the management tool sees (3). If this > happens, > when the management tool restarts QEMU with disk A, we're left with a > dangling raid1 block device. Not a critical failure, but not ideal. You can restart qemu with the RAID1 blockdev. > > B) If QEMU crashes between (6) and (7), QEMU may have started writing > to disk > B before the management tool sees (7). This means that the > management tool > will create the guest with the raid1 block device which no longer > is the > correct disk. This could fail in subtly bad ways. Depending on > how read > is implemented (if you try to do striping for instance), bad data > could be > returned. You could try to implement a policy of always reading > from B if > the block has been copied but this gets harry really quickly. It's > definitely not RAID1 anymore. As related elsewhere, you restart qemu with image B. The trick is to partition the problem into idempotent commands; these allow you to recover from any failure. > > You may observe that the problem is not the RAID1 mechanism, but > changing from > using a normal device and the RAID1 mechanism. It would then be wise > to say, > let's always use this image format. Since that eliminates the race, > we don't > really need the copy bitmap anymore. > > Now we're left with a simple format that just refers to two > filenames. However, > block devices are more than just a filename. It needs a format, cache > settings, etc. So let's put this all in the RAID1 block format. We > also need > a way to indicate which block device is selected. > > Let's make it a text file for purposes of discussion. It will look > something > like: > > [primary] > filename=diska.img > cache=none > format=raw > > [secondary] > filename=diskb.img > cache=writethrough > format=qcow2 > > [global] > active=primary > > Since we might want to mirror multiple drives at once, we should > probablyn > support having multiple drives configured which means we need to not > just have > a single active entry, but an entry associated with a particular device. Or you have one file per RAID-1 image set. This is important because images are not associated with a qemu instance. You can hot-unplug an image from one qemu and hot-plug it into another. > > [drive "diskA"] > filename=diska.img > cache=none > format=raw > > [drive "diskB"] > filename=diskb.img > cache=writethrough > format=qcow2 > > [device "vda"] > drive=diskB > > And this is exactly what I'm proposing. It's exactly what I'm opposing. Making qemu manage all this stuff. > It's really the natural generalization > of what you're proposing. > > So basically, the only differences are: > > 1) always use the new RAID1 format > 2) drop the progress bitmap > 3) support multiple devices per file > 4) let drive properties be specified beyond filename > > All reasonable things to do. Well, I dislike 3, and the whole "qemu is authoritative source of configuration" thing.
On 03/02/2011 11:30 AM, Avi Kivity wrote: >> It's really the natural generalization >> of what you're proposing. >> >> So basically, the only differences are: >> >> 1) always use the new RAID1 format >> 2) drop the progress bitmap >> 3) support multiple devices per file >> 4) let drive properties be specified beyond filename >> >> All reasonable things to do. > > Well, I dislike 3, and the whole "qemu is authoritative source of > configuration" thing. Yeah, at this point, I think the new no-stored-state approach is obviously better than anything proposed. So we'll have to revisit this discussion if/when there's another use-case that demands it. Regards, Anthony Liguori
Index: qemu/block-copy.c =================================================================== --- /dev/null +++ qemu/block-copy.c @@ -0,0 +1,741 @@ +/* + * QEMU live block copy + * + * Copyright (C) 2010 Red Hat Inc. + * + * Authors: Marcelo Tosatti <mtosatti@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block_int.h" +#include "blockdev.h" +#include "qemu-queue.h" +#include "qemu-timer.h" +#include "monitor.h" +#include "block-copy.h" +#include "migration.h" +#include "sysemu.h" +#include "qjson.h" +#include <assert.h> + +#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) +#define MAX_IS_ALLOCATED_SEARCH 65536 + +/* + * Stages: + * + * STAGE_BULK: bulk reads/writes in progress + * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress + * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress + * STAGE_SWITCH_FINISHED: switched to new image. + */ + +enum BdrvCopyStage { + STAGE_BULK, + STAGE_BULK_FINISHED, + STAGE_DIRTY, + STAGE_SWITCH_FINISHED, +}; + +typedef struct BdrvCopyState { + BlockDriverState *src; + BlockDriverState *dst; + bool shared_base; + + int64_t curr_sector; + int64_t completed_sectors; + int64_t nr_sectors; + + enum BdrvCopyStage stage; + int inflight_reads; + int error; + int failed; + int cancelled; + QLIST_HEAD(, BdrvCopyBlock) io_list; + unsigned long *aio_bitmap; + QEMUTimer *aio_timer; + QLIST_ENTRY(BdrvCopyState) list; + + int64_t blocks; + int64_t total_time; + + char src_device_name[32]; + char dst_filename[1024]; + int commit_fd; +} BdrvCopyState; + +typedef struct BdrvCopyBlock { + BdrvCopyState *state; + uint8_t *buf; + int64_t sector; + int64_t nr_sectors; + struct iovec iov; + QEMUIOVector qiov; + BlockDriverAIOCB *aiocb; + int64_t time; + QLIST_ENTRY(BdrvCopyBlock) list; +} BdrvCopyBlock; + +static QLIST_HEAD(, BdrvCopyState) block_copy_list = + QLIST_HEAD_INITIALIZER(block_copy_list); + +static void alloc_aio_bitmap(BdrvCopyState *s) +{ + BlockDriverState *bs = s->src; + int64_t bitmap_size; + + bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) + + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; + bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; + + s->aio_bitmap = qemu_mallocz(bitmap_size); +} + +static bool aio_inflight(BdrvCopyState *s, int64_t sector) +{ + int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; + + if (s->aio_bitmap && + (sector << BDRV_SECTOR_BITS) < bdrv_getlength(s->src)) { + return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & + (1UL << (chunk % (sizeof(unsigned long) * 8)))); + } else { + return 0; + } +} + +static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num, + int nb_sectors, int set) +{ + int64_t start, end; + unsigned long val, idx, bit; + + start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; + end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; + + for (; start <= end; start++) { + idx = start / (sizeof(unsigned long) * 8); + bit = start % (sizeof(unsigned long) * 8); + val = s->aio_bitmap[idx]; + if (set) { + if (!(val & (1UL << bit))) { + val |= 1UL << bit; + } + } else { + if (val & (1UL << bit)) { + val &= ~(1UL << bit); + } + } + s->aio_bitmap[idx] = val; + } +} + +static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage) +{ + s->stage = stage; + + switch (stage) { + case STAGE_BULK: + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK); + break; + case STAGE_BULK_FINISHED: + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED); + break; + case STAGE_DIRTY: + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY); + break; + case STAGE_SWITCH_FINISHED: + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED); + break; + default: + break; + } +} + +static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret) +{ + s->error = ret; + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock)); +} + +static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time) +{ + s->blocks++; + s->total_time += time; +} + +static void blk_copy_write_cb(void *opaque, int ret) +{ + BdrvCopyBlock *blk = opaque; + BdrvCopyState *s = blk->state; + + if (ret < 0) { + QLIST_REMOVE(blk, list); + qemu_free(blk->buf); + qemu_free(blk); + blk_copy_handle_cb_error(s, ret); + return; + } + + QLIST_REMOVE(blk, list); + add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time); + + /* schedule switch to STAGE_DIRTY on last bulk write completion */ + if (blk->state->stage == STAGE_BULK_FINISHED) { + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock)); + } + + if (blk->state->stage > STAGE_BULK_FINISHED) { + set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0); + } + + qemu_free(blk->buf); + qemu_free(blk); +} + +static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk) +{ + BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock)); + blk->state = s; + blk->sector = read_blk->sector; + blk->nr_sectors = read_blk->nr_sectors; + blk->time = read_blk->time; + blk->buf = read_blk->buf; + QLIST_INSERT_HEAD(&s->io_list, blk, list); + + blk->iov.iov_base = read_blk->buf; + blk->iov.iov_len = read_blk->iov.iov_len; + qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); + + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE); + blk->aiocb = bdrv_aio_writev(s->dst, blk->sector, &blk->qiov, + blk->iov.iov_len / BDRV_SECTOR_SIZE, + blk_copy_write_cb, blk); + if (!blk->aiocb) { + s->error = 1; + goto error; + } + + return; + +error: + QLIST_REMOVE(blk, list); + qemu_free(read_blk->buf); + qemu_free(blk); +} + +static void blk_copy_read_cb(void *opaque, int ret) +{ + BdrvCopyBlock *blk = opaque; + BdrvCopyState *s = blk->state; + + s->inflight_reads--; + if (ret < 0) { + QLIST_REMOVE(blk, list); + qemu_free(blk->buf); + qemu_free(blk); + blk_copy_handle_cb_error(s, ret); + return; + } + blk_copy_issue_write(s, blk); + QLIST_REMOVE(blk, list); + qemu_free(blk); + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock)); +} + +static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector, + int nr_sectors) +{ + BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock)); + blk->buf = qemu_mallocz(BLOCK_SIZE); + blk->state = s; + blk->sector = sector; + blk->nr_sectors = nr_sectors; + QLIST_INSERT_HEAD(&s->io_list, blk, list); + + blk->iov.iov_base = blk->buf; + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; + qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); + + s->inflight_reads++; + blk->time = qemu_get_clock_ns(rt_clock); + blk->aiocb = bdrv_aio_readv(s->src, sector, &blk->qiov, nr_sectors, + blk_copy_read_cb, blk); + if (!blk->aiocb) { + s->error = 1; + goto error; + } + + return; + +error: + s->inflight_reads--; + QLIST_REMOVE(blk, list); + qemu_free(blk->buf); + qemu_free(blk); +} + +static bool blkcopy_can_switch(BdrvCopyState *s) +{ + int64_t remaining_dirty; + int64_t avg_transfer_time; + + remaining_dirty = bdrv_get_dirty_count(s->src); + if (remaining_dirty == 0 || s->blocks == 0) { + return true; + } + + avg_transfer_time = s->total_time / s->blocks; + if ((remaining_dirty * avg_transfer_time) <= migrate_max_downtime()) { + return true; + } + return false; +} + +static int blk_issue_reads_dirty(BdrvCopyState *s) +{ + int64_t sector; + + for (sector = s->curr_sector; sector < s->nr_sectors;) { + if (bdrv_get_dirty(s->src, sector) && !aio_inflight(s, sector)) { + int nr_sectors = MIN(s->nr_sectors - s->curr_sector, + BDRV_SECTORS_PER_DIRTY_CHUNK); + + blk_copy_issue_read(s, sector, nr_sectors); + bdrv_reset_dirty(s->src, sector, nr_sectors); + set_aio_inflight(s, sector, nr_sectors, 1); + break; + } + + sector += BDRV_SECTORS_PER_DIRTY_CHUNK; + s->curr_sector = sector; + } + + if (sector >= s->nr_sectors) { + s->curr_sector = 0; + } + return 0; +} + +static int blk_issue_reads_bulk(BdrvCopyState *s) +{ + int nr_sectors; + int64_t curr_sector = s->curr_sector; + + if (s->shared_base) { + while (curr_sector < s->nr_sectors && + !bdrv_is_allocated(s->src, curr_sector, + MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) { + curr_sector += nr_sectors; + } + } + + if (curr_sector >= s->nr_sectors) { + s->curr_sector = 0; + return 1; + } + + curr_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1); + nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; + + blk_copy_issue_read(s, s->curr_sector, nr_sectors); + s->curr_sector += nr_sectors; + s->completed_sectors = curr_sector; + return 0; +} + +static void blkcopy_finish(BdrvCopyState *s) +{ + int64_t sector; + uint8_t *buf; + + buf = qemu_malloc(BLOCK_SIZE); + + /* FIXME: speed up loop, get_next_dirty_block? */ + for (sector = 0; sector < s->nr_sectors; + sector += BDRV_SECTORS_PER_DIRTY_CHUNK) { + if (bdrv_get_dirty(s->src, sector)) { + int nr_sectors = MIN(s->nr_sectors - sector, + BDRV_SECTORS_PER_DIRTY_CHUNK); + + memset(buf, 0, BLOCK_SIZE); + if (bdrv_read(s->src, sector, buf, nr_sectors) < 0) { + goto error; + } + if (bdrv_write(s->dst, sector, buf, nr_sectors) < 0) { + goto error; + } + bdrv_reset_dirty(s->src, sector, nr_sectors); + } + + if (bdrv_get_dirty_count(s->src) == 0) + break; + } + qemu_free(buf); + return; + +error: + qemu_free(buf); + s->error = 1; +} + +static int write_commit_file(BdrvCopyState *s) +{ + char commit_msg[1400]; + const char *buf = commit_msg; + int len, ret; + + sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name, + s->dst_filename); + + len = strlen(commit_msg); + while (len > 0) { + ret = write(s->commit_fd, buf, len); + if (ret == -1 && errno == EINTR) { + continue; + } + if (ret <= 0) { + return -errno; + } + buf += ret; + len -= ret; + } + + if (fsync(s->commit_fd) == -1) { + return -errno; + } + + return 0; +} + +static void blkcopy_cleanup(BdrvCopyState *s) +{ + assert(s->inflight_reads == 0); + assert(QLIST_EMPTY(&s->io_list)); + bdrv_set_dirty_tracking(s->src, 0); + drive_put_ref(drive_get_by_blockdev(s->src)); + bdrv_set_in_use(s->src, 0); + if (s->stage >= STAGE_DIRTY) + qemu_free(s->aio_bitmap); + qemu_del_timer(s->aio_timer); + if (s->commit_fd) + close(s->commit_fd); +} + +static void blkcopy_free(BdrvCopyState *s) +{ + QLIST_REMOVE(s, list); + qemu_free(s); +} + +static void handle_error(BdrvCopyState *s) +{ + if (!QLIST_EMPTY(&s->io_list)) + return; + s->failed = 1; + blkcopy_cleanup(s); +} + +static void blkcopy_switch(BdrvCopyState *s) +{ + char src_filename[1024]; + int open_flags; + + strncpy(src_filename, s->src->filename, sizeof(src_filename)); + open_flags = s->src->open_flags; + + assert(s->stage == STAGE_DIRTY); + + vm_stop(VMSTOP_BLOCKCOPY); + /* flush any guest writes, dirty bitmap uptodate after this. + * copy AIO also finished. + */ + qemu_aio_flush(); + assert(QLIST_EMPTY(&s->io_list)); + if (s->error) { + handle_error(s); + goto vm_start; + } + blkcopy_finish(s); + if (s->error) { + handle_error(s); + goto vm_start; + } + assert(bdrv_get_dirty_count(s->src) == 0); + bdrv_flush_all(); + + bdrv_close(s->src); + bdrv_close(s->dst); + if (bdrv_open(s->src, s->dst->filename, s->src->open_flags, NULL) < 0) { + s->failed = 1; + goto err; + } + + if (s->commit_fd && write_commit_file(s)) { + s->failed = 1; + bdrv_close(s->src); + goto err; + } + + blkcopy_set_stage(s, STAGE_SWITCH_FINISHED); + blkcopy_cleanup(s); +vm_start: + vm_start(); + return; + +err: + if (bdrv_open(s->src, src_filename, open_flags, NULL) < 0) { + error_report("%s: %s: cannot fallback to source image\n", __func__, + s->dst_filename); + abort(); + } + blkcopy_cleanup(s); + goto vm_start; +} + +#define BLKCOPY_INFLIGHT 2 + +/* + * To simplify the implementation, the IO completion callbacks do not + * handle stage control or submit IO for further blocks. A timer is used + * for such purpose. + */ + +static void aio_timer(void *opaque) +{ + BdrvCopyState *s = opaque; + + assert(s->cancelled == 0); + + if (s->error) { + handle_error(s); + return; + } + + while (s->stage == STAGE_BULK) { + if (s->inflight_reads >= BLKCOPY_INFLIGHT) { + break; + } + if (blk_issue_reads_bulk(s)) { + blkcopy_set_stage(s, STAGE_BULK_FINISHED); + } + } + + if (s->stage == STAGE_BULK_FINISHED) { + if (QLIST_EMPTY(&s->io_list)) { + blkcopy_set_stage(s, STAGE_DIRTY); + alloc_aio_bitmap(s); + } + } + + while (s->stage == STAGE_DIRTY) { + if (s->inflight_reads >= BLKCOPY_INFLIGHT) { + break; + } + blk_issue_reads_dirty(s); + if (blkcopy_can_switch(s)) { + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_SWITCH_START); + blkcopy_switch(s); + return; + } + } +} + +static int bdrv_copy(Monitor *mon, const char * device, BlockDriverState *src, + BlockDriverState *dst, const char *commit_file, + bool shared_base) +{ + int64_t sectors; + BdrvCopyState *blkcopy, *safe; + int f; + + QLIST_FOREACH_SAFE(blkcopy, &block_copy_list, list, safe) { + if (!strcmp(blkcopy->src_device_name, src->device_name)) { + if (blkcopy->stage == STAGE_SWITCH_FINISHED || blkcopy->failed) { + blkcopy_free(blkcopy); + } else { + qerror_report(QERR_BLOCKCOPY_IN_PROGRESS, src->device_name); + return -1; + } + } + } + + sectors = bdrv_getlength(src) >> BDRV_SECTOR_BITS; + if (sectors != bdrv_getlength(dst) >> BDRV_SECTOR_BITS) { + qerror_report(QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS); + return -1; + } + + if (commit_file) { + f = open(commit_file, O_CREAT|O_WRONLY, S_IRUSR); + if (f == -1) { + qerror_report(QERR_OPEN_FILE_FAILED, commit_file); + return -1; + } + } + + blkcopy = qemu_mallocz(sizeof(BdrvCopyState)); + blkcopy->src = src; + blkcopy->dst = dst; + blkcopy->curr_sector = 0; + blkcopy->nr_sectors = sectors; + blkcopy_set_stage(blkcopy, STAGE_BULK); + blkcopy->aio_timer = qemu_new_timer(rt_clock, aio_timer, blkcopy); + blkcopy->shared_base = shared_base; + blkcopy->commit_fd = f; + strncpy(blkcopy->src_device_name, blkcopy->src->device_name, + sizeof(blkcopy->src_device_name) - 1); + strncpy(blkcopy->dst_filename, blkcopy->dst->filename, + sizeof(blkcopy->dst_filename) - 1); + + bdrv_set_dirty_tracking(src, 1); + qemu_mod_timer(blkcopy->aio_timer, qemu_get_clock(rt_clock)); + drive_get_ref(drive_get_by_blockdev(src)); + bdrv_set_in_use(src, 1); + + QLIST_INSERT_HEAD(&block_copy_list, blkcopy, list); + return 0; +} + +int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data) +{ + const char *device = qdict_get_str(qdict, "device"); + const char *filename = qdict_get_str(qdict, "filename"); + const char *commit_file = qdict_get_try_str(qdict, "commit_filename"); + bool shared_base = qdict_get_try_bool(qdict, "inc", 0); + BlockDriverState *new_bs, *bs; + int ret; + + if (migration_active()) { + qerror_report(QERR_MIGRATION_IN_PROGRESS); + return -1; + } + + bs = bdrv_find(device); + if (!bs) { + qerror_report(QERR_DEVICE_NOT_FOUND, device); + return -1; + } + + new_bs = bdrv_new(""); + if (bdrv_open(new_bs, filename, bs->open_flags, NULL) < 0) { + bdrv_delete(new_bs); + qerror_report(QERR_OPEN_FILE_FAILED, filename); + return -1; + } + + ret = bdrv_copy(mon, device, bs, new_bs, commit_file, shared_base); + return ret; +} + +int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data) +{ + BdrvCopyState *blkcopy, *s = NULL; + const char *device = qdict_get_str(qdict, "device"); + + QLIST_FOREACH(blkcopy, &block_copy_list, list) { + if (!strcmp(blkcopy->src_device_name, device)) { + s = blkcopy; + break; + } + } + + if (!s) { + qerror_report(QERR_DEVICE_NOT_FOUND, device); + return -1; + } + + s->cancelled = 1; + do { + qemu_aio_flush(); + } while (!QLIST_EMPTY(&s->io_list)); + blkcopy_cleanup(s); + blkcopy_free(s); + + return 0; +} + +static void blockcopy_print_dict(QObject *obj, void *opaque) +{ + QDict *c_dict; + Monitor *mon = opaque; + + c_dict = qobject_to_qdict(obj); + + monitor_printf(mon, "%s: status=%s ", + qdict_get_str(c_dict, "device"), + qdict_get_str(c_dict, "status")); + + if (qdict_haskey(c_dict, "info")) { + QDict *qdict = qobject_to_qdict(qdict_get(c_dict, "info")); + + monitor_printf(mon, "percentage=%ld %%", + qdict_get_int(qdict, "percentage")); + } + + monitor_printf(mon, "\n"); +} + +void do_info_blockcopy_print(Monitor *mon, const QObject *data) +{ + qlist_iter(qobject_to_qlist(data), blockcopy_print_dict, mon); +} + +void do_info_blockcopy(Monitor *mon, QObject **ret_data) +{ + QList *c_list; + BdrvCopyState *s; + + c_list = qlist_new(); + + QLIST_FOREACH(s, &block_copy_list, list) { + QObject *c_obj; + static const char *status[] = { "failed", "active", "completed" }; + int i; + + if (s->failed) { + i = 0; + } else if (s->stage < STAGE_SWITCH_FINISHED) { + i = 1; + } else { + i = 2; + } + + c_obj = qobject_from_jsonf("{ 'device': %s, 'status': %s }", + s->src_device_name, status[i]); + + if (i == 1) { + QDict *dict = qobject_to_qdict(c_obj); + QObject *obj; + + /* FIXME: add dirty stage progress? */ + obj = qobject_from_jsonf("{ 'percentage': %" PRId64 "}", + s->completed_sectors * 100 / s->nr_sectors); + qdict_put_obj(dict, "info", obj); + } + qlist_append_obj(c_list, c_obj); + } + + *ret_data = QOBJECT(c_list); +} + +bool block_copy_active(void) +{ + BdrvCopyState *s; + + QLIST_FOREACH(s, &block_copy_list, list) { + if (s->failed) { + continue; + } + if (s->stage < STAGE_SWITCH_FINISHED) { + return true; + } + } + + return false; +} + Index: qemu/block-copy.h =================================================================== --- /dev/null +++ qemu/block-copy.h @@ -0,0 +1,25 @@ +/* + * QEMU live block copy + * + * Copyright (C) 2010 Red Hat Inc. + * + * Authors: Marcelo Tosatti <mtosatti@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef BLOCK_COPY_H +#define BLOCK_COPY_H + +int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data); +int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data); + +void do_info_blockcopy_print(Monitor *mon, const QObject *data); +void do_info_blockcopy(Monitor *mon, QObject **ret_data); + +bool block_copy_active(void); + +#endif /* BLOCK_COPY_H */ + Index: qemu/hmp-commands.hx =================================================================== --- qemu.orig/hmp-commands.hx +++ qemu/hmp-commands.hx @@ -806,6 +806,43 @@ Set maximum speed to @var{value} (in byt ETEXI { + .name = "block_copy", + .args_type = "device:s,filename:s,commit_filename:s?,inc:-i", + .params = "device filename [commit_filename] [-i]", + .help = "live block copy device to image" + "\n\t\t\t optional commit filename " + "\n\t\t\t -i for incremental copy " + "(base image shared between src and destination)", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_bdrv_copy, + }, + +STEXI +@item block_copy @var{device} @var{filename} [@var{commit_filename}] [-i] +@findex block_copy +Live copy block device @var{device} to image @var{filename}. + -i for incremental copy (base image is shared) + +Optionally a commit message is written to @var{commit_filename} +once the switch to the new image is performed. +ETEXI + + { + .name = "block_copy_cancel", + .args_type = "device:s", + .params = "device", + .help = "cancel live block copy", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_bdrv_copy_cancel, + }, + +STEXI +@item block_copy_cancel @var{device} +@findex block_copy_cancel +Cancel live block copy on @var{device}. +ETEXI + + { .name = "migrate_set_downtime", .args_type = "value:T", .params = "value", @@ -1352,6 +1389,8 @@ show device tree show qdev device model list @item info roms show roms +@item info block-copy +show block copy status @end table ETEXI Index: qemu/monitor.c =================================================================== --- qemu.orig/monitor.c +++ qemu/monitor.c @@ -45,6 +45,7 @@ #include "balloon.h" #include "qemu-timer.h" #include "migration.h" +#include "block-copy.h" #include "kvm.h" #include "acl.h" #include "qint.h" @@ -3109,6 +3110,14 @@ static const mon_cmd_t info_cmds[] = { }, #endif { + .name = "block-copy", + .args_type = "", + .params = "", + .help = "show block copy status", + .user_print = do_info_blockcopy_print, + .mhandler.info_new = do_info_blockcopy, + }, + { .name = NULL, }, }; @@ -3250,6 +3259,14 @@ static const mon_cmd_t qmp_query_cmds[] .mhandler.info_async = do_info_balloon, .flags = MONITOR_CMD_ASYNC, }, + { + .name = "block-copy", + .args_type = "", + .params = "", + .help = "show block copy status", + .user_print = do_info_blockcopy_print, + .mhandler.info_new = do_info_blockcopy, + }, { /* NULL */ }, }; Index: qemu/block.h =================================================================== --- qemu.orig/block.h +++ qemu/block.h @@ -299,6 +299,13 @@ typedef enum { BLKDBG_CLUSTER_ALLOC_BYTES, BLKDBG_CLUSTER_FREE, + BLKDBG_BLKCOPY_STAGE_BULK, + BLKDBG_BLKCOPY_STAGE_BULK_FINISHED, + BLKDBG_BLKCOPY_STAGE_DIRTY, + BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED, + BLKDBG_BLKCOPY_SWITCH_START, + BLKDBG_BLKCOPY_AIO_WRITE, + BLKDBG_EVENT_MAX, } BlkDebugEvent; Index: qemu/block/blkdebug.c =================================================================== --- qemu.orig/block/blkdebug.c +++ qemu/block/blkdebug.c @@ -178,6 +178,14 @@ static const char *event_names[BLKDBG_EV [BLKDBG_CLUSTER_ALLOC] = "cluster_alloc", [BLKDBG_CLUSTER_ALLOC_BYTES] = "cluster_alloc_bytes", [BLKDBG_CLUSTER_FREE] = "cluster_free", + + + [BLKDBG_BLKCOPY_STAGE_BULK] = "blkcopy_stage_bulk", + [BLKDBG_BLKCOPY_STAGE_BULK_FINISHED] = "blkcopy_stage_bulk_finished", + [BLKDBG_BLKCOPY_STAGE_DIRTY] = "blkcopy_stage_dirty", + [BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED] = "blkcopy_stage_switch_finished", + [BLKDBG_BLKCOPY_SWITCH_START] = "blkcopy_switch_start", + [BLKDBG_BLKCOPY_AIO_WRITE] = "blkcopy_aio_write", }; static int get_event_by_name(const char *name, BlkDebugEvent *event) Index: qemu/qerror.c =================================================================== --- qemu.orig/qerror.c +++ qemu/qerror.c @@ -209,6 +209,18 @@ static const QErrorStringTable qerror_ta .error_fmt = QERR_VNC_SERVER_FAILED, .desc = "Could not start VNC server on %(target)", }, + { + .error_fmt = QERR_BLOCKCOPY_IN_PROGRESS, + .desc = "Block copy for %(device) in progress", + }, + { + .error_fmt = QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS, + .desc = "Length of destination image differs from source image", + }, + { + .error_fmt = QERR_MIGRATION_IN_PROGRESS, + .desc = "Migration in progress", + }, {} }; Index: qemu/qerror.h =================================================================== --- qemu.orig/qerror.h +++ qemu/qerror.h @@ -171,4 +171,13 @@ QError *qobject_to_qerror(const QObject #define QERR_VNC_SERVER_FAILED \ "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }" +#define QERR_BLOCKCOPY_IN_PROGRESS \ + "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }" + +#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \ + "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }" + +#define QERR_MIGRATION_IN_PROGRESS \ + "{ 'class': 'MigrationInProgress', 'data': {} }" + #endif /* QERROR_H */ Index: qemu/qmp-commands.hx =================================================================== --- qemu.orig/qmp-commands.hx +++ qemu/qmp-commands.hx @@ -581,6 +581,75 @@ Example: EQMP { + .name = "block_copy", + .args_type = "device:s,filename:s,commit_filename:s?,inc:-i", + .params = "device filename [commit_filename] [-i]", + .help = "live block copy device to image" + "\n\t\t\t optional commit filename " + "\n\t\t\t -i for incremental copy " + "(base image shared between src and destination)", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_bdrv_copy, + }, + +SQMP +block-copy +------- + +Live block copy. + +Arguments: + +- "device": device name (json-string) +- "filename": target image filename (json-string) +- "commit_filename": target commit filename (json-string, optional) +- "inc": incremental disk copy (json-bool, optional) + +Example: + +-> { "execute": "block_copy", + "arguments": { "device": "ide0-hd1", + "filename": "/mnt/new-disk.img", + "commit_filename: "/mnt/commit-new-disk.img" + } } + +<- { "return": {} } + +Notes: + +(1) The 'query-block-copy' command should be used to check block copy progress + and final result (this information is provided by the 'status' member) +(2) Boolean argument "inc" defaults to false + +EQMP + + { + .name = "block_copy_cancel", + .args_type = "device:s", + .params = "device", + .help = "cancel live block copy", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_bdrv_copy_cancel, + }, + +SQMP +block_copy_cancel +-------------- + +Cancel live block copy. + +Arguments: + +- device: device name (json-string) + +Example: + +-> { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } } +<- { "return": {} } + +EQMP + + { .name = "netdev_add", .args_type = "netdev:O", .params = "[user|tap|socket],id=str[,prop=value][,...]", @@ -1740,6 +1809,44 @@ Examples: EQMP SQMP +query-block-copy +------------- + +Live block copy status. + +Each block copy instance information is stored in a json-object and the returned +value is a json-array of all instances. + +Each json-object contains the following: + +- "device": device name (json-string) +- "status": block copy status (json-string) + - Possible values: "active", "failed", "completed" +- "info": A json-object with the statistics information, if status is "active": + - "percentage": percentage completed (json-int) + +Example: + +Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed: + +-> { "execute": "query-block-copy" } +<- { + "return":[ + {"device":"ide1-hd0", + "status":"active", + "info":{ + "percentage":23, + } + }, + {"device":"ide1-hd1", + "status":"failed" + } + ] + } + +EQMP + +SQMP query-balloon ------------- Index: qemu/Makefile.objs =================================================================== --- qemu.orig/Makefile.objs +++ qemu/Makefile.objs @@ -98,7 +98,7 @@ common-obj-y += buffered_file.o migratio common-obj-y += qemu-char.o savevm.o #aio.o common-obj-y += msmouse.o ps2.o common-obj-y += qdev.o qdev-properties.o -common-obj-y += block-migration.o +common-obj-y += block-migration.o block-copy.o common-obj-y += pflib.o common-obj-$(CONFIG_BRLAPI) += baum.o Index: qemu/sysemu.h =================================================================== --- qemu.orig/sysemu.h +++ qemu/sysemu.h @@ -46,6 +46,7 @@ void qemu_del_vm_change_state_handler(VM #define VMSTOP_SAVEVM 6 #define VMSTOP_LOADVM 7 #define VMSTOP_MIGRATE 8 +#define VMSTOP_BLOCKCOPY 9 void vm_start(void); void vm_stop(int reason);
Support live image copy + switch. That is, copy an image backing a guest hard disk to a destination image (destination image must be created separately), and switch to this copy. Command syntax: block_copy device filename [commit_filename] [-i] -- live block copy device to image optional commit filename -i for incremental copy (base image shared between src and destination) Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>