[RFC,v1:,03/12] migration: introduce parallelization of migration_bitmap

Message ID	1382318062-6288-4-git-send-email-mrhines@linux.vnet.ibm.com
State	New
Headers	show Return-Path: <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org> Gateway: Authorized Use Only! Violators will be prosecuted for <qemu-devel@nongnu.org> from <mrhines@linux.vnet.ibm.com>; Sun, 20 Oct 2013 21:14:42 -0400 Gateway: Authorized Use Only! Violators will be prosecuted; Sun, 20 Oct 2013 21:14:41 -0400 From: mrhines@linux.vnet.ibm.com To: qemu-devel@nongnu.org Date: Mon, 21 Oct 2013 01:14:13 +0000 Message-Id: <1382318062-6288-4-git-send-email-mrhines@linux.vnet.ibm.com> In-Reply-To: <1382318062-6288-1-git-send-email-mrhines@linux.vnet.ibm.com> References: <1382318062-6288-1-git-send-email-mrhines@linux.vnet.ibm.com> Cc: aliguori@us.ibm.com, quintela@redhat.com, owasserm@redhat.com, onom@us.ibm.com, abali@us.ibm.com, mrhines@us.ibm.com, gokul@us.ibm.com, pbonzini@redhat.com Subject: [Qemu-devel] [RFC PATCH v1: 03/12] migration: introduce parallelization of migration_bitmap Precedence: list Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org

diff --git a/arch_init.c b/arch_init.c index 7545d96..4a71311 100644 --- a/arch_init.c +++ b/arch_init.c @@ -189,6 +189,8 @@ typedef struct AccountingInfo { uint64_t skipped_pages; uint64_t norm_pages; uint64_t iterations; + uint64_t log_dirty_time; + uint64_t migration_bitmap_time; uint64_t xbzrle_bytes; uint64_t xbzrle_pages; uint64_t xbzrle_cache_miss; @@ -232,6 +234,16 @@ uint64_t norm_mig_pages_transferred(void) return acct_info.norm_pages; } +uint64_t norm_mig_log_dirty_time(void) +{ + return acct_info.log_dirty_time; +} + +uint64_t norm_mig_bitmap_time(void) +{ + return acct_info.migration_bitmap_time; +} + uint64_t xbzrle_mig_bytes_transferred(void) { return acct_info.xbzrle_bytes; @@ -362,15 +374,189 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, static inline bool migration_bitmap_set_dirty(MemoryRegion *mr, ram_addr_t offset) { - bool ret; - int nr = (mr->ram_addr + offset) >> TARGET_PAGE_BITS; + return test_and_set_bit((mr->ram_addr + offset) >> TARGET_PAGE_BITS, + migration_bitmap); +} + +typedef struct BitmapWalkerParams { + QemuMutex ready_mutex; + QemuMutex done_mutex; + QemuCond cond; + QemuThread walker; + MigrationState *s; + int core_id; + int keep_running; + ram_addr_t start; + ram_addr_t stop; + void *block; + uint64_t dirty_pages; +} BitmapWalkerParams; - ret = test_and_set_bit(nr, migration_bitmap); +static int nb_bitmap_workers = 0; - if (!ret) { - migration_dirty_pages++; +BitmapWalkerParams *bitmap_walkers = NULL; + +/* + * Bitmap workers: This is a temporary performance-driven + * workaround for the slowness (10s of milliseconds) incurred + * during calls to migration_bitmap_sync(). + * + * Ideally, migration_bitmap_sync() should be able to use the + * GET_LOG_DIRTY bitmap from KVM directly, but it does not right + * now because the bitmap is not retrieved as a single memory + * allocation which requires a couple of transformations into + * a 'unified' bitmap before the migration code can make good use + * of it. 
+ * + * Bitmap workers perform this transformation in parallel + * in a multi-threaded fashion until a patch is ready to process + * the bitmaps from GET_LOG_DIRTY directly. + */ +static uint64_t migration_bitmap_sync_range(RAMBlock *block, + ram_addr_t start, ram_addr_t stop) +{ + ram_addr_t addr; + uint64_t dirty_pages = 0; + + + for (addr = start; addr < stop; addr += TARGET_PAGE_SIZE) { + if (memory_region_test_and_clear_dirty(block->mr, + addr, TARGET_PAGE_SIZE, + DIRTY_MEMORY_MIGRATION)) { + if (!migration_bitmap_set_dirty(block->mr, addr)) { + dirty_pages++; + } + } + } + + return dirty_pages; +} + +/* + * The worker sleeps until it gets some work to transform a + * chunk of bitmap from KVM to the migration_bitmap. + */ +void *migration_bitmap_worker(void *opaque) +{ + BitmapWalkerParams * bwp = opaque; + + do { + qemu_mutex_lock(&bwp->ready_mutex); + qemu_mutex_lock(&bwp->done_mutex); + qemu_mutex_unlock(&bwp->ready_mutex); + qemu_cond_signal(&bwp->cond); + + if(!bwp->keep_running) { + break; + } + + bwp->dirty_pages = migration_bitmap_sync_range(bwp->block, bwp->start, bwp->stop); + + qemu_cond_wait(&bwp->cond, &bwp->done_mutex); + qemu_mutex_unlock(&bwp->done_mutex); + } while(bwp->keep_running); + + return NULL; +} + +void migration_bitmap_worker_start(MigrationState *s) +{ + int core; + + /* + * CPUs N - 1 are reserved for N - 1 worker threads + * processing the pc.ram bytemap => migration_bitmap. + * The migration thread goes on the last CPU, + * which process the remaining, smaller RAMblocks. 
+ */ + nb_bitmap_workers = getNumCores() - 1; + + bitmap_walkers = g_malloc0(sizeof(struct BitmapWalkerParams) * + nb_bitmap_workers); + + memset(bitmap_walkers, 0, sizeof(BitmapWalkerParams) * nb_bitmap_workers); + + for (core = 0; core < nb_bitmap_workers; core++) { + BitmapWalkerParams * bwp = &bitmap_walkers[core]; + bwp->core_id = core; + bwp->keep_running = 1; + bwp->s = s; + qemu_cond_init(&bwp->cond); + qemu_mutex_init(&bwp->ready_mutex); + qemu_mutex_init(&bwp->done_mutex); + qemu_mutex_lock(&bwp->ready_mutex); + } + + for (core = 0; core < nb_bitmap_workers; core++) { + BitmapWalkerParams * bwp = &bitmap_walkers[core]; + qemu_thread_create(&bwp->walker, + migration_bitmap_worker, bwp, QEMU_THREAD_DETACHED); + } +} + +void migration_bitmap_worker_stop(MigrationState *s) +{ + int core; + + for (core = 0; core < nb_bitmap_workers; core++) { + BitmapWalkerParams * bwp = &bitmap_walkers[core]; + bwp->keep_running = 0; + qemu_mutex_unlock(&bwp->ready_mutex); + } + + DPRINTF("Bitmap workers stopped.\n"); + + g_free(bitmap_walkers); + bitmap_walkers = NULL; + nb_bitmap_workers = 0; +} + + +static void migration_bitmap_distribute_specific_worker(MigrationState *s, RAMBlock * block, int core, ram_addr_t start, ram_addr_t stop) +{ + BitmapWalkerParams * bwp = &bitmap_walkers[core]; + + bwp->start = start; + bwp->stop = stop; + bwp->block = block; + + qemu_cond_wait(&bwp->cond, &bwp->ready_mutex); +} + +static void migration_bitmap_join_worker(MigrationState *s, int core) +{ + BitmapWalkerParams * bwp = &bitmap_walkers[core]; + qemu_mutex_lock(&bwp->done_mutex); + qemu_cond_signal(&bwp->cond); + qemu_mutex_unlock(&bwp->done_mutex); + migration_dirty_pages += bwp->dirty_pages; +} + +/* + * Chop up the QEMU 'bytemap' built around GET_LOG_DIRTY and handout + * the migration_bitmap population work to all the workers. + * + * If there are N cpus in the hypervisor, there will be N workers + * which each process equal chunks of the RAM block bytemap. 
+ */ +static void migration_bitmap_distribute_work(MigrationState *s, RAMBlock * block) +{ + uint64_t pages = block->length / TARGET_PAGE_SIZE; + uint64_t inc = pages / nb_bitmap_workers; + uint64_t remainder = pages % inc; + int core; + + for (core = 0; core < nb_bitmap_workers; core++) { + ram_addr_t start = core * inc, stop = core * inc + inc; + + if(core == (nb_bitmap_workers - 1)) + stop += remainder; + + start *= TARGET_PAGE_SIZE; + stop *= TARGET_PAGE_SIZE; + + migration_bitmap_distribute_specific_worker(s, block, core, start, stop); } - return ret; } /* Needs iothread lock! */ @@ -378,7 +564,6 @@ static inline bool migration_bitmap_set_dirty(MemoryRegion *mr, static void migration_bitmap_sync(void) { RAMBlock *block; - ram_addr_t addr; uint64_t num_dirty_pages_init = migration_dirty_pages; MigrationState *s = migrate_get_current(); static int64_t start_time; @@ -386,33 +571,46 @@ static void migration_bitmap_sync(void) static int64_t num_dirty_pages_period; int64_t end_time; int64_t bytes_xfer_now; + int64_t begin_time; + int64_t dirty_time; if (!bytes_xfer_prev) { bytes_xfer_prev = ram_bytes_transferred(); } + begin_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); if (!start_time) { start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); } - trace_migration_bitmap_sync_start(); address_space_sync_dirty_bitmap(&address_space_memory); + dirty_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + QTAILQ_FOREACH(block, &ram_list.blocks, next) { - for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) { - if (memory_region_test_and_clear_dirty(block->mr, - addr, TARGET_PAGE_SIZE, - DIRTY_MEMORY_MIGRATION)) { - migration_bitmap_set_dirty(block->mr, addr); - } + if (!strcmp(block->idstr, "pc.ram") && nb_bitmap_workers) { + migration_bitmap_distribute_work(s, block); + continue; } + migration_dirty_pages += migration_bitmap_sync_range(block, 0, block->length); } + + if (nb_bitmap_workers) { + int core; + for (core = 0; core < nb_bitmap_workers; core++) { + 
migration_bitmap_join_worker(s, core); + } + } + trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init; end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - /* more than 1 second = 1000 millisecons */ + acct_info.log_dirty_time += dirty_time - begin_time; + acct_info.migration_bitmap_time += end_time - dirty_time; + + /* more than 1 second = 1000 milliseconds */ if (end_time > start_time + 1000) { if (migrate_auto_converge()) { /* The following detection logic can be refined later. For now: diff --git a/include/migration/migration.h b/include/migration/migration.h index 140e6b4..3ffc433 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -45,6 +45,10 @@ struct MigrationState int64_t total_time; int64_t downtime; int64_t expected_downtime; + int64_t xmit_time; + int64_t ram_copy_time; + int64_t log_dirty_time; + int64_t bitmap_time; int64_t dirty_pages_rate; int64_t dirty_bytes_rate; bool enabled_capabilities[MIGRATION_CAPABILITY_MAX]; @@ -109,10 +113,16 @@ uint64_t skipped_mig_bytes_transferred(void); uint64_t skipped_mig_pages_transferred(void); uint64_t norm_mig_bytes_transferred(void); uint64_t norm_mig_pages_transferred(void); +uint64_t norm_mig_log_dirty_time(void); +uint64_t norm_mig_bitmap_time(void); uint64_t xbzrle_mig_bytes_transferred(void); uint64_t xbzrle_mig_pages_transferred(void); uint64_t xbzrle_mig_pages_overflow(void); uint64_t xbzrle_mig_pages_cache_miss(void); +void *migration_bitmap_worker(void *opaque); +void migration_bitmap_worker_start(MigrationState *s); +void migration_bitmap_worker_stop(MigrationState *s); +void migrate_set_state(MigrationState *s, int old_state, int new_state); void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); diff --git a/include/qemu-common.h b/include/qemu-common.h index 5054836..936dc02 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -478,4 +478,16 @@ 
size_t buffer_find_nonzero_offset(const void *buf, size_t len); */ int parse_debug_env(const char *name, int max, int initial); +/* + * Headers to get number of host processors. + */ +int getNumCores(void); +#if defined(WIN32) +#include <windows.h> +#elif defined(CONFIG_BSD) +#include <sys/param.h> +#include <sys/sysctl.h> +#elif defined(CONFIG_LINUX) +#include <unistd.h> +#endif #endif diff --git a/qapi-schema.json b/qapi-schema.json index 60f3fd1..aac0894 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -577,6 +577,36 @@ 'cache-miss': 'int', 'overflow': 'int' } } ## +# @MCStats +# +# Detailed Micro Checkpointing (MC) statistics +# +# @mbps: throughput of transmitting last MC +# +# @xmit-time: milliseconds to transmit last MC +# +# @log-dirty-time: milliseconds to GET_LOG_DIRTY for last MC +# +# @migration-bitmap-time: milliseconds to prepare dirty bitmap for last MC +# +# @ram-copy-time: milliseconds to ram_save_live() last MC to staging memory +# +# @copy-mbps: throughput of ram_save_live() to staging memory for last MC +# +# @checkpoints: cumulative total number of MCs generated +# +# Since: 1.8 +## +{ 'type': 'MCStats', + 'data': {'mbps': 'number', + 'xmit-time': 'uint64', + 'log-dirty-time': 'uint64', + 'migration-bitmap-time': 'uint64', + 'ram-copy-time': 'uint64', + 'checkpoints' : 'uint64', + 'copy-mbps': 'number' }} + +## # @MigrationInfo # # Information about current migration process. @@ -622,6 +652,7 @@ 'data': {'*status': 'str', '*ram': 'MigrationStats', '*disk': 'MigrationStats', '*xbzrle-cache': 'XBZRLECacheStats', + '*mc': 'MCStats', '*total-time': 'int', '*expected-downtime': 'int', '*downtime': 'int', @@ -661,10 +692,50 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @x-mc: The migration will never end, and the VM will instead be continuously +# micro-checkpointed (MC). 
Use the command migrate-set-mc-delay to +# control the frequency at which the checkpoints occur. +# Disabled by default. (Since 1.8) +# +# @mc-net-disable: Deactivate network buffering against outbound network +# traffic while Micro-Checkpointing (@x-mc) is active. +# Enabled by default. Disabling will make the MC protocol inconsistent +# and potentially break network connections upon an actual failure. +# Only for performance testing. (Since 1.8) +# +# @mc-rdma-copy: MC requires creating a local-memory checkpoint before +# transmission to the destination. This requires heavy use of +# memcpy() which dominates the processor pipeline. This option +# makes use of *local* RDMA to perform the copy instead of the CPU. +# Enabled by default only if the migration transport is RDMA. +# Disabled by default otherwise. (Since 1.8) +# +# @bitworkers: Allow the QEMU migration bitmap to be scanned in parallel +# by using multiple processors on the host machine. +# This capability has no effect without also enabling @x-mc. +# Enabled by default. (Since 1.8) +# +# @rdma-keepalive: RDMA connections do not time out by themselves if a peer +# has disconnected prematurely or failed. User-level keepalives +# allow the migration to abort cleanly if there is a problem with the +# destination host. For debugging, this can be problematic as +# the keepalive may cause the peer to abort prematurely if we are +# at a GDB breakpoint, for example. +# Enabled by default. 
(Since 1.8) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', + 'x-rdma-pin-all', + 'auto-converge', + 'zero-blocks', + 'x-mc', + 'mc-net-disable', + 'mc-rdma-copy', + 'bitworkers', + 'rdma-keepalive' + ] } ## # @MigrationCapabilityStatus diff --git a/vl.c b/vl.c index b42ac67..e2ba2e8 100644 --- a/vl.c +++ b/vl.c @@ -2818,6 +2818,39 @@ static int object_create(QemuOpts *opts, void *opaque) return 0; } +/* + * Currently, only used for migration_bitmap_sync(), + * but can be queried by anyone in the future. + */ +int getNumCores(void) +{ + uint32_t count; +#if defined(WIN32) + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + count = sysinfo.dwNumberOfProcessors; +#elif defined(CONFIG_BSD) + int nm[2]; + size_t len = 4; + nm[0] = CTL_HW; + nm[1] = HW_AVAILCPU; + sysctl(nm, 2, &count, &len, NULL, 0); + + if (count < 1) { + nm[1] = HW_NCPU; + sysctl(nm, 2, &count, &len, NULL, 0); + if(count < 1) { + count = 1; + } + } +#elif defined(CONFIG_LINUX) + count = sysconf(_SC_NPROCESSORS_ONLN); +#else + count = 1; +#endif + return count; +} + int main(int argc, char **argv, char **envp) { int i;

[RFC,v1:,03/12] migration: introduce parallelization of migration_bitmap

Commit Message

Patch