@@ -3190,6 +3190,8 @@ static int postcopy_start(MigrationState *ms)
MIGRATION_STATUS_FAILED);
}
+ trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
+
return ret;
fail_closefb:
@@ -68,7 +68,7 @@ typedef struct {
struct MigrationIncomingState {
QEMUFile *from_src_file;
/* Previously received RAM's RAMBlock pointer */
- RAMBlock *last_recv_block;
+ RAMBlock *last_recv_block[RAM_CHANNEL_MAX];
/* A hook to allow cleanup at the end of incoming migration */
void *transport_data;
void (*transport_cleanup)(void *data);
@@ -296,6 +296,20 @@ struct RAMSrcPageRequest {
QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
+typedef struct {
+ /*
+ * Cached ramblock/offset values if preempted. They're only meaningful if
+ * preempted==true below.
+ */
+ RAMBlock *ram_block;
+ unsigned long ram_page;
+ /*
+ * Whether a postcopy preemption just happened.  Will be reset after precopy
+ * recovers to background migration.
+ */
+ bool preempted;
+} PostcopyPreemptState;
+
/* State of RAM for migration */
struct RAMState {
/* QEMUFile used for this migration */
@@ -350,6 +364,14 @@ struct RAMState {
/* Queue of outstanding page requests from the destination */
QemuMutex src_page_req_mutex;
QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
+
+ /* Postcopy preemption information */
+ PostcopyPreemptState postcopy_preempt_state;
+ /*
+ * Current channel we're using on src VM. Only valid if postcopy-preempt
+ * is enabled.
+ */
+ unsigned int postcopy_channel;
};
typedef struct RAMState RAMState;
@@ -357,6 +379,11 @@ static RAMState *ram_state;
static NotifierWithReturnList precopy_notifier_list;
+static void postcopy_preempt_reset(RAMState *rs)
+{
+ memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
+}
+
/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
@@ -1947,6 +1974,55 @@ void ram_write_tracking_stop(void)
}
#endif /* defined(__linux__) */
+/*
+ * Check whether two addresses/offsets within the ramblock fall on the same
+ * host huge page.  Returns true if so, false otherwise.
+ */
+static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
+ uint64_t addr2)
+{
+ size_t page_size = qemu_ram_pagesize(rb);
+
+ addr1 = ROUND_DOWN(addr1, page_size);
+ addr2 = ROUND_DOWN(addr2, page_size);
+
+ return addr1 == addr2;
+}
+
+/*
+ * Does a previously preempted precopy huge page contain the currently
+ * requested page?  Returns true if so, false otherwise.
+ *
+ * This should happen very rarely, because it means that while sending pages
+ * in the background during postcopy we were sending exactly the page that
+ * some vcpu faulted on on the dest node.  When it happens, we probably don't
+ * need to do much except drop the request, because we know that right after
+ * we restore the precopy stream it will be serviced.  It slightly affects the
+ * order in which postcopy requests are serviced (i.e. it's as if we moved the
+ * current request to the end of the queue), but that shouldn't be a big deal.
+ * The most important thing is that we must _never_ try to send a
+ * partially-sent huge page on the POSTCOPY channel again, otherwise that huge
+ * page will get "split brain" across the two channels (PRECOPY, POSTCOPY).
+ */
+static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
+ ram_addr_t offset)
+{
+ PostcopyPreemptState *state = &rs->postcopy_preempt_state;
+
+ /* No preemption at all? */
+ if (!state->preempted) {
+ return false;
+ }
+
+ /* Not even the same ramblock? */
+ if (state->ram_block != block) {
+ return false;
+ }
+
+ return offset_on_same_huge_page(block, offset,
+ state->ram_page << TARGET_PAGE_BITS);
+}
+
/**
* get_queued_page: unqueue a page from the postcopy requests
*
@@ -1962,9 +2038,17 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
RAMBlock *block;
ram_addr_t offset;
+again:
block = unqueue_page(rs, &offset);
- if (!block) {
+ if (block) {
+ /* See comment above postcopy_preempted_contains() */
+ if (postcopy_preempted_contains(rs, block, offset)) {
+ trace_postcopy_preempt_hit(block->idstr, offset);
+ /* This request is dropped */
+ goto again;
+ }
+ } else {
/*
* Poll write faults too if background snapshot is enabled; that's
* when we have vcpus got blocked by the write protected pages.
@@ -2180,6 +2264,117 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
return ram_save_page(rs, pss);
}
+static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
+{
+ /* Eager preempt not enabled?  Then never do that. */
+ if (!migrate_postcopy_preempt()) {
+ return false;
+ }
+
+ /* If the ramblock we're sending only uses small pages, never bother. */
+ if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
+ return false;
+ }
+
+ /* Not in postcopy at all? */
+ if (!migration_in_postcopy()) {
+ return false;
+ }
+
+ /*
+ * If we're already handling a postcopy request, don't preempt as this page
+ * already has the same high priority.
+ */
+ if (pss->postcopy_requested) {
+ return false;
+ }
+
+ /* Finally, preempt only if there are pending postcopy requests. */
+ return postcopy_has_request(rs);
+}
+
+/* Preempt precopy: record the current PSS so that it can be restored later */
+static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
+{
+ PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
+
+ trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
+
+ /*
+ * Time to preempt precopy.  Cache the current PSS into the preempt state,
+ * so that after handling the postcopy pages we can recover to it.  We need
+ * to do so because the dest VM will have part of the precopy huge page kept
+ * over in its tmp huge page cache; better to move on with it when we can.
+ */
+ p_state->ram_block = pss->block;
+ p_state->ram_page = pss->page;
+ p_state->preempted = true;
+}
+
+/* Whether we were preempted by a postcopy request while sending a huge page */
+static bool postcopy_preempt_triggered(RAMState *rs)
+{
+ return rs->postcopy_preempt_state.preempted;
+}
+
+static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss)
+{
+ PostcopyPreemptState *state = &rs->postcopy_preempt_state;
+
+ assert(state->preempted);
+
+ pss->block = state->ram_block;
+ pss->page = state->ram_page;
+ /* This is not a postcopy request but restoring the previous precopy page */
+ pss->postcopy_requested = false;
+
+ trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
+
+ /* Reset the preempt state; most importantly, set preempted==false */
+ postcopy_preempt_reset(rs);
+}
+
+static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
+{
+ MigrationState *s = migrate_get_current();
+ unsigned int channel;
+ QEMUFile *next;
+
+ channel = pss->postcopy_requested ?
+ RAM_CHANNEL_POSTCOPY : RAM_CHANNEL_PRECOPY;
+
+ if (channel != rs->postcopy_channel) {
+ if (channel == RAM_CHANNEL_PRECOPY) {
+ next = s->to_dst_file;
+ } else {
+ next = s->postcopy_qemufile_src;
+ }
+ /* Update and cache the current channel */
+ rs->f = next;
+ rs->postcopy_channel = channel;
+
+ /*
+ * If the channel switched, reset last_sent_block since the old sent block
+ * may not be on the same channel.
+ */
+ rs->last_sent_block = NULL;
+
+ trace_postcopy_preempt_switch_channel(channel);
+ }
+
+ trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
+}
+
+/* We need to make sure rs->f always points to the default channel elsewhere */
+static void postcopy_preempt_reset_channel(RAMState *rs)
+{
+ if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+ rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
+ rs->f = migrate_get_current()->to_dst_file;
+ trace_postcopy_preempt_reset_channel();
+ }
+}
+
/**
* ram_save_host_page: save a whole host page
*
@@ -2211,7 +2406,16 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
return 0;
}
+ if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+ postcopy_preempt_choose_channel(rs, pss);
+ }
+
do {
+ if (postcopy_needs_preempt(rs, pss)) {
+ postcopy_do_preempt(rs, pss);
+ break;
+ }
+
/* Check the pages is dirty and if it is send it */
if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
tmppages = ram_save_target_page(rs, pss);
@@ -2235,6 +2439,19 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
/* The offset we leave with is the min boundary of host page and block */
pss->page = MIN(pss->page, hostpage_boundary);
+ /*
+ * When in postcopy preempt mode, flush the data as soon as possible for
+ * postcopy requests, because we've already sent a whole huge page, so the
+ * dst node should already have enough resources to atomically fill in the
+ * current missing page.
+ *
+ * More importantly, when using a separate postcopy channel, we must do an
+ * explicit flush or it won't flush until the buffer is full.
+ */
+ if (migrate_postcopy_preempt() && pss->postcopy_requested) {
+ qemu_fflush(rs->f);
+ }
+
res = ram_save_release_protection(rs, pss, start_page);
return (res < 0 ? res : pages);
}
@@ -2276,8 +2493,17 @@ static int ram_find_and_save_block(RAMState *rs)
found = get_queued_page(rs, &pss);
if (!found) {
- /* priority queue empty, so just search for something dirty */
- found = find_dirty_block(rs, &pss, &again);
+ /*
+ * Recover previous precopy ramblock/offset if postcopy has
+ * preempted precopy. Otherwise find the next dirty bit.
+ */
+ if (postcopy_preempt_triggered(rs)) {
+ postcopy_preempt_restore(rs, &pss);
+ found = true;
+ } else {
+ /* priority queue empty, so just search for something dirty */
+ found = find_dirty_block(rs, &pss, &again);
+ }
}
if (found) {
@@ -2405,6 +2631,8 @@ static void ram_state_reset(RAMState *rs)
rs->last_page = 0;
rs->last_version = ram_list.version;
rs->xbzrle_enabled = false;
+ postcopy_preempt_reset(rs);
+ rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */
@@ -3048,6 +3276,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
}
qemu_mutex_unlock(&rs->bitmap_mutex);
+ postcopy_preempt_reset_channel(rs);
+
/*
* Must occur before EOS (or any QEMUFile operation)
* because of RDMA protocol.
@@ -3125,6 +3355,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
return ret;
}
+ postcopy_preempt_reset_channel(rs);
+
ret = multifd_send_sync_main(rs->f);
if (ret < 0) {
return ret;
@@ -3209,11 +3441,13 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
* @mis: the migration incoming state pointer
* @f: QEMUFile where to read the data from
* @flags: Page flags (mostly to see if it's a continuation of previous block)
+ * @channel: the channel we're using
*/
static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
- QEMUFile *f, int flags)
+ QEMUFile *f, int flags,
+ int channel)
{
- RAMBlock *block = mis->last_recv_block;
+ RAMBlock *block = mis->last_recv_block[channel];
char id[256];
uint8_t len;
@@ -3240,7 +3474,7 @@ static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
return NULL;
}
- mis->last_recv_block = block;
+ mis->last_recv_block[channel] = block;
return block;
}
@@ -3694,7 +3928,7 @@ int ram_load_postcopy(QEMUFile *f, int channel)
trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
RAM_SAVE_FLAG_COMPRESS_PAGE)) {
- block = ram_block_from_stream(mis, f, flags);
+ block = ram_block_from_stream(mis, f, flags, channel);
if (!block) {
ret = -EINVAL;
break;
@@ -3945,7 +4179,8 @@ static int ram_load_precopy(QEMUFile *f)
if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
- RAMBlock *block = ram_block_from_stream(mis, f, flags);
+ RAMBlock *block = ram_block_from_stream(mis, f, flags,
+ RAM_CHANNEL_PRECOPY);
host = host_from_ram_block_offset(block, addr);
/*
@@ -111,6 +111,12 @@ ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRI
ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
unqueue_page(char *block, uint64_t offset, bool dirty) "ramblock '%s' offset 0x%"PRIx64" dirty %d"
+postcopy_preempt_triggered(char *str, unsigned long page) "during sending ramblock %s offset 0x%lx"
+postcopy_preempt_restored(char *str, unsigned long page) "ramblock %s offset 0x%lx"
+postcopy_preempt_hit(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
+postcopy_preempt_send_host_page(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
+postcopy_preempt_switch_channel(int channel) "%d"
+postcopy_preempt_reset_channel(void) ""
# multifd.c
multifd_new_send_channel_async(uint8_t id) "channel %u"
@@ -176,6 +182,7 @@ migration_thread_low_pending(uint64_t pending) "%" PRIu64
migrate_transferred(uint64_t tranferred, uint64_t time_spent, uint64_t bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " max_size %" PRId64
process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
process_incoming_migration_co_postcopy_end_main(void) ""
+postcopy_preempt_enabled(bool value) "%d"
# channel.c
migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"