diff mbox

[25/46] Postcopy: Maintain sentmap during postcopy pre phase

Message ID 1404495717-4239-26-git-send-email-dgilbert@redhat.com
State New
Headers show

Commit Message

Dr. David Alan Gilbert July 4, 2014, 5:41 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Where postcopy is preceded by a period of precopy, the destination will
have received pages that may have been dirtied on the source after the
page was sent.  The destination must throw these pages away before
starting its CPUs.

Maintain a 'sentmap' of pages that have already been sent.
Calculate list of sent & dirty pages
Provide helpers on the destination side to discard these.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 arch_init.c                      | 162 ++++++++++++++++++++++++++++++++++++++-
 include/migration/migration.h    |   5 ++
 include/migration/postcopy-ram.h |  20 +++++
 migration.c                      |   2 +
 postcopy-ram.c                   | 156 +++++++++++++++++++++++++++++++++++++
 savevm.c                         |   3 -
 6 files changed, 342 insertions(+), 6 deletions(-)
diff mbox

Patch

diff --git a/arch_init.c b/arch_init.c
index aeeaf37..134ea7e 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -40,6 +40,7 @@ 
 #include "hw/audio/audio.h"
 #include "sysemu/kvm.h"
 #include "migration/migration.h"
+#include "migration/postcopy-ram.h"
 #include "hw/i386/smbios.h"
 #include "exec/address-spaces.h"
 #include "hw/audio/pcspk.h"
@@ -413,9 +414,15 @@  static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
     return bytes_sent;
 }
 
+/* mr: The region to search for dirty pages in
+ * start: Start address (typically so we can continue from previous page)
+ * bitoffset: Pointer into which to store the offset into the dirty map
+ *            at which the bit was found.
+ */
 static inline
 ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
-                                                 ram_addr_t start)
+                                                 ram_addr_t start,
+                                                 unsigned long *bitoffset)
 {
     unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
@@ -434,6 +441,7 @@  ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
         clear_bit(next, migration_bitmap);
         migration_dirty_pages--;
     }
+    *bitoffset = next;
     return (next - base) << TARGET_PAGE_BITS;
 }
 
@@ -562,6 +570,19 @@  static void migration_bitmap_sync(void)
     }
 }
 
+/* Look up a RAMBlock by its idstr; NULL when no block carries that name */
+static RAMBlock *ram_find_block(const char *id)
+{
+    RAMBlock *rb;
+    QTAILQ_FOREACH(rb, &ram_list.blocks, next) {
+        if (strcmp(rb->idstr, id) == 0) {
+            return rb;
+        }
+    }
+
+    return NULL;
+}
+
 /*
  * ram_save_page: Send the given page to the stream
  *
@@ -650,13 +671,14 @@  static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
     bool complete_round = false;
     int bytes_sent = 0;
     MemoryRegion *mr;
+    unsigned long bitoffset;
 
     if (!block)
         block = QTAILQ_FIRST(&ram_list.blocks);
 
     while (true) {
         mr = block->mr;
-        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
+        offset = migration_bitmap_find_and_reset_dirty(mr, offset, &bitoffset);
         if (complete_round && block == last_seen_block &&
             offset >= last_offset) {
             break;
@@ -674,6 +696,11 @@  static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
 
             /* if page is unmodified, continue to the next */
             if (bytes_sent > 0) {
+                MigrationState *s = migrate_get_current();
+                if (s->sentmap) {
+                    set_bit(bitoffset, s->sentmap);
+                }
+
                 last_sent_block = block;
                 break;
             }
@@ -733,12 +760,19 @@  void free_xbzrle_decoded_buf(void)
 
 static void migration_end(void)
 {
+    MigrationState *s = migrate_get_current();
+
     if (migration_bitmap) {
         memory_global_dirty_log_stop();
         g_free(migration_bitmap);
         migration_bitmap = NULL;
     }
 
+    if (s->sentmap) {
+        g_free(s->sentmap);
+        s->sentmap = NULL;
+    }
+
     XBZRLE_cache_lock();
     if (XBZRLE.cache) {
         cache_fini(XBZRLE.cache);
@@ -806,6 +840,123 @@  void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
     }
 }
 
+/*
+ * Outgoing-postcopy helper: re-sync the migration (dirty) bitmap, then
+ * reduce the sentmap in place with sentmap &= migration_bitmap.
+ * Returns the number of bits (target pages) covered by the AND.
+ */
+int64_t ram_mask_postcopy_bitmap(MigrationState *ms)
+{
+    int64_t nbits = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    migration_bitmap_sync();
+    bitmap_and(ms->sentmap, ms->sentmap, migration_bitmap, nbits);
+    return nbits;
+}
+
+/*
+ * Utility for the outgoing postcopy code.
+ *   Walks every RAMBlock and hands postcopy_send_discard_bm_ram the
+ *   block's name plus the first and last bitmap index it occupies.
+ * Returns: 0 on success, otherwise the first non-zero callback result
+ * (qemu_ram_foreach_block ends up passing unscaled lengths
+ *  which would mean postcopy code would have to deal with target page)
+ */
+int ram_postcopy_each_ram_discard(MigrationState *ms)
+{
+    struct RAMBlock *rb;
+
+    QTAILQ_FOREACH(rb, &ram_list.blocks, next) {
+        /*
+         * Postcopy sends chunks of bitmap over the wire, but it
+         * just needs indexes at this point, avoids it having
+         * target page specific code.
+         */
+        ram_addr_t last_byte = rb->offset + (rb->length - 1);
+        unsigned long first = rb->offset >> TARGET_PAGE_BITS;
+        unsigned long last = last_byte >> TARGET_PAGE_BITS;
+        int ret = postcopy_send_discard_bm_ram(ms, rb->idstr, first, last);
+
+        if (ret) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * At the start of the postcopy phase of migration, any now-dirty
+ * precopied pages are discarded.
+ *
+ * start..end is an inclusive range of bits indexed in the source
+ *    VM's bitmap for this RAMBlock, source_target_page_bits tells
+ *    us what one of those bits represents.
+ *
+ * start/end are offsets from the start of the bitmap for RAMBlock 'block_name'
+ *
+ * Returns 0 on success, non-zero (-1 from the checks here) on failure.
+ */
+int ram_discard_range(MigrationIncomingState *mis,
+                      const char *block_name,
+                      int source_target_page_bits,
+                      uint64_t start, uint64_t end)
+{
+    unsigned int bitdif;
+    RAMBlock *rb = ram_find_block(block_name);
+
+    assert(end >= start);
+
+    if (!rb) {
+        error_report("ram_discard_range: Failed to find block '%s'",
+                     block_name);
+        return -1;
+    }
+
+    if (source_target_page_bits < TARGET_PAGE_BITS) {
+        /*
+         * e.g. source is 4K and we're 64k - we have to discard on the
+         * larger boundary.  start and end are both inclusive indexes, so
+         * each maps to the local page containing it: a plain shift
+         * rounds start down and takes end to the last page touched,
+         * e.g. a range of 70K...132K discards 64K..192K.
+         */
+        bitdif = TARGET_PAGE_BITS - source_target_page_bits;
+        start >>= bitdif;
+        end >>= bitdif;
+    } else if (source_target_page_bits > TARGET_PAGE_BITS) {
+        /*
+         * e.g. source is 64K and we're 4K - scale the indexes; end is
+         * inclusive so it must become the last local page lying inside
+         * the final source page, not the first one.
+         */
+        bitdif = source_target_page_bits - TARGET_PAGE_BITS;
+        start <<= bitdif;
+        end = ((end + 1) << bitdif) - 1;
+    }
+
+    /* +1 gives the byte after the end of the last page to be discarded */
+    ram_addr_t end_offset = (end + 1) << TARGET_PAGE_BITS;
+
+    /*
+     * Reject a range that runs off the block before any state is changed
+     */
+    if (end_offset > rb->length) {
+        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
+                     "/%" PRIu64 "/%" PRIu64 ")",
+                     block_name, start, end, (uint64_t)rb->length);
+        return -1;
+    }
+
+    uint64_t index_offset = rb->offset >> TARGET_PAGE_BITS;
+    postcopy_pmi_discard_range(mis, start + index_offset, (end - start) + 1);
+
+    uint8_t *host_startaddr = rb->host + (start << TARGET_PAGE_BITS);
+    uint8_t *host_endaddr = rb->host + (end_offset - 1);
+
+    return postcopy_ram_discard_range(mis, host_startaddr, host_endaddr);
+}
+
 static int ram_save_setup(QEMUFile *f, void *opaque)
 {
     RAMBlock *block;
@@ -844,7 +995,6 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
 
         acct_clear();
     }
-
     qemu_mutex_lock_iothread();
     qemu_mutex_lock_ramlist();
     bytes_transferred = 0;
@@ -854,6 +1004,12 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap = bitmap_new(ram_bitmap_pages);
     bitmap_set(migration_bitmap, 0, ram_bitmap_pages);
 
+    if (migrate_postcopy_ram()) {
+        MigrationState *s = migrate_get_current();
+        s->sentmap = bitmap_new(ram_bitmap_pages);
+        bitmap_clear(s->sentmap, 0, ram_bitmap_pages);
+    }
+
     /*
      * Count the total number of pages used by ram blocks not including any
      * gaps due to alignment or unplugs.
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 71442d8..2289254 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -171,6 +171,11 @@  double xbzrle_mig_cache_miss_rate(void);
 
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
 void ram_debug_dump_bitmap(unsigned long *todump, bool expected);
+int64_t ram_mask_postcopy_bitmap(MigrationState *ms);
+int ram_postcopy_each_ram_discard(MigrationState *ms);
+int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
+                      int source_target_page_bits,
+                      uint64_t start, uint64_t end);
 
 /**
  * @migrate_add_blocker - prevent migration from proceeding
diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
index dcd1afa..fe89a3c 100644
--- a/include/migration/postcopy-ram.h
+++ b/include/migration/postcopy-ram.h
@@ -13,7 +13,27 @@ 
 #ifndef QEMU_POSTCOPY_RAM_H
 #define QEMU_POSTCOPY_RAM_H
 
+#include "migration/migration.h"
+
 /* Return 0 if the host supports everything we need to do postcopy-ram */
 int postcopy_ram_hosttest(void);
 
+/* Send the list of sent-but-dirty pages */
+int postcopy_send_discard_bitmap(MigrationState *ms);
+
+/*
+ * Discard the contents of memory start..end inclusive.
+ * We can assume that if we've been called postcopy_ram_hosttest returned true
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               uint8_t *end);
+
+
+/*
+ * Called back from arch_init's ram_postcopy_each_ram_discard to handle
+ * discarding one RAMBlock's pre-postcopy dirty pages
+ */
+int postcopy_send_discard_bm_ram(MigrationState *ms, const char *name,
+                                 unsigned long start, unsigned long end);
+
 #endif
diff --git a/migration.c b/migration.c
index d9a9e5b..ca0fd7b 100644
--- a/migration.c
+++ b/migration.c
@@ -22,6 +22,7 @@ 
 #include "block/block.h"
 #include "qemu/sockets.h"
 #include "migration/block.h"
+#include "migration/postcopy-ram.h"
 #include "qemu/thread.h"
 #include "qmp-commands.h"
 #include "trace.h"
@@ -928,6 +929,7 @@  static void *migration_thread(void *opaque)
             } else {
                 int ret;
 
+                DPRINTF("done iterating\n");
                 qemu_mutex_lock_iothread();
                 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
                 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
diff --git a/postcopy-ram.c b/postcopy-ram.c
index 1f3e6ea..ff6bdd6 100644
--- a/postcopy-ram.c
+++ b/postcopy-ram.c
@@ -23,6 +23,7 @@ 
 #include "qemu-common.h"
 #include "migration/migration.h"
 #include "migration/postcopy-ram.h"
+#include "sysemu/sysemu.h"
 
 //#define DEBUG_POSTCOPY
 
@@ -116,6 +117,21 @@  int postcopy_ram_hosttest(void)
     return 0;
 }
 
+/*
+ * Discard host memory start..end (inclusive) via madvise(MADV_DONTNEED).
+ * Callers are assumed to have passed postcopy_ram_hosttest; 0 ok, -1 error
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               uint8_t *end)
+{
+    size_t length = (end - start) + 1;
+    if (madvise(start, length, MADV_DONTNEED) == 0) {
+        return 0;
+    }
+    perror("postcopy_ram_discard_range MADV_DONTNEED");
+    return -1;
+}
+
 #else
 /* No target OS support, stubs just fail */
 
@@ -125,5 +141,145 @@  int postcopy_ram_hosttest(void)
     return -1;
 }
 
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               uint8_t *end)
+{
+    error_report("postcopy_ram_discard_range: No OS support");
+    return -1;
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Fetch the 64 sentmap bits beginning at bit 'start' (a multiple of 64 in
+ * the visible caller) as one word, whatever the host's long size is.
+ *   check2nd32: 32bit-long hosts only - true if the second long may be
+ *               read; otherwise the upper 32 bits are returned as 0
+ */
+static uint64_t get_64bits_sentmap(unsigned long *sentmap, bool check2nd32,
+                                   int64_t start)
+{
+    uint64_t result;
+#if HOST_LONG_BITS == 64
+    result = sentmap[start / 64];
+#elif HOST_LONG_BITS == 32
+    /*
+     * Irrespective of host endianness, sentmap[n] is for pages earlier
+     * than sentmap[n+1] so we can't just cast up
+     */
+    uint32_t sm0, sm1;
+    sm0 = sentmap[start / 32];
+    sm1 = check2nd32 ? sentmap[(start / 32) + 1] : 0;
+    result = sm0 | ((uint64_t)sm1) << 32;
+#else
+#error "Host long other than 64/32 not supported"
+#endif
+
+    return result;
+}
+
+/*
+ * Callback from ram_postcopy_each_ram_discard for each RAMBlock
+ * start,end: Indexes into the bitmap for the first and last bit
+ *            representing the named block (both inclusive); returns 0
+ */
+int postcopy_send_discard_bm_ram(MigrationState *ms, const char *name,
+                                 unsigned long start, unsigned long end)
+{
+    /* Keeps command under 256 bytes - but arbitrary */
+    const unsigned int max_entries_per_command = 12;
+    uint16_t cur_entry;
+    uint64_t buffer[2*max_entries_per_command];
+    unsigned int nsentwords = 0;
+    unsigned int nsentcmds = 0;
+
+    /*
+     * There is no guarantee that start, end are on convenient 64bit multiples
+     * (We always send 64bit chunks over the wire, irrespective of long size)
+     */
+    unsigned long first64, last64, cur64;
+    first64 = start / 64;
+    last64 = end / 64;
+
+    cur_entry = 0;
+    for (cur64 = first64; cur64 <= last64; cur64++) {
+        /* Deal with start/end not on alignment */
+        uint64_t mask;
+        mask = ~(uint64_t)0;
+
+        if ((cur64 == first64) && (start & 63)) {
+            /* e.g. (start & 63) = 3
+             *         1 << .    -> 2^3
+             *         . - 1     -> 2^3 - 1 i.e. mask 2..0
+             *         ~.        -> mask 63..3
+             */
+            mask &= ~((((uint64_t)1) << (start & 63)) - 1);
+        }
+
+        if ((cur64 == last64) && ((end & 63) != 63)) {
+            /* e.g. (end & 63) = 3
+             *            .   +1 -> 4
+             *         1 << .    -> 2^4
+             *         . -1      -> 2^4 - 1
+             *                   = mask set 3..0
+             */
+            mask &= (((uint64_t)1) << ((end & 63) + 1)) - 1;
+        }
+
+        uint64_t data = get_64bits_sentmap(ms->sentmap, (cur64 != last64) ||
+                                           ((end & 63) >= 32), cur64 * 64);
+        data &= mask;
+
+        if (data) {
+            cpu_to_be64w(buffer+2*cur_entry, (cur64-first64));
+            cpu_to_be64w(buffer+1+2*cur_entry, data);
+            cur_entry++;
+            nsentwords++;
+
+            if (cur_entry == max_entries_per_command) {
+                /* Full set, ship it! */
+                qemu_savevm_send_postcopy_ram_discard(ms->file, name,
+                                                      cur_entry,
+                                                      start & 63,
+                                                      buffer);
+                nsentcmds++;
+                cur_entry = 0;
+            }
+        }
+    }
+
+    /* Anything unsent? */
+    if (cur_entry) {
+        qemu_savevm_send_postcopy_ram_discard(ms->file, name, cur_entry,
+                                              start & 63, buffer);
+        nsentcmds++;
+    }
+
+    /*fprintf(stderr, "postcopy_send_discard_bm_ram: '%s' mask words"
+                      " sent=%d in %d commands.\n",
+            name, nsentwords, nsentcmds);*/
+
+    return 0;
+}
+
+/*
+ * Transmit the set of pages to be discarded after precopy to the target
+ * these are pages that have been sent previously but have been dirtied
+ * Hopefully this is pretty sparse
+ */
+int postcopy_send_discard_bitmap(MigrationState *ms)
+{
+    /*
+     * Update the sentmap to be  sentmap&=dirty
+     * (arch_init gives us the full size as a return)
+     */
+    ram_mask_postcopy_bitmap(ms);
+
+    DPRINTF("Dumping merged sentmap");
+#ifdef DEBUG_POSTCOPY
+    ram_debug_dump_bitmap(ms->sentmap, false);
 #endif
 
+    return ram_postcopy_each_ram_discard(ms);
+}
+
diff --git a/savevm.c b/savevm.c
index a2c5fc8..1d5375c 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1238,12 +1238,9 @@  static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
              * we know there must be at least 1 bit set due to the loop entry
              * If there is no 0 firstzero will be 64
              */
-            /* TODO - ram_discard_range gets added in a later patch
             int ret = ram_discard_range(mis, ramid, source_target_page_bits,
                                 startaddr + firstset - first_bit_offset,
                                 startaddr + (firstzero - 1) - first_bit_offset);
-             */
-            ret = -1; /* TODO */
             if (ret) {
                 return ret;
             }