diff mbox

[v2,35/41] postcopy: introduce helper functions for postcopy

Message ID 77d2fc4b1c99e1520a7fbe873e73119ce9463592.1338802192.git.yamahata@valinux.co.jp
State New
Headers show

Commit Message

Isaku Yamahata June 4, 2012, 9:57 a.m. UTC
This patch introduces helper functions for postcopy to access the
umem char device and to communicate between incoming-qemu and umemd.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
---
changes v1 -> v2:
- code simplification
- make fault trigger more robust
- introduce struct umem_pages
---
 umem.c |  364 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 umem.h |  101 ++++++++++++++++++
 2 files changed, 465 insertions(+), 0 deletions(-)
 create mode 100644 umem.c
 create mode 100644 umem.h

Comments

Juan Quintela June 14, 2012, 9:34 p.m. UTC | #1
Isaku Yamahata <yamahata@valinux.co.jp> wrote:
> +//#define DEBUG_UMEM
> +#ifdef DEBUG_UMEM
> +#include <sys/syscall.h>
> +#define DPRINTF(format, ...)                                            \
> +    do {                                                                \
> +        printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid),    \
> +               __func__, __LINE__, ## __VA_ARGS__);                     \
> +    } while (0)

This should be in a header file that is linux specific?  And (at least
on my systems) gettid is already defined on glibc.


> +#else
> +#define DPRINTF(format, ...)    do { } while (0)
> +#endif


> +
> +#define DEV_UMEM        "/dev/umem"
> +
> +UMem *umem_new(void *hostp, size_t size)
> +{
> +    struct umem_init uinit = {
> +        .size = size,
> +    };
> +    UMem *umem;
> +
> +    assert((size % getpagesize()) == 0);
> +    umem = g_new(UMem, 1);
> +    umem->fd = open(DEV_UMEM, O_RDWR);
> +    if (umem->fd < 0) {
> +        perror("can't open "DEV_UMEM);
> +        abort();

Can we return an error instead of calling abort()?  The same goes for the
rest of the aborts in this file.


> +size_t umem_pages_size(uint64_t nr)
> +{
> +    return sizeof(struct umem_pages) + nr * sizeof(uint64_t);

Can we make sure that the pgoffs field is aligned?  I know that as it is
now it is aligned, but better to be sure?

> +}
> +
> +static void umem_write_cmd(int fd, uint8_t cmd)
> +{
> +    DPRINTF("write cmd %c\n", cmd);
> +
> +    for (;;) {
> +        ssize_t ret = write(fd, &cmd, 1);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            } else if (errno == EPIPE) {
> +                perror("pipe");
> +                DPRINTF("write cmd %c %zd %d: pipe is closed\n",
> +                        cmd, ret, errno);
> +                break;
> +            }


Grr, we don't have a function that does a "safe_write".  The most
similar thing in qemu looks to be send_all().

> +
> +            perror("pipe");

Can we make a different perror() message than previous error?

> +            DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno);
> +            abort();
> +        }
> +
> +        break;
> +    }
> +}
> +
> +static void umem_read_cmd(int fd, uint8_t expect)
> +{
> +    uint8_t cmd;
> +    for (;;) {
> +        ssize_t ret = read(fd, &cmd, 1);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            }
> +            perror("pipe");
> +            DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno);
> +            abort();
> +        }
> +
> +        if (ret == 0) {
> +            DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret);
> +            abort();
> +        }
> +
> +        break;
> +    }
> +
> +    DPRINTF("read cmd %c\n", cmd);
> +    if (cmd != expect) {
> +        DPRINTF("cmd %c expect %d\n", cmd, expect);
> +        abort();

Ouch.  If we receive garbage, we just exit?

I really think that we should implement error handling.

> +    }
> +}
> +
> +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
> +{
> +    int ret;
> +    uint64_t nr;
> +    size_t size;
> +    struct umem_pages *pages;
> +
> +    ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset);
> +    *offset += sizeof(nr);
> +    DPRINTF("ret %d nr %ld\n", ret, nr);
> +    if (ret != sizeof(nr) || nr == 0) {
> +        return NULL;
> +    }
> +
> +    size = umem_pages_size(nr);
> +    pages = g_malloc(size);

Just thinking about this.  Couldn't we just decide on a "big enough"
buffer, and never send anything bigger than that?  That would remove the
need to have to malloc()/free() a buffer for each reception?



> +/* qemu side handler */
> +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
> +                                                int *offset)
> +{
> +    uint64_t i;
> +    int page_shift = ffs(getpagesize()) - 1;
> +    struct umem_pages *pages = umem_recv_pages(from_umemd, offset);
> +    if (pages == NULL) {
> +        return NULL;
> +    }
> +
> +    for (i = 0; i < pages->nr; i++) {
> +        ram_addr_t addr = pages->pgoffs[i] << page_shift;
> +
> +        /* make pages present by forcibly triggering page fault. */
> +        volatile uint8_t *ram = qemu_get_ram_ptr(addr);
> +        uint8_t dummy_read = ram[0];
> +        (void)dummy_read;   /* suppress unused variable warning */
> +    }
> +
> +    /*
> +     * Very Linux implementation specific.
> +     * Make it sure that other thread doesn't fault on the above virtual
> +     * address. (More exactly other thread doesn't call fault handler with
> +     * the offset.)
> +     * the fault handler is called with mmap_sem read locked.
> +     * madvise() does down/up_write(mmap_sem)
> +     */
> +    qemu_madvise(NULL, 0, MADV_NORMAL);

If it is Linux specific, it should be inside a CONFIG_LINUX ifdef, or in a
function hidden in some header.

Talking about locking, what guarantees that no other thread enters this
function before this one calls madvise?   Or am I missing something obvious?

> +
> +struct umem_pages {
> +    uint64_t nr;
> +    uint64_t pgoffs[0];
> +};
> +

QEMU really likes typedefs for structs.

Later, Juan.
Isaku Yamahata June 16, 2012, 9:48 a.m. UTC | #2
On Thu, Jun 14, 2012 at 11:34:09PM +0200, Juan Quintela wrote:
> Isaku Yamahata <yamahata@valinux.co.jp> wrote:
> > +//#define DEBUG_UMEM
> > +#ifdef DEBUG_UMEM
> > +#include <sys/syscall.h>
> > +#define DPRINTF(format, ...)                                            \
> > +    do {                                                                \
> > +        printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid),    \
> > +               __func__, __LINE__, ## __VA_ARGS__);                     \
> > +    } while (0)
> 
> This should be in a header file that is linux specific?  And (at least
> on my systems) gettid is already defined on glibc.

I'll remove getpid/gettid. It was just for debugging in early phase.
They are not necessary any more.


> > +#else
> > +#define DPRINTF(format, ...)    do { } while (0)
> > +#endif
> 
> 
> > +
> > +#define DEV_UMEM        "/dev/umem"
> > +
> > +UMem *umem_new(void *hostp, size_t size)
> > +{
> > +    struct umem_init uinit = {
> > +        .size = size,
> > +    };
> > +    UMem *umem;
> > +
> > +    assert((size % getpagesize()) == 0);
> > +    umem = g_new(UMem, 1);
> > +    umem->fd = open(DEV_UMEM, O_RDWR);
> > +    if (umem->fd < 0) {
> > +        perror("can't open "DEV_UMEM);
> > +        abort();
> 
> > Can we return an error instead of calling abort()?  The same goes for the
> > rest of the aborts in this file.

Ok.


> > +size_t umem_pages_size(uint64_t nr)
> > +{
> > +    return sizeof(struct umem_pages) + nr * sizeof(uint64_t);
> 
> Can we make sure that the pgoffs field is aligned?  I know that as it is
> now it is aligned, but better to be sure?

It is already done by gcc extension, zero length array.


> > +}
> > +
> > +static void umem_write_cmd(int fd, uint8_t cmd)
> > +{
> > +    DPRINTF("write cmd %c\n", cmd);
> > +
> > +    for (;;) {
> > +        ssize_t ret = write(fd, &cmd, 1);
> > +        if (ret == -1) {
> > +            if (errno == EINTR) {
> > +                continue;
> > +            } else if (errno == EPIPE) {
> > +                perror("pipe");
> > +                DPRINTF("write cmd %c %zd %d: pipe is closed\n",
> > +                        cmd, ret, errno);
> > +                break;
> > +            }
> 
> 
> > Grr, we don't have a function that does a "safe_write".  The most
> > similar thing in qemu looks to be send_all().

So we should introduce something like qemu_safe_write/read?


> > +
> > +            perror("pipe");
> 
> Can we make a different perror() message than previous error?
> 
> > +            DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno);
> > +            abort();
> > +        }
> > +
> > +        break;
> > +    }
> > +}
> > +
> > +static void umem_read_cmd(int fd, uint8_t expect)
> > +{
> > +    uint8_t cmd;
> > +    for (;;) {
> > +        ssize_t ret = read(fd, &cmd, 1);
> > +        if (ret == -1) {
> > +            if (errno == EINTR) {
> > +                continue;
> > +            }
> > +            perror("pipe");
> > +            DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno);
> > +            abort();
> > +        }
> > +
> > +        if (ret == 0) {
> > +            DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret);
> > +            abort();
> > +        }
> > +
> > +        break;
> > +    }
> > +
> > +    DPRINTF("read cmd %c\n", cmd);
> > +    if (cmd != expect) {
> > +        DPRINTF("cmd %c expect %d\n", cmd, expect);
> > +        abort();
> 
> Ouch.  If we receive garbage, we just exit?
> 
> I really think that we should implement error handling.
> 
> > +    }
> > +}
> > +
> > +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
> > +{
> > +    int ret;
> > +    uint64_t nr;
> > +    size_t size;
> > +    struct umem_pages *pages;
> > +
> > +    ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset);
> > +    *offset += sizeof(nr);
> > +    DPRINTF("ret %d nr %ld\n", ret, nr);
> > +    if (ret != sizeof(nr) || nr == 0) {
> > +        return NULL;
> > +    }
> > +
> > +    size = umem_pages_size(nr);
> > +    pages = g_malloc(size);
> 
> Just thinking about this.  Couldn't we just decide on a "big enough"
> buffer, and never send anything bigger than that?  That would remove the
> need to have to malloc()/free() a buffer for each reception?

Will try to address it.


> > +/* qemu side handler */
> > +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
> > +                                                int *offset)
> > +{
> > +    uint64_t i;
> > +    int page_shift = ffs(getpagesize()) - 1;
> > +    struct umem_pages *pages = umem_recv_pages(from_umemd, offset);
> > +    if (pages == NULL) {
> > +        return NULL;
> > +    }
> > +
> > +    for (i = 0; i < pages->nr; i++) {
> > +        ram_addr_t addr = pages->pgoffs[i] << page_shift;
> > +
> > +        /* make pages present by forcibly triggering page fault. */
> > +        volatile uint8_t *ram = qemu_get_ram_ptr(addr);
> > +        uint8_t dummy_read = ram[0];
> > +        (void)dummy_read;   /* suppress unused variable warning */
> > +    }
> > +
> > +    /*
> > +     * Very Linux implementation specific.
> > +     * Make it sure that other thread doesn't fault on the above virtual
> > +     * address. (More exactly other thread doesn't call fault handler with
> > +     * the offset.)
> > +     * the fault handler is called with mmap_sem read locked.
> > +     * madvise() does down/up_write(mmap_sem)
> > +     */
> > +    qemu_madvise(NULL, 0, MADV_NORMAL);
> 
> > If it is Linux specific, it should be inside a CONFIG_LINUX ifdef, or in a
> > function hidden in some header.

Good idea.


> > Talking about locking, what guarantees that no other thread enters this
> > function before this one calls madvise?   Or am I missing something obvious?

It is assumed that only main thread calls this function via iohandler.


> > +
> > +struct umem_pages {
> > +    uint64_t nr;
> > +    uint64_t pgoffs[0];
> > +};
> > +
> 
> QEMU really likes typedefs for structs.
> 
> Later, Juan.
>
Juan Quintela June 16, 2012, 1:19 p.m. UTC | #3
Isaku Yamahata <yamahata@valinux.co.jp> wrote:
> On Thu, Jun 14, 2012 at 11:34:09PM +0200, Juan Quintela wrote:

>> > +size_t umem_pages_size(uint64_t nr)
>> > +{
>> > +    return sizeof(struct umem_pages) + nr * sizeof(uint64_t);
>> 
>> Can we make sure that the pgoffs field is aligned?  I know that as it is
>> now it is aligned, but better to be sure?
>
> It is already done by gcc extension, zero length array.

Ah, I didn't know about that property of the zero-length array extension.  Thanks.

>> 
>> Grr, we don't have a function that does a "safe_write".  The most
>> similar thing in qemu looks to be send_all().
>
> So we should introduce something like qemu_safe_write/read?

I guess so.  If you look around, you will see that we have a lot of
cases where we have this pattern.  But that is not a problem of this
patch; it was already there.

>
>> Talking about locking, what guarantees that no other thread enters this
>> function before this one calls madvise?   Or am I missing something obvious?
>
> It is assumed that only main thread calls this function via iohandler.

Ok.  Can we add a comment then?

Later, Juan.
diff mbox

Patch

diff --git a/umem.c b/umem.c
new file mode 100644
index 0000000..64eaab5
--- /dev/null
+++ b/umem.c
@@ -0,0 +1,364 @@ 
+/*
+ * umem.c: user process backed memory module for postcopy livemigration
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <limits.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/umem.h>
+
+#include "bitops.h"
+#include "sysemu.h"
+#include "hw/hw.h"
+#include "umem.h"
+
+//#define DEBUG_UMEM  /* uncomment to make DPRINTF() print trace messages on stdout */
+#ifdef DEBUG_UMEM
+#include <sys/syscall.h>
+#define DPRINTF(format, ...)                                            \
+    do {                                                                \
+        printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid),    \
+               __func__, __LINE__, ## __VA_ARGS__);                     \
+    } while (0)
+#else  /* !DEBUG_UMEM: DPRINTF() is an empty statement; its arguments are not evaluated */
+#define DPRINTF(format, ...)    do { } while (0)
+#endif  /* DEBUG_UMEM */
+
+#define DEV_UMEM        "/dev/umem"  /* device node opened by umem_new() */
+
+/*
+ * Create a UMem object for a region of @size bytes and map it at @hostp.
+ *
+ * Opens DEV_UMEM, issues the UMEM_INIT ioctl, sizes the shmem descriptor
+ * returned by the ioctl with ftruncate(), and maps the umem device at
+ * @hostp with MAP_FIXED.
+ *
+ * @hostp: address at which the device is mapped (passed to mmap() with
+ *         MAP_FIXED)
+ * @size:  region size in bytes; must be a multiple of the host page size
+ *         (asserted)
+ *
+ * Returns the new object.  If open(), ioctl(), ftruncate() or mmap() fails,
+ * a message is printed with perror() and the process abort()s.
+ */
+UMem *umem_new(void *hostp, size_t size)
+{
+    struct umem_init uinit = {
+        .size = size,
+    };
+    UMem *umem;
+
+    assert((size % getpagesize()) == 0);
+    /*
+     * g_new0() rather than g_new(): umem->shmem is not assigned below, so
+     * it must start out as NULL instead of an indeterminate pointer.
+     */
+    umem = g_new0(UMem, 1);
+    umem->fd = open(DEV_UMEM, O_RDWR);
+    if (umem->fd < 0) {
+        perror("can't open "DEV_UMEM);
+        abort();
+    }
+
+    if (ioctl(umem->fd, UMEM_INIT, &uinit) < 0) {
+        perror("UMEM_INIT");
+        abort();
+    }
+    if (ftruncate(uinit.shmem_fd, uinit.size) < 0) {
+        perror("truncate(\"shmem_fd\")");
+        abort();
+    }
+
+    /* the faulted bitmap is only allocated later, by umem_map_shmem() */
+    umem->nbits = 0;
+    umem->nsets = 0;
+    umem->faulted = NULL;
+    umem->shmem = NULL;
+    umem->page_shift = ffs(getpagesize()) - 1;
+    umem->shmem_fd = uinit.shmem_fd;
+    umem->size = uinit.size;
+    umem->umem = mmap(hostp, size, PROT_EXEC | PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_FIXED, umem->fd, 0);
+    if (umem->umem == MAP_FAILED) {
+        perror("mmap(UMem) failed");
+        abort();
+    }
+    return umem;
+}
+
+/*
+ * Release a UMem object: close whichever of the device and shmem
+ * descriptors are still open (not -1), then free the faulted bitmap and
+ * the object itself.  The mappings are not unmapped here.
+ */
+void umem_destroy(UMem *umem)
+{
+    const int fds[2] = { umem->fd, umem->shmem_fd };
+    size_t k;
+
+    for (k = 0; k < 2; k++) {
+        if (fds[k] != -1) {
+            close(fds[k]);
+        }
+    }
+
+    g_free(umem->faulted);
+    g_free(umem);
+}
+
+/*
+ * Read pending page requests from the umem device.
+ *
+ * @umem:         object whose device descriptor is read
+ * @page_request: on entry, nr is the capacity of pgoffs in entries; on
+ *                return, pgoffs holds the offsets that were read and nr is
+ *                the number of complete entries read
+ *
+ * A read interrupted by a signal (EINTR) is retried, matching the write
+ * path in umem_mark_page_cached().  Any other read error is reported with
+ * perror() and the process abort()s.
+ */
+void umem_get_page_request(UMem *umem, struct umem_pages *page_request)
+{
+    const size_t entry_size = sizeof(page_request->pgoffs[0]);
+    ssize_t ret;
+
+    do {
+        ret = read(umem->fd, page_request->pgoffs,
+                   page_request->nr * entry_size);
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret < 0) {
+        perror("daemon: umem read");
+        abort();
+    }
+    page_request->nr = ret / entry_size;
+}
+
+/*
+ * Write the page offsets in @page_cached to the umem device.
+ *
+ * All page_cached->nr entries of pgoffs are written; short writes are
+ * continued and writes interrupted by a signal (EINTR) are retried.  Any
+ * other write error is reported with perror() and the process abort()s.
+ */
+void umem_mark_page_cached(UMem *umem, struct umem_pages *page_cached)
+{
+    /* byte pointer: arithmetic on "void *" is a GNU extension, not ISO C */
+    const uint8_t *cursor = (const uint8_t *)page_cached->pgoffs;
+    ssize_t left = page_cached->nr * sizeof(page_cached->pgoffs[0]);
+
+    while (left > 0) {
+        ssize_t ret = write(umem->fd, cursor, left);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            }
+
+            perror("daemon: umem write");
+            abort();
+        }
+
+        left -= ret;
+        cursor += ret;
+    }
+}
+
+/*
+ * Unmap the umem device mapping (umem->size bytes at umem->umem) and
+ * clear the pointer.  The result of munmap() is not checked.
+ */
+void umem_unmap(UMem *umem)
+{
+    void *mapping = umem->umem;
+
+    umem->umem = NULL;
+    munmap(mapping, umem->size);
+}
+
+/*
+ * Close the umem device descriptor and mark it closed (-1) so that
+ * umem_destroy() does not close it again.
+ */
+void umem_close(UMem *umem)
+{
+    int dev_fd = umem->fd;
+
+    umem->fd = -1;
+    close(dev_fd);
+}
+
+/*
+ * Map the shmem descriptor (shared, read/write, umem->size bytes) and
+ * allocate a zeroed "faulted" bitmap with one bit per page of the region.
+ *
+ * Returns the address of the mapping, which is also stored in
+ * umem->shmem.  If mmap() fails, a message is printed with perror() and
+ * the process abort()s.
+ */
+void *umem_map_shmem(UMem *umem)
+{
+    void *mapping;
+
+    umem->nsets = 0;
+    umem->nbits = umem->size >> umem->page_shift;
+    umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits));
+
+    mapping = mmap(NULL, umem->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                   umem->shmem_fd, 0);
+    umem->shmem = mapping;
+    if (mapping == MAP_FAILED) {
+        perror("daemon: mmap(\"shmem\")");
+        abort();
+    }
+    return mapping;
+}
+
+/*
+ * Unmap the shmem mapping (umem->size bytes at umem->shmem) and clear the
+ * pointer.  The result of munmap() is not checked.
+ */
+void umem_unmap_shmem(UMem *umem)
+{
+    void *mapping = umem->shmem;
+
+    umem->shmem = NULL;
+    munmap(mapping, umem->size);
+}
+
+/*
+ * Mark the pages covering [offset, offset + size) as faulted and, if at
+ * least one of them was not marked before, release the range from the
+ * shmem mapping with madvise(MADV_REMOVE).
+ *
+ * @offset: byte offset of the range within the region
+ * @size:   length of the range in bytes
+ *
+ * umem->nsets is incremented once for each page that becomes marked.
+ * The madvise() call is compiled in only when CONFIG_MADVISE and
+ * MADV_REMOVE are both defined, and its result is not checked.
+ */
+void umem_remove_shmem(UMem *umem, size_t offset, size_t size)
+{
+    int s = offset >> umem->page_shift;
+    int e = (offset + size) >> umem->page_shift;
+    int newly_marked = 0;
+    int i;
+
+    for (i = s; i < e; i++) {
+        if (!test_and_set_bit(i, umem->faulted)) {
+            umem->nsets++;
+            newly_marked = 1;
+        }
+    }
+
+    /*
+     * The madvise() arguments do not depend on the loop index, so one call
+     * after the loop has the same effect as repeating it for every newly
+     * marked page.
+     */
+#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE)
+    if (newly_marked) {
+        /* byte pointer: arithmetic on "void *" is a GNU extension */
+        madvise((uint8_t *)umem->shmem + offset, size, MADV_REMOVE);
+    }
+#else
+    (void)newly_marked;
+#endif
+}
+
+/*
+ * Close the shmem descriptor and mark it closed (-1) so that
+ * umem_destroy() does not close it again.
+ */
+void umem_close_shmem(UMem *umem)
+{
+    int shm_fd = umem->shmem_fd;
+
+    umem->shmem_fd = -1;
+    close(shm_fd);
+}
+
+/***************************************************************************/
+/* qemu <-> umem daemon communication */
+
+/*
+ * Size in bytes of a struct umem_pages that carries @nr page offsets:
+ * the fixed header plus @nr uint64_t entries.  No overflow check is made.
+ */
+size_t umem_pages_size(uint64_t nr)
+{
+    const size_t header = sizeof(struct umem_pages);
+
+    return header + nr * sizeof(uint64_t);
+}
+
+/*
+ * Write the single command byte @cmd to @fd.
+ *
+ * A write interrupted by a signal (EINTR) is retried.  EPIPE is reported
+ * with perror() and then ignored: the function returns without having
+ * sent the command.  Any other error is reported with perror() and the
+ * process abort()s.  The two perror() prefixes differ so that the
+ * tolerated case can be told apart from the fatal one in the log.
+ */
+static void umem_write_cmd(int fd, uint8_t cmd)
+{
+    ssize_t ret;
+
+    DPRINTF("write cmd %c\n", cmd);
+
+    do {
+        ret = write(fd, &cmd, 1);
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret != -1) {
+        return;
+    }
+
+    if (errno == EPIPE) {
+        /* the reader is gone; not treated as fatal here */
+        perror("umem: write cmd: pipe is closed");
+        DPRINTF("write cmd %c %zd %d: pipe is closed\n",
+                cmd, ret, errno);
+        return;
+    }
+
+    perror("umem: write cmd");
+    DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno);
+    abort();
+}
+
+/*
+ * Read one command byte from @fd and require it to equal @expect.
+ *
+ * A read interrupted by a signal (EINTR) is retried.  The process
+ * abort()s, after printing a diagnostic on stderr, when the read fails,
+ * when the peer has closed the pipe (read() returns 0), or when the byte
+ * received differs from @expect.
+ */
+static void umem_read_cmd(int fd, uint8_t expect)
+{
+    /* initialized so that no diagnostic can ever print an indeterminate value */
+    uint8_t cmd = 0;
+    ssize_t ret;
+
+    do {
+        ret = read(fd, &cmd, 1);
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret == -1) {
+        perror("umem: read cmd");
+        DPRINTF("read error %zd %d\n", ret, errno);
+        abort();
+    }
+
+    if (ret == 0) {
+        /* say why we abort even when DPRINTF() is compiled out */
+        fprintf(stderr, "umem: read cmd: pipe is closed\n");
+        abort();
+    }
+
+    DPRINTF("read cmd %c\n", cmd);
+    if (cmd != expect) {
+        fprintf(stderr, "umem: read cmd: got 0x%02x, expected 0x%02x ('%c')\n",
+                cmd, expect, expect);
+        abort();
+    }
+}
+
+/*
+ * Peek one struct umem_pages message from @f.
+ *
+ * The message is read with qemu_peek_buffer() as a uint64_t entry count
+ * followed by that many uint64_t page offsets, copied as raw bytes (no
+ * byte swapping is done here).
+ *
+ * @f:      stream to peek from
+ * @offset: in/out peek offset handed to qemu_peek_buffer(); advanced by
+ *          the size of each field that was requested, including on a
+ *          short read
+ *
+ * Returns a g_malloc()ed message that the caller must g_free(), or NULL
+ * when the count could not be read, is zero, is too large for the
+ * payload size and resulting offset to fit in an int, or when the payload
+ * could not be read completely.
+ */
+struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
+{
+    struct umem_pages *pages;
+    uint64_t nr;
+    size_t payload;
+    int ret;
+
+    ret = qemu_peek_buffer(f, (uint8_t *)&nr, sizeof(nr), *offset);
+    *offset += sizeof(nr);
+    DPRINTF("ret %d nr %llu\n", ret, (unsigned long long)nr);
+    if (ret != (int)sizeof(nr) || nr == 0) {
+        return NULL;
+    }
+
+    /*
+     * nr comes from the stream.  Refuse counts for which the payload size
+     * or the advanced *offset would not fit in an int; without this check
+     * umem_pages_size() could wrap and the int arithmetic below overflow.
+     */
+    if (*offset < 0 ||
+        nr > (uint64_t)(INT_MAX - *offset) / sizeof(pages->pgoffs[0])) {
+        return NULL;
+    }
+
+    payload = (size_t)nr * sizeof(pages->pgoffs[0]);
+    pages = g_malloc(umem_pages_size(nr));
+    pages->nr = nr;
+
+    ret = qemu_peek_buffer(f, (uint8_t *)pages->pgoffs, payload, *offset);
+    *offset += payload;
+    if (ret < 0 || (size_t)ret != payload) {
+        g_free(pages);
+        return NULL;
+    }
+    return pages;
+}
+
+/*
+ * Write @pages (header plus pages->nr offsets) to @f as one raw buffer
+ * with qemu_put_buffer().
+ */
+static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages)
+{
+    const uint8_t *raw = (const uint8_t *)pages;
+
+    qemu_put_buffer(f, raw, umem_pages_size(pages->nr));
+}
+
+/* umem daemon -> qemu */
+/* Send the UMEM_DAEMON_READY command byte on descriptor @to_qemu_fd. */
+void umem_daemon_ready(int to_qemu_fd)
+{
+    const uint8_t cmd = UMEM_DAEMON_READY;
+
+    umem_write_cmd(to_qemu_fd, cmd);
+}
+
+/* Put the UMEM_DAEMON_QUIT command byte on stream @to_qemu. */
+void umem_daemon_quit(QEMUFile *to_qemu)
+{
+    const uint8_t cmd = UMEM_DAEMON_QUIT;
+
+    qemu_put_byte(to_qemu, cmd);
+}
+
+/*
+ * Put the UMEM_DAEMON_TRIGGER_PAGE_FAULT command byte on @to_qemu,
+ * followed by the page list @pages.
+ */
+void umem_daemon_send_pages_present(QEMUFile *to_qemu,
+                                    struct umem_pages *pages)
+{
+    const uint8_t cmd = UMEM_DAEMON_TRIGGER_PAGE_FAULT;
+
+    qemu_put_byte(to_qemu, cmd);
+    umem_send_pages(to_qemu, pages);
+}
+
+/*
+ * Read one command byte from @from_qemu_fd and require it to be
+ * UMEM_QEMU_READY (umem_read_cmd() abort()s otherwise).
+ */
+void umem_daemon_wait_for_qemu(int from_qemu_fd)
+{
+    const uint8_t expected = UMEM_QEMU_READY;
+
+    umem_read_cmd(from_qemu_fd, expected);
+}
+
+/* qemu -> umem daemon */
+/*
+ * Read one command byte from @from_umemd_fd and require it to be
+ * UMEM_DAEMON_READY (umem_read_cmd() abort()s otherwise).
+ */
+void umem_qemu_wait_for_daemon(int from_umemd_fd)
+{
+    const uint8_t expected = UMEM_DAEMON_READY;
+
+    umem_read_cmd(from_umemd_fd, expected);
+}
+
+/* Send the UMEM_QEMU_READY command byte on descriptor @to_umemd_fd. */
+void umem_qemu_ready(int to_umemd_fd)
+{
+    const uint8_t cmd = UMEM_QEMU_READY;
+
+    umem_write_cmd(to_umemd_fd, cmd);
+}
+
+/* Put the UMEM_QEMU_QUIT command byte on stream @to_umemd. */
+void umem_qemu_quit(QEMUFile *to_umemd)
+{
+    const uint8_t cmd = UMEM_QEMU_QUIT;
+
+    qemu_put_byte(to_umemd, cmd);
+}
+
+/* qemu side handler */
+/*
+ * Receive a page list from the umem daemon and touch the first byte of
+ * every listed page so that each one is faulted in.
+ *
+ * @from_umemd: stream the page list is peeked from
+ * @offset:     in/out peek offset, passed through to umem_recv_pages()
+ *
+ * Returns the received list, which the caller must g_free(), or NULL if
+ * umem_recv_pages() returned NULL.
+ *
+ * According to the patch author, this is assumed to be called only from
+ * the main thread, via an iohandler.
+ */
+struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
+                                                int *offset)
+{
+    const int shift = ffs(getpagesize()) - 1;
+    struct umem_pages *list;
+    uint64_t idx;
+
+    list = umem_recv_pages(from_umemd, offset);
+    if (!list) {
+        return NULL;
+    }
+
+    for (idx = 0; idx < list->nr; idx++) {
+        /* page offset -> ram address */
+        ram_addr_t addr = list->pgoffs[idx] << shift;
+
+        /* the volatile read forces the access, making the page present */
+        volatile uint8_t *ram = qemu_get_ram_ptr(addr);
+        uint8_t dummy_read = ram[0];
+        (void)dummy_read;   /* the value itself is not needed */
+    }
+
+    /*
+     * Very Linux implementation specific.
+     * Ensure that no other thread is faulting on the virtual addresses
+     * touched above (more exactly, that no other thread is calling the
+     * fault handler with those offsets): the fault handler runs with
+     * mmap_sem read locked, and madvise() does down/up_write(mmap_sem).
+     */
+    qemu_madvise(NULL, 0, MADV_NORMAL);
+
+    return list;
+}
+
+/*
+ * Put the UMEM_QEMU_PAGE_FAULTED command byte on @to_umemd, followed by
+ * the page list @pages.
+ */
+void umem_qemu_send_pages_present(QEMUFile *to_umemd,
+                                  const struct umem_pages *pages)
+{
+    const uint8_t cmd = UMEM_QEMU_PAGE_FAULTED;
+
+    qemu_put_byte(to_umemd, cmd);
+    umem_send_pages(to_umemd, pages);
+}
+
+/*
+ * Put the UMEM_QEMU_PAGE_UNMAPPED command byte on @to_umemd, followed by
+ * the page list @pages.
+ */
+void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
+                                   const struct umem_pages *pages)
+{
+    const uint8_t cmd = UMEM_QEMU_PAGE_UNMAPPED;
+
+    qemu_put_byte(to_umemd, cmd);
+    umem_send_pages(to_umemd, pages);
+}
diff --git a/umem.h b/umem.h
new file mode 100644
index 0000000..058cac6
--- /dev/null
+++ b/umem.h
@@ -0,0 +1,101 @@ 
+/*
+ * umem.h: user process backed memory module for postcopy livemigration
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_UMEM_H
+#define QEMU_UMEM_H
+
+#include <linux/umem.h>
+
+#include "qemu-common.h"
+
+typedef struct UMemDev UMemDev;  /* NOTE(review): no struct UMemDev is defined in this patch, while the prototypes below use a UMem typedef that is not declared here -- presumably declared elsewhere; confirm */
+
+struct UMem {  /* state of one umem-backed region, created by umem_new() */
+    void *umem;  /* address returned by mmap() of fd in umem_new(); NULL after umem_unmap() */
+    int fd;  /* descriptor of the umem device; -1 after umem_close() */
+    void *shmem;  /* mapping of shmem_fd made by umem_map_shmem(); NULL after umem_unmap_shmem() */
+    int shmem_fd;  /* descriptor filled in by the UMEM_INIT ioctl; -1 after umem_close_shmem() */
+    uint64_t size;  /* region size in bytes, taken from struct umem_init after UMEM_INIT */
+
+    /* indexed by host page size */
+    int page_shift;  /* ffs(getpagesize()) - 1 */
+    int nbits;  /* number of bits in faulted: size >> page_shift, set by umem_map_shmem() */
+    int nsets;  /* number of bits set so far by umem_remove_shmem() */
+    unsigned long *faulted;  /* bitmap with one bit per page, allocated by umem_map_shmem() */
+};
+
+struct umem_pages {  /* variable-length list of page offsets; total size is umem_pages_size(nr) */
+    uint64_t nr;  /* number of entries in pgoffs */
+    uint64_t pgoffs[0];  /* page offsets, stored directly after nr (GCC zero-length array) */
+};
+
+UMem *umem_new(void *hostp, size_t size);  /* open the device and map size bytes at hostp; abort()s on failure */
+void umem_destroy(UMem *umem);  /* close descriptors still open, free the bitmap and the object */
+
+/* umem device operations */
+void umem_get_page_request(UMem *umem, struct umem_pages *page_request);  /* read() up to page_request->nr offsets from the device; nr is updated */
+void umem_mark_page_cached(UMem *umem, struct umem_pages *page_cached);  /* write() all page_cached->nr offsets to the device */
+void umem_unmap(UMem *umem);  /* munmap() the device mapping */
+void umem_close(UMem *umem);  /* close() the device descriptor */
+
+/* umem shmem operations */
+void *umem_map_shmem(UMem *umem);  /* mmap() the shmem descriptor and allocate the faulted bitmap */
+void umem_unmap_shmem(UMem *umem);  /* munmap() the shmem mapping */
+void umem_remove_shmem(UMem *umem, size_t offset, size_t size);  /* mark pages in the range as faulted; MADV_REMOVE the range */
+void umem_close_shmem(UMem *umem);  /* close() the shmem descriptor */
+
+/* qemu on source <-> umem daemon communication */
+
+/* daemon -> qemu */
+#define UMEM_DAEMON_READY               'R'  /* sent by umem_daemon_ready() */
+#define UMEM_DAEMON_QUIT                'Q'  /* sent by umem_daemon_quit() */
+#define UMEM_DAEMON_TRIGGER_PAGE_FAULT  'T'  /* sent by umem_daemon_send_pages_present(), followed by a page list */
+#define UMEM_DAEMON_ERROR               'E'  /* not used by umem.c in this patch */
+
+/* qemu -> daemon */
+#define UMEM_QEMU_READY                 'r'  /* sent by umem_qemu_ready() */
+#define UMEM_QEMU_QUIT                  'q'  /* sent by umem_qemu_quit() */
+#define UMEM_QEMU_PAGE_FAULTED          't'  /* sent by umem_qemu_send_pages_present(), followed by a page list */
+#define UMEM_QEMU_PAGE_UNMAPPED         'u'  /* sent by umem_qemu_send_pages_unmapped(), followed by a page list */
+
+struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset);  /* peek a page list from f; caller g_free()s the result; NULL on failure */
+size_t umem_pages_size(uint64_t nr);  /* bytes needed for a struct umem_pages holding nr offsets */
+
+/* for umem daemon */
+void umem_daemon_ready(int to_qemu_fd);  /* write UMEM_DAEMON_READY to the descriptor */
+void umem_daemon_wait_for_qemu(int from_qemu_fd);  /* read one byte and abort() unless it is UMEM_QEMU_READY */
+void umem_daemon_quit(QEMUFile *to_qemu);  /* put UMEM_DAEMON_QUIT on the stream */
+void umem_daemon_send_pages_present(QEMUFile *to_qemu,
+                                    struct umem_pages *pages);  /* put UMEM_DAEMON_TRIGGER_PAGE_FAULT plus the page list */
+
+/* for qemu */
+void umem_qemu_wait_for_daemon(int from_umemd_fd);  /* read one byte and abort() unless it is UMEM_DAEMON_READY */
+void umem_qemu_ready(int to_umemd_fd);  /* write UMEM_QEMU_READY to the descriptor */
+void umem_qemu_quit(QEMUFile *to_umemd);  /* put UMEM_QEMU_QUIT on the stream */
+struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
+                                                int *offset);  /* receive a page list and touch each page; caller g_free()s the result */
+void umem_qemu_send_pages_present(QEMUFile *to_umemd,
+                                  const struct umem_pages *pages);  /* put UMEM_QEMU_PAGE_FAULTED plus the page list */
+void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
+                                   const struct umem_pages *pages);  /* put UMEM_QEMU_PAGE_UNMAPPED plus the page list */
+
+#endif /* QEMU_UMEM_H */