Message ID | 20110812110737.GA13791@valinux.co.jp
---|---
State | New
Sample user land program for testing the post copy chardevice.
===========================================================================
/*
 * sample user land for post copy vmem
 *
 * Copyright (c) 2011,
 * National Institute of Advanced Industrial Science and Technology
 *
 * https://sites.google.com/site/grivonhome/quick-kvm-migration
 * Author: Isaku Yamahata <yamahata at valinux co jp>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 */

#include <err.h>
#include <inttypes.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

//#include <linux/kvm.h>
#define __user
#include "my-kvm.h"

#if 1
#define DPRINTF(format, ...) \
	printf("%s:%d "format, __func__, __LINE__, ## __VA_ARGS__)
#else
#define DPRINTF(format, ...)	do { } while (0)
#endif

#define VMEM_NR_PAGES	8

void server(int vmem_fd, int shmem_fd, size_t size, size_t page_size)
{
	int nr_pages = size / page_size;
	void *shmem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
			   shmem_fd, 0);
	if (shmem == MAP_FAILED) {
		err(EXIT_FAILURE, "server: mmap(\"shmem\")");
	}
	close(shmem_fd);

	DPRINTF("KVM_VMEM_READY\n");
	if (ioctl(vmem_fd, KVM_VMEM_READY) < 0) {
		err(EXIT_FAILURE, "server: KVM_VMEM_READY");
	}

	struct kvm_vmem_page_request page_request;
	page_request.pgoffs = malloc(sizeof(*page_request.pgoffs) * nr_pages);
	if (page_request.pgoffs == NULL) {
		err(EXIT_FAILURE, "server: malloc(\"page_request.pgoffs\")");
	}

	struct kvm_vmem_page_cached page_cached;
	page_cached.pgoffs = malloc(sizeof(*page_cached.pgoffs) * nr_pages);
	if (page_cached.pgoffs == NULL) {
		err(EXIT_FAILURE, "server: malloc(\"page_cached.pgoffs\")");
	}

	int fill = 0;
	fill++;
	memset(shmem, fill, page_size);
	page_cached.nr = 1;
	page_cached.pgoffs[0] = 0;
	DPRINTF("KVM_VMEM_MARK_PAGE_CACHED\n");
	if (ioctl(vmem_fd, KVM_VMEM_MARK_PAGE_CACHED, &page_cached)) {
		err(EXIT_FAILURE, "server: KVM_VMEM_MARK_PAGE_CACHED");
	}

	struct kvm_vmem_page_range page_range = {
		.pgoff = 0,
		.nr_pages = 1,
	};
	struct kvm_vmem_make_pages_present pages_present = {
		.nr = 1,
		.ranges = &page_range,
	};
	DPRINTF("KVM_VMEM_MAKE_PAGES_PRESENT\n");
	if (ioctl(vmem_fd, KVM_VMEM_MAKE_PAGES_PRESENT, &pages_present) < 0) {
		err(EXIT_FAILURE, "server: KVM_VMEM_MAKE_PAGES_PRESENT");
	}

	int page_served = 1;
	while (page_served < nr_pages) {
		DPRINTF("KVM_VMEM_GET_PAGE_REQUEST\n");
		page_request.nr = nr_pages;
		if (ioctl(vmem_fd, KVM_VMEM_GET_PAGE_REQUEST, &page_request)) {
			err(EXIT_FAILURE, "server: KVM_VMEM_GET_PAGE_REQUEST");
		}
		DPRINTF("request.nr %d\n", page_request.nr);

		page_cached.nr = 0;
		int i;
		for (i = 0; i < page_request.nr; ++i) {
			memset(shmem + page_size * page_request.pgoffs[i],
			       fill, page_size);
			fill++;
			page_cached.pgoffs[page_cached.nr] =
				page_request.pgoffs[i];
			page_cached.nr++;
			DPRINTF("request[%d] %lx fill: %d\n",
				i, (unsigned long)page_request.pgoffs[i],
				fill - 1);
		}
		DPRINTF("KVM_VMEM_MARK_PAGE_CACHED\n");
		if (ioctl(vmem_fd, KVM_VMEM_MARK_PAGE_CACHED,
			  &page_cached) < 0) {
			err(EXIT_FAILURE, "server: KVM_VMEM_MARK_PAGE_CACHED");
		}
		page_served += page_cached.nr;
	}

#if 0
	DPRINTF("KVM_VMEM_MAKE_VMA_ANONYMOUS\n");
	if (ioctl(vmem_fd, KVM_VMEM_MAKE_VMA_ANONYMOUS)) {
		err(EXIT_FAILURE, "server: KVM_VMEM_MAKE_VMA_ANONYMOUS");
	}
#endif
	munmap(shmem, size);
	close(vmem_fd);
}

void qemu(int vmem_fd, size_t size, size_t page_size)
{
	DPRINTF("mmap\n");
	void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE, vmem_fd, 0);
	if (ram == MAP_FAILED) {
		err(EXIT_FAILURE, "qemu: mmap");
	}

	DPRINTF("KVM_VMEM_WAIT_READY\n");
	if (ioctl(vmem_fd, KVM_VMEM_WAIT_READY) < 0) {
		err(EXIT_FAILURE, "qemu: KVM_VMEM_WAIT_READY");
	}
	DPRINTF("close\n");
	close(vmem_fd);

	int pages[] = {7, 1, 6, 2, 0, 5, 3, 4};
	int val[VMEM_NR_PAGES];
	int i;
	for (i = 0; i < VMEM_NR_PAGES; ++i) {
		if (i == 2 || i == 6)
			sleep(1);
		DPRINTF("access to %d\n", pages[i]);
		fflush(stdout);
		val[i] = *(uint8_t*)(ram + page_size * pages[i]);
		DPRINTF("page:%d val[i=%d]=%d\n", pages[i], i, val[i]);
	}
	munmap(ram, size);
}

int main(int argc, char **argv)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	if (kvm_fd < 0) {
		perror("can't open /dev/kvm");
		exit(EXIT_FAILURE);
	}
	int vmem_dev_fd = ioctl(kvm_fd, KVM_CREATE_VMEM_DEV);
	if (vmem_dev_fd < 0) {
		err(EXIT_FAILURE, "can't create vmem_dev");
	}

	long page_size = sysconf(_SC_PAGESIZE);
	struct kvm_vmem_create create = {
		.size = VMEM_NR_PAGES * page_size,
	};
	if (ioctl(vmem_dev_fd, KVM_CREATE_VMEM, &create) < 0) {
		err(EXIT_FAILURE, "KVM_CREATE_VMEM");
	}
	close(vmem_dev_fd);

	int vmem_fd = create.vmem_fd;
	int shmem_fd = create.shmem_fd;
	size_t size = create.size;
	if (ftruncate(shmem_fd, size) < 0) {
		err(EXIT_FAILURE, "truncate(\"shmem_fd\")");
	}
	printf("vmem_fd %d shmem_fd %d\n", vmem_fd, shmem_fd);
	fflush(stdout);

	pid_t child = fork();
	if (child < 0) {
		err(EXIT_FAILURE, "fork");
	}
	if (child == 0) {
		sleep(1);
		printf("server pid: %d\n", getpid());
		server(vmem_fd, shmem_fd, size, page_size);
		return 0;
	}

	printf("qemu pid: %d server pid: %d\n", getpid(), child);
	close(shmem_fd);
	qemu(vmem_fd, size, page_size);
	return 0;
}
===========================================================================
On Fri, Aug 12, 2011 at 11:07 AM, Isaku Yamahata <yamahata@valinux.co.jp> wrote:
> Here is what I have right now for the post copy chardevice.
> The sample user land will follow.
> It would give you a more concrete idea and help further discussion, I hope.
> This is just for discussion, so it's incomplete.
>
> I'm open to other ideas and quite happy to throw away this patch and
> go for a better way.
>
> thanks,
>
> From e262979e95b3c5a095c8cb0bc178309baa861a3f Mon Sep 17 00:00:00 2001
> Message-Id: <e262979e95b3c5a095c8cb0bc178309baa861a3f.1313146664.git.yamahata@valinux.co.jp>
> From: Isaku Yamahata <yamahata@valinux.co.jp>
> Date: Wed, 10 Aug 2011 18:28:05 +0900
> Subject: [PATCH] kvm/postcopy: chardevice for postcopy
>
> This is a character device to hook page access.
> The page fault in the area is reported to another user process by
> this chardriver. Then, the process fills the page contents and
> resolves the page fault.
>
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> ---
>  arch/x86/kvm/Kconfig     |    1 +
>  arch/x86/kvm/Makefile    |    1 +
>  include/linux/kvm.h      |   45 +++
>  include/linux/kvm_host.h |    2 +
>  mm/memcontrol.c          |    1 +
>  mm/shmem.c               |    1 +
>  virt/kvm/Kconfig         |    3 +
>  virt/kvm/kvm_main.c      |    6 +
>  virt/kvm/vmem.c          |  847 ++++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/vmem.h          |   68 ++++
>  10 files changed, 975 insertions(+), 0 deletions(-)
>  create mode 100644 virt/kvm/vmem.c
>  create mode 100644 virt/kvm/vmem.h
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 0a09b58..dcbd52e 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -29,6 +29,7 @@ config KVM
>  	select HAVE_KVM_EVENTFD
>  	select KVM_APIC_ARCHITECTURE
>  	select KVM_ASYNC_PF
> +	select KVM_VMEM
>  	select USER_RETURN_NOTIFIER
>  	select KVM_MMIO
>  	select TASKSTATS
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index f15501f..6125f4c 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -10,6 +10,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
>  				assigned-dev.o)
>  kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
>  kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
> +kvm-$(CONFIG_KVM_VMEM) += $(addprefix ../../../virt/kvm/, vmem.o)
>
>  kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
>  	i8254.o timer.o
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 55f5afb..623109e 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
>  #define KVM_CAP_PPC_SMT 64
>  #define KVM_CAP_PPC_RMA 65
>  #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
> +#define KVM_CAP_POST_COPY_MEMORY 67
>
>  #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -760,6 +761,50 @@ struct kvm_clock_data {
>  /* Available with KVM_CAP_RMA */
>  #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
>
> +struct kvm_vmem_create {
> +	__u64 size; /* in bytes */
> +	__s32 vmem_fd;
> +	__s32 shmem_fd;
> +};
> +
> +struct kvm_vmem_page_request {
> +	__u32 nr;

Padding will be needed here on 64 bit hosts unless the order is switched.

> +	__u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_cached {
> +	__u32 nr;

Also here.

> +	__u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_range {
> +	__u64 pgoff;
> +	__u64 nr_pages;
> +};
> +
> +struct kvm_vmem_make_pages_present {
> +	__u32 nr;

And here.
> +	struct kvm_vmem_page_range __user *ranges;
> +};
> +
> +/* Available with KVM_CAP_POST_COPY_MEMORY */
> +#define KVM_CREATE_VMEM_DEV _IO(KVMIO, 0xb0)
> +
> +/* ioctl for vmem_dev fd */
> +#define KVM_CREATE_VMEM _IOR(KVMIO, 0xb1, __u32)
> +
> +/* ioctl for vmem fd */
> +#define KVM_VMEM_WAIT_READY _IO(KVMIO, 0xb2)
> +#define KVM_VMEM_READY _IO(KVMIO, 0xb3)
> +#define KVM_VMEM_GET_PAGE_REQUEST \
> +	_IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
> +#define KVM_VMEM_MARK_PAGE_CACHED \
> +	_IOW(KVMIO, 0xb5, struct kvm_vmem_page_cached)
> +#define KVM_VMEM_MAKE_PAGES_PRESENT \
> +	_IOW(KVMIO, 0xb6, struct kvm_vmem_make_pages_present)
> +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7)
> +
> +
>  #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
>
>  struct kvm_assigned_pci_dev {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ff4d406..8b3dafa 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -222,6 +222,8 @@ struct kvm_irq_routing_table {};
>
>  #endif
>
> +long kvm_dev_ioctl_create_vmem_dev(void);
> +
>  struct kvm_memslots {
>  	int nmemslots;
>  	u64 generation;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e013b8e..7f3fc4e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2838,6 +2838,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>
>  	return ret;
>  }
> +EXPORT_SYMBOL_GPL(mem_cgroup_cache_charge);
>
>  /*
>   * While swap-in, try_charge -> commit or cancel, the page is locked.
> diff --git a/mm/shmem.c b/mm/shmem.c
> index fcedf54..ae7d61f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -3035,6 +3035,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
>  	vma->vm_flags |= VM_CAN_NONLINEAR;
>  	return 0;
>  }
> +EXPORT_SYMBOL_GPL(shmem_zero_setup);
>
>  /**
>   * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index f63ccb0..d3040ea 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -18,3 +18,6 @@ config KVM_MMIO
>
>  config KVM_ASYNC_PF
>  	bool
> +
> +config KVM_VMEM
> +	bool
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index aefdda3..9e47e20 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2184,6 +2184,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
>  	case KVM_CAP_SET_BOOT_CPU_ID:
>  #endif
>  	case KVM_CAP_INTERNAL_ERROR_DATA:
> +	case KVM_CAP_POST_COPY_MEMORY:
>  		return 1;
>  #ifdef CONFIG_HAVE_KVM_IRQCHIP
>  	case KVM_CAP_IRQ_ROUTING:
> @@ -2233,6 +2234,11 @@ static long kvm_dev_ioctl(struct file *filp,
>  	case KVM_TRACE_DISABLE:
>  		r = -EOPNOTSUPP;
>  		break;
> +#ifdef CONFIG_KVM_VMEM
> +	case KVM_CREATE_VMEM_DEV:
> +		r = kvm_dev_ioctl_create_vmem_dev();
> +		break;
> +#endif
>  	default:
>  		return kvm_arch_dev_ioctl(filp, ioctl, arg);
>  	}
> diff --git a/virt/kvm/vmem.c b/virt/kvm/vmem.c
> new file mode 100644
> index 0000000..b413663
> --- /dev/null
> +++ b/virt/kvm/vmem.c
> @@ -0,0 +1,847 @@
> +/*
> + * KVM post copy vmem
> + *
> + * Copyright (c) 2011,
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.

The current address is:

  51 Franklin Street, Fifth Floor
  Boston, MA 02110-1301 USA

Then there is the version used in QEMU:

  if not, see <http://www.gnu.org/licenses/>.

I don't know which one is preferred with the kernel.

> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +#include <linux/pagemap.h>
> +#include <linux/mm.h>
> +#include <linux/memcontrol.h>
> +#include <linux/poll.h>
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
> +#include "vmem.h"
> +
> +static void kvm_vmem_release_fake_vmf(int ret, struct vm_fault *fake_vmf)
> +{
> +	if (ret & VM_FAULT_LOCKED) {
> +		unlock_page(fake_vmf->page);
> +	}
> +	page_cache_release(fake_vmf->page);
> +}
> +
> +static int kvm_vmem_minor_fault(struct kvm_vmem *vmem,
> +				struct vm_area_struct *vma,
> +				struct vm_fault *vmf)
> +{
> +	struct vm_fault fake_vmf;
> +	int ret;
> +	struct page *page;
> +
> +	BUG_ON(!test_bit(vmf->pgoff, vmem->cached));
> +	fake_vmf = *vmf;
> +	fake_vmf.page = NULL;
> +	ret = vmem->vma->vm_ops->fault(vmem->vma, &fake_vmf);
> +	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
> +		return ret;
> +
> +	/*
> +	 * TODO: pull out fake_vmf->page from shmem file and donate it
> +	 * to this vma resolving the page fault.
> +	 * vmf->page = fake_vmf->page;
> +	 */
> +
> +	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
> +	if (!page)
> +		return VM_FAULT_OOM;
> +	if (mem_cgroup_cache_charge(page, vma->vm_mm, GFP_KERNEL)) {
> +		kvm_vmem_release_fake_vmf(ret, &fake_vmf);
> +		page_cache_release(page);
> +		return VM_FAULT_OOM;
> +	}
> +
> +	copy_highpage(page, fake_vmf.page);
> +	kvm_vmem_release_fake_vmf(ret, &fake_vmf);
> +
> +	ret |= VM_FAULT_LOCKED;
> +	SetPageUptodate(page);
> +	vmf->page = page;
> +	set_bit(vmf->pgoff, vmem->faulted);
> +
> +	return ret;
> +}
> +
> +static int kvm_vmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	struct file *filp = vma->vm_file;
> +	struct kvm_vmem *vmem = filp->private_data;
> +
> +	if (vmf->pgoff >= vmem->pgoff_end) {
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	BUG_ON(test_bit(vmf->pgoff, vmem->faulted));
> +
> +	if (!test_bit(vmf->pgoff, vmem->cached)) {
> +		/* major fault */
> +		unsigned long bit;
> +		DEFINE_WAIT(wait);
> +
> +		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
> +			/* async page fault */
> +			spin_lock(&vmem->lock);
> +			if (vmem->async_req_nr < vmem->async_req_max) {
> +				vmem->async_req[vmem->async_req_nr] =
> +					vmf->pgoff;
> +				vmem->async_req_nr++;
> +			}
> +			spin_unlock(&vmem->lock);
> +			wake_up_poll(&vmem->req_wait, POLLIN);
> +
> +			if (test_bit(vmf->pgoff, vmem->cached))
> +				return kvm_vmem_minor_fault(vmem, vma, vmf);
> +			return VM_FAULT_MAJOR | VM_FAULT_RETRY;
> +		}
> +
> +		spin_lock(&vmem->lock);
> +		bit = find_first_zero_bit(vmem->sync_wait_bitmap,
> +					  vmem->sync_req_max);
> +		if (likely(bit < vmem->sync_req_max)) {
> +			vmem->sync_req[bit] = vmf->pgoff;
> +			prepare_to_wait(&vmem->page_wait[bit], &wait,
> +					TASK_UNINTERRUPTIBLE);
> +			set_bit(bit, vmem->sync_req_bitmap);
> +			set_bit(bit, vmem->sync_wait_bitmap);
> +			spin_unlock(&vmem->lock);
> +			wake_up_poll(&vmem->req_wait, POLLIN);
> +
> +			if (!test_bit(vmf->pgoff, vmem->cached))
> +				schedule();
> +			finish_wait(&vmem->page_wait[bit], &wait);
> +			clear_bit(bit, vmem->sync_wait_bitmap);
> +		} else {
> +			struct kvm_vmem_page_req_list page_req_list = {
> +				.pgoff = vmf->pgoff,
> +			};
> +			vmem->req_list_nr++;
> +			list_add_tail(&page_req_list.list, &vmem->req_list);
> +			wake_up_poll(&vmem->req_wait, POLLIN);
> +			for (;;) {
> +				prepare_to_wait(&vmem->req_list_wait, &wait,
> +						TASK_UNINTERRUPTIBLE);
> +				if (test_bit(vmf->pgoff, vmem->cached)) {
> +					vmem->req_list_nr--;
> +					break;
> +				}
> +				spin_unlock(&vmem->lock);
> +				schedule();
> +				spin_lock(&vmem->lock);
> +			}
> +			spin_unlock(&vmem->lock);
> +			finish_wait(&vmem->req_list_wait, &wait);
> +		}
> +
> +		return kvm_vmem_minor_fault(vmem, vma, vmf) | VM_FAULT_MAJOR;
> +	}
> +
> +	return kvm_vmem_minor_fault(vmem, vma, vmf);
> +}
> +
> +/* for partial munmap */
> +static void kvm_vmem_vma_open(struct vm_area_struct *vma)
> +{
> +	struct file *filp = vma->vm_file;
> +	struct kvm_vmem *vmem = filp->private_data;
> +
> +	spin_lock(&vmem->lock);
> +	vmem->vma_nr++;
> +	spin_unlock(&vmem->lock);
> +}
> +
> +static void kvm_vmem_vma_close(struct vm_area_struct *vma)
> +{
> +	struct file *filp = vma->vm_file;
> +	struct kvm_vmem *vmem = filp->private_data;
> +	struct task_struct *task = NULL;
> +
> +	spin_lock(&vmem->lock);
> +	vmem->vma_nr--;
> +	if (vmem->vma_nr == 0) {
> +		task = vmem->task;
> +		vmem->task = NULL;
> +	}
> +	spin_unlock(&vmem->lock);
> +
> +	if (task)
> +		put_task_struct(task);
> +}
> +
> +static const struct vm_operations_struct kvm_vmem_vm_ops = {
> +	.open = kvm_vmem_vma_open,
> +	.close = kvm_vmem_vma_close,
> +	.fault = kvm_vmem_fault,
> +};
> +
> +static int kvm_vmem_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +	struct kvm_vmem *vmem = filp->private_data;
> +	int error;
> +
> +	/* allow mmap() only once */
> +	spin_lock(&vmem->lock);
> +	if (vmem->mmapped) {
> +		error = -EBUSY;
> +		goto out;
> +	}
> +	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff >
> +	    vmem->pgoff_end) {
> +		error = -EINVAL;
> +		goto out;
> +	}
> +
> +	vmem->mmapped = true;
> +	vmem->vma_nr = 1;
> +	vmem->vm_start = vma->vm_start;
> +	get_task_struct(current);
> +	vmem->task = current;
> +	spin_unlock(&vmem->lock);
> +
> +	vma->vm_ops = &kvm_vmem_vm_ops;
> +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
> +	vma->vm_flags &= ~VM_SHARED;
> +	return 0;
> +
> +out:
> +	spin_unlock(&vmem->lock);
> +	return error;
> +}
> +
> +static bool kvm_vmem_req_pending(struct kvm_vmem* vmem)
> +{
> +	return !list_empty(&vmem->req_list) ||
> +		!bitmap_empty(vmem->sync_req_bitmap, vmem->sync_req_max) ||
> +		(vmem->async_req_nr > 0);
> +}
> +
> +static unsigned int kvm_vmem_poll(struct file* filp, poll_table *wait)
> +{
> +	struct kvm_vmem *vmem = filp->private_data;
> +	unsigned int events = 0;
> +
> +	poll_wait(filp, &vmem->req_wait, wait);
> +
> +	spin_lock(&vmem->lock);
> +	if (kvm_vmem_req_pending(vmem))
> +		events |= POLLIN;
> +	spin_unlock(&vmem->lock);
> +
> +	return events;
> +}
> +
> +/*
> + * return value
> + * true: finished
> + * false: more request
> + */
> +static bool kvm_vmem_copy_page_request(struct kvm_vmem *vmem,
> +				       pgoff_t *pgoffs, int req_max,
> +				       int *req_nr)
> +{
> +	struct kvm_vmem_page_req_list *req_list;
> +	struct kvm_vmem_page_req_list *tmp;
> +
> +	unsigned long bit;
> +
> +	*req_nr = 0;
> +	list_for_each_entry_safe(req_list, tmp, &vmem->req_list, list) {
> +		list_del(&req_list->list);
> +		pgoffs[*req_nr] = req_list->pgoff;
> +		(*req_nr)++;
> +		if (*req_nr >= req_max)
> +			return false;
> +	}
> +
> +	bit = 0;
> +	for (;;) {
> +		bit = find_next_bit(vmem->sync_req_bitmap, vmem->sync_req_max,
> +				    bit);
> +		if (bit >= vmem->sync_req_max)
> +			break;
> +		pgoffs[*req_nr] = vmem->sync_req[bit];
> +		(*req_nr)++;
> +		clear_bit(bit, vmem->sync_req_bitmap);
> +		if (*req_nr >= req_max)
> +			return false;
> +		bit++;
> +	}
> +
> +	if (vmem->async_req_nr > 0) {
> +		int nr = min(req_max - *req_nr, vmem->async_req_nr);
> +		memcpy(pgoffs + *req_nr, vmem->async_req,
> +		       sizeof(*vmem->async_req) * nr);
> +		vmem->async_req_nr -= nr;
> +		*req_nr += nr;
> +		memmove(vmem->async_req, vmem->sync_req + nr,
> +			vmem->async_req_nr * sizeof(*vmem->async_req));
> +
> +	}
> +	return vmem->async_req_nr == 0;
> +}
> +
> +static int kvm_vmem_get_page_request(struct kvm_vmem *vmem,
> +				     struct kvm_vmem_page_request *page_req)
> +{
> +	DEFINE_WAIT(wait);
> +#define REQ_MAX	((__u32)32)
> +	pgoff_t pgoffs[REQ_MAX];
> +	__u32 req_copied = 0;
> +	int ret = 0;
> +
> +	spin_lock(&vmem->lock);
> +	for (;;) {
> +		prepare_to_wait(&vmem->req_wait, &wait, TASK_INTERRUPTIBLE);
> +		if (kvm_vmem_req_pending(vmem)) {
> +			break;
> +		}
> +		if (signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			break;
> +		}
> +		spin_unlock(&vmem->lock);
> +		schedule();
> +		spin_lock(&vmem->lock);
> +	}
> +	finish_wait(&vmem->req_wait, &wait);
> +	if (ret)
> +		goto out_unlock;
> +
> +	while (req_copied < page_req->nr) {
> +		int req_max;
> +		int req_nr;
> +		bool finished;
> +		req_max = min(page_req->nr - req_copied, REQ_MAX);
> +		finished = kvm_vmem_copy_page_request(vmem, pgoffs, req_max,
> +						      &req_nr);
> +
> +		spin_unlock(&vmem->lock);
> +
> +		if (req_nr > 0) {
> +			ret = 0;
> +			if (copy_to_user(page_req->pgoffs + req_copied, pgoffs,
> +					 sizeof(*pgoffs) * req_nr)) {
> +				ret = -EFAULT;
> +				goto out;
> +			}
> +		}
> +		req_copied += req_nr;
> +		if (finished)
> +			goto out;
> +
> +		spin_lock(&vmem->lock);
> +	}
> +
> +out_unlock:
> +	spin_unlock(&vmem->lock);
> +out:
> +	page_req->nr = req_copied;
> +	return ret;
> +}
> +
> +static int kvm_vmem_mark_page_cached(struct kvm_vmem *vmem,
> +				     struct kvm_vmem_page_cached *page_cached)
> +{
> +	int ret = 0;
> +#define PG_MAX	((__u32)32)
> +	__u64 pgoffs[PG_MAX];
> +	__u32 nr;
> +	unsigned long bit;
> +	bool wake_up_list = false;
> +
> +	nr = 0;
> +	while (nr < page_cached->nr) {
> +		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
> +		int i;
> +
> +		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> +				   sizeof(*pgoffs) * todo)) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +		for (i = 0; i < todo; ++i) {
> +			if (pgoffs[i] >= vmem->pgoff_end) {
> +				ret = -EINVAL;
> +				goto out;
> +			}
> +			set_bit(pgoffs[i], vmem->cached);
> +		}
> +		nr += todo;
> +	}
> +
> +	spin_lock(&vmem->lock);
> +	bit = 0;
> +	for (;;) {
> +		bit = find_next_bit(vmem->sync_wait_bitmap, vmem->sync_req_max,
> +				    bit);
> +		if (bit >= vmem->sync_req_max)
> +			break;
> +		if (test_bit(vmem->sync_req[bit], vmem->cached))
> +			wake_up(&vmem->page_wait[bit]);
> +		bit++;
> +	}
> +
> +	if (vmem->req_list_nr > 0)
> +		wake_up_list = true;
> +	spin_unlock(&vmem->lock);
> +
> +	if (wake_up_list)
> +		wake_up_all(&vmem->req_list_wait);
> +
> +out:
> +	return ret;
> +}
> +
> +static bool kvm_vmem_is_vmem_vma(const struct kvm_vmem *vmem,
> +				 const struct vm_area_struct *vma)
> +{
> +	return vma->vm_file && vma->vm_file->private_data == vmem;
> +}
> +
> +static void kvm_vmem_make_pages_present_entry(struct kvm_vmem *vmem,
> +					      struct kvm_vmem_page_range *range,
> +					      struct task_struct *task,
> +					      struct mm_struct *mm,
> +					      unsigned long vm_start)
> +{
> +	unsigned long pgoff = range->pgoff;
> +	unsigned long range_end = range->pgoff + range->nr_pages;
> +
> +	down_read(&mm->mmap_sem);
> +
> +	while (pgoff < range->pgoff + range->nr_pages) {
> +		unsigned long pgoff_end;
> +		struct vm_area_struct *vma;
> +		unsigned long saddr;
> +		unsigned long eaddr;
> +
> +		/* search unfaulted range */
> +		spin_lock(&vmem->lock);
> +		pgoff = find_next_zero_bit(vmem->faulted, range_end, pgoff);
> +		if (pgoff >= range_end) {
> +			spin_unlock(&vmem->lock);
> +			break;
> +		}
> +		pgoff_end = find_next_bit(vmem->faulted, range_end, pgoff);
> +		spin_unlock(&vmem->lock);
> +
> +		saddr = vm_start + (pgoff << PAGE_SHIFT);
> +		eaddr = vm_start + (pgoff_end << PAGE_SHIFT);
> +		vma = find_vma(mm, saddr);
> +		if (vma == NULL) {
> +			break;
> +		}
> +		if (eaddr < vma->vm_start) {
> +			pgoff = (vma->vm_start - vm_start) >> PAGE_SHIFT;
> +			continue;
> +		}
> +
> +		if (kvm_vmem_is_vmem_vma(vmem, vma)) {
> +			unsigned long start = max(vma->vm_start, saddr);
> +			unsigned long end = min(vma->vm_end, eaddr);
> +			int nr_pages = (end - start) >> PAGE_SHIFT;
> +			get_user_pages(task, mm, start, nr_pages,
> +				       1, 1, NULL, NULL);
> +			pgoff = (end - vm_start) >> PAGE_SHIFT;
> +		} else {
> +			pgoff = (vma->vm_end - vm_start) >> PAGE_SHIFT;
> +		}
> +	}
> +
> +	up_read(&mm->mmap_sem);
> +}
> +
> +static int kvm_vmem_make_pages_present(
> +	struct kvm_vmem *vmem,
> +	struct kvm_vmem_make_pages_present *pages_present)
> +{
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	pgoff_t pgoff_end;
> +	unsigned long vm_start;
> +	unsigned long vm_eaddr;
> +
> +#define NUM_ENTRIES	((__u32)32)
> +	struct kvm_vmem_page_range kranges[NUM_ENTRIES];
> +	__u32 nr = 0;
> +	int ret;
> +
> +	spin_lock(&vmem->lock);
> +	task = vmem->task;
> +	pgoff_end = vmem->pgoff_end;
> +	vm_start = vmem->vm_start;
> +	vm_eaddr = vm_start + vmem->size;
> +	spin_unlock(&vmem->lock);
> +	if (task == NULL)
> +		return 0;
> +	mm = get_task_mm(task);
> +	if (mm == NULL)
> +		return 0;
> +
> +	ret = 0;
> +	while (nr < pages_present->nr) {
> +		int nr_ranges = min(NUM_ENTRIES, pages_present->nr - nr);
> +		int i;
> +
> +		if (copy_from_user(&kranges, pages_present->ranges + nr,
> +				   sizeof(kranges[0]) * nr_ranges)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		for (i = 0; i < nr_ranges; ++i) {
> +			struct kvm_vmem_page_range *range = &kranges[i];
> +			if (range->pgoff >= pgoff_end ||
> +			    range->nr_pages >= pgoff_end ||
> +			    range->pgoff + range->nr_pages >= pgoff_end) {
> +				ret = -EINVAL;
> +				break;
> +			}
> +			kvm_vmem_make_pages_present_entry(vmem, range,
> +							  task, mm, vm_start);
> +		}
> +		nr += nr_ranges;
> +	}
> +
> +	mmput(mm);
> +	return ret;
> +}
> +
> +static int kvm_vmem_make_vma_anonymous(struct kvm_vmem *vmem)
> +{
> +#if 1
> +	return -ENOSYS;
> +#else
> +	unsigned long saddr;
> +	unsigned long eaddr;
> +	unsigned long addr;
> +	unsigned long bit;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +
> +	spin_lock(&vmem->lock);
> +	task = vmem->task;
> +	saddr = vmem->vm_start;
> +	eaddr = saddr + vmem->size;
> +	bit = find_first_zero_bit(vmem->faulted, vmem->pgoff_end);
> +	if (bit < vmem->pgoff_end) {
> +		spin_unlock(&vmem->lock);
> +		return -EBUSY;
> +	}
> +	spin_unlock(&vmem->lock);
> +	if (task == NULL)
> +		return 0;
> +	mm = get_task_mm(task);
> +	if (mm == NULL)
> +		return 0;
> +
> +	addr = saddr;
> +	down_write(&mm->mmap_sem);
> +	while (addr < eaddr) {
> +		struct vm_area_struct *vma;
> +		vma = find_vma(mm, addr);
> +		if (kvm_vmem_is_vmem_vma(vmem, vma)) {
> +			/* XXX incorrect. race/locking and more fix up */
> +			struct file *filp = vma->vm_file;
> +			vma->vm_ops->close(vma);
> +			vma->vm_ops = NULL;
> +			vma->vm_file = NULL;
> +			/* vma->vm_flags */
> +			fput(filp);
> +		}
> +		addr = vma->vm_end;
> +	}
> +	up_write(&mm->mmap_sem);
> +
> +	mmput(mm);
> +	return 0;
> +#endif
> +}
> +
> +static void kvm_vmem_ready(struct kvm_vmem *vmem)
> +{
> +	spin_lock(&vmem->lock);
> +	vmem->ready = true;
> +	spin_unlock(&vmem->lock);
> +	wake_up_interruptible(&vmem->ready_wait);
> +}
> +
> +static int kvm_vmem_wait_ready(struct kvm_vmem *vmem)
> +{
> +	int ret = 0;
> +	DEFINE_WAIT(wait);
> +
> +	spin_lock(&vmem->lock);
> +	for (;;) {
> +		prepare_to_wait(&vmem->ready_wait, &wait, TASK_INTERRUPTIBLE);
> +		if (vmem->ready) {
> +			break;
> +		}
> +		if (signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			break;
> +		}
> +		spin_unlock(&vmem->lock);
> +		schedule();
> +		spin_lock(&vmem->lock);
> +	}
> +	spin_unlock(&vmem->lock);
> +	finish_wait(&vmem->ready_wait, &wait);
> +	return ret;
> +}
> +
> +static long kvm_vmem_ioctl(struct file *filp, unsigned int ioctl,
> +			   unsigned long arg)
> +{
> +	struct kvm_vmem *vmem = filp->private_data;
> +	void __user *argp = (void __user *) arg;
> +	long ret = 0;
> +
> +	switch (ioctl) {
> +	case KVM_VMEM_READY:
> +		kvm_vmem_ready(vmem);
> +		ret = 0;
> +		break;
> +	case KVM_VMEM_WAIT_READY:
> +		ret = kvm_vmem_wait_ready(vmem);
> +		break;
> +	case KVM_VMEM_GET_PAGE_REQUEST: {
> +		struct kvm_vmem_page_request page_request;
> +		ret = -EFAULT;
> +		if (copy_from_user(&page_request, argp, sizeof(page_request)))
> +			break;
> +		ret = kvm_vmem_get_page_request(vmem, &page_request);
> +		if (ret == 0 &&
> +		    copy_to_user(argp +
> +				 offsetof(struct kvm_vmem_page_request, nr),
> +				 &page_request.nr,
> +				 sizeof(page_request.nr))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		break;
> +	}
> +	case KVM_VMEM_MARK_PAGE_CACHED: {
> +		struct kvm_vmem_page_cached page_cached;
> +		ret = -EFAULT;
> +		if (copy_from_user(&page_cached, argp, sizeof(page_cached)))
> +			break;
> +		ret = kvm_vmem_mark_page_cached(vmem, &page_cached);
> +		break;
> +	}
> +	case KVM_VMEM_MAKE_PAGES_PRESENT: {
> +		struct kvm_vmem_make_pages_present pages_present;
> +		ret = -EFAULT;
> +		if (copy_from_user(&pages_present, argp,
> +				   sizeof(pages_present)))
> +			break;
> +		ret = kvm_vmem_make_pages_present(vmem, &pages_present);
> +		break;
> +	}
> +	case KVM_VMEM_MAKE_VMA_ANONYMOUS:
> +		ret = kvm_vmem_make_vma_anonymous(vmem);
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +	return ret;
> +}
> +
> +static unsigned long kvm_vmem_bitmap_bytes(const struct kvm_vmem *vmem)
> +{
> +	return round_up(vmem->pgoff_end, BITS_PER_LONG) / 8;
> +}
> +
> +static void kvm_vmem_free(struct kvm_vmem *vmem)
> +{
> +	if (vmem->task) {
> +		put_task_struct(vmem->task);
> +		vmem->task = NULL;
> +	}
> +
> +	if (vmem->shmem_filp)
> +		fput(vmem->shmem_filp);
> +	if (kvm_vmem_bitmap_bytes(vmem) > PAGE_SIZE) {
> +		vfree(vmem->cached);
> +		vfree(vmem->faulted);
> +	} else {
> +		kfree(vmem->cached);
> +		kfree(vmem->faulted);
> +	}
> +	kfree(vmem->vma);
> +	kfree(vmem->async_req);
> +	kfree(vmem->sync_req_bitmap);
> +	kfree(vmem->sync_wait_bitmap);
> +	kfree(vmem->page_wait);
> +	kfree(vmem->sync_req);
> +	kfree(vmem);
> +}
> +
> +static int kvm_vmem_release(struct inode *inode, struct file *filp)
> +{
> +	struct kvm_vmem *vmem = filp->private_data;
> +	kvm_vmem_free(vmem);
> +	return 0;
> +}
> +
> +static struct file_operations kvm_vmem_fops = {
> +	.release = kvm_vmem_release,
> +	.unlocked_ioctl = kvm_vmem_ioctl,
> +	.mmap = kvm_vmem_mmap,
> +	.poll = kvm_vmem_poll,
> +	.llseek = noop_llseek,
> +};
> +
> +static int kvm_create_vmem(struct kvm_vmem_create *create)
> +{
> +	int error = 0;
> +	struct kvm_vmem *vmem = NULL;
> +	struct vm_area_struct *vma = NULL;
> +	int shmem_fd;
> +	unsigned long bitmap_bytes;
> +	unsigned long sync_bitmap_bytes;
> +	int i;
> +
> +	vmem = kzalloc(sizeof(*vmem), GFP_KERNEL);
> +	vmem->task = NULL;

Is this needed, doesn't kzalloc() return zeroed memory?

> +	vmem->mmapped = false;
> +	spin_lock_init(&vmem->lock);
> +	vmem->size = roundup(create->size, PAGE_SIZE);
> +	vmem->pgoff_end = vmem->size >> PAGE_SHIFT;
> +	init_waitqueue_head(&vmem->req_wait);
> +
> +	vma = kzalloc(sizeof(*vma), GFP_KERNEL);
> +	vma->vm_start = 0;

Also here.

> +	vma->vm_end = vmem->size;
> +	/* this shmem file is used for temporal buffer for pages
> +	   so it's unlikely that so many pages exists in this shmem file */
> +	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> +		VM_DONTEXPAND;
> +	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> +	vma->vm_pgoff = 0;
> +	INIT_LIST_HEAD(&vma->anon_vma_chain);
> +
> +	vmem->vma = vma;
> +
> +	shmem_fd = get_unused_fd();
> +	if (shmem_fd < 0) {
> +		error = shmem_fd;
> +		goto out;
> +	}
> +	error = shmem_zero_setup(vma);
> +	if (error < 0) {
> +		put_unused_fd(shmem_fd);
> +		goto out;
> +	}
> +	vmem->shmem_filp = vma->vm_file;
> +	get_file(vmem->shmem_filp);
> +	fd_install(shmem_fd, vma->vm_file);
> +	create->shmem_fd = shmem_fd;
> +
> +	create->vmem_fd = anon_inode_getfd("kvm-vmem",
> +					   &kvm_vmem_fops, vmem, O_RDWR);
> +	if (create->vmem_fd < 0) {
> +		error = create->vmem_fd;
> +		goto out;
> +	}
> +
> +	bitmap_bytes = kvm_vmem_bitmap_bytes(vmem);
> +	if (bitmap_bytes > PAGE_SIZE) {
> +		vmem->cached = vzalloc(bitmap_bytes);
> +		vmem->faulted = vzalloc(bitmap_bytes);
> +	} else {
> +		vmem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> +		vmem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> +	}
> +
> +#define ASYNC_REQ_MAX	(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> +	vmem->async_req_max = ASYNC_REQ_MAX;
> +	vmem->async_req_nr = 0;
> +	vmem->async_req = kzalloc(sizeof(*vmem->async_req), GFP_KERNEL);
> +
> +#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
> +	vmem->sync_req_max = round_up(SYNC_REQ_MAX, BITS_PER_LONG);
> +	sync_bitmap_bytes = sizeof(unsigned long) *
> +		(vmem->sync_req_max / BITS_PER_LONG);
> +	vmem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +	vmem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +	vmem->page_wait = kzalloc(sizeof(*vmem->page_wait) *
> +				  vmem->sync_req_max, GFP_KERNEL);
> +	for (i = 0; i < vmem->sync_req_max; ++i)
> +		init_waitqueue_head(&vmem->page_wait[i]);
> +	vmem->sync_req = kzalloc(sizeof(*vmem->sync_req) *
> +				 vmem->sync_req_max, GFP_KERNEL);
> +
> +	vmem->req_list_nr = 0;
> +	INIT_LIST_HEAD(&vmem->req_list);
> +	init_waitqueue_head(&vmem->req_list_wait);
> +
> +	init_waitqueue_head(&vmem->ready_wait);
> +	vmem->ready = false;
> +
> +	return 0;
> +
> + out:
> +	kvm_vmem_free(vmem);
> +	return error;
> +}
> +
> +static long kvm_vmem_dev_ioctl(struct file *filp, unsigned int ioctl,
> +			       unsigned long arg)
> +{
> +	void __user *argp = (void __user *) arg;
> +	long ret;
> +
> +	switch (ioctl) {
> +	case KVM_CREATE_VMEM: {
> +		struct kvm_vmem_create create;
> +		if (copy_from_user(&create, argp, sizeof(create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		ret = kvm_create_vmem(&create);
> +		if (copy_to_user(argp, &create, sizeof(create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		break;
> +	}
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +	return ret;
> +}
> +
> +static int kvm_vmem_dev_release(struct inode *inode, struct file *filp)
> +{
> +	return 0;
> +}
> +
> +static struct file_operations kvm_vmem_dev_fops = {
> +	.release = kvm_vmem_dev_release,
> +	.unlocked_ioctl = kvm_vmem_dev_ioctl,
> +};
> +
> +long kvm_dev_ioctl_create_vmem_dev(void)
> +{
> +	return anon_inode_getfd("kvm-vmem-dev", &kvm_vmem_dev_fops,
> +				NULL, O_RDWR);
> +}
> diff --git a/virt/kvm/vmem.h b/virt/kvm/vmem.h
> new file mode 100644
> index 0000000..bc7e8cf
> --- /dev/null
> +++ b/virt/kvm/vmem.h
> @@ -0,0 +1,68 @@
> +/*
> + * KVM post copy vmem
> + *
> + * Copyright (c) 2011,
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.

Old address also here.

> + */
> +
> +#ifndef __KVM_VMEM_H__
> +#define __KVM_VMEM_H__
> +
> +struct kvm_vmem_page_req_list {
> +	struct list_head list;
> +	pgoff_t pgoff;
> +};
> +
> +struct kvm_vmem {
> +	loff_t size;
> +	pgoff_t pgoff_end;
> +	spinlock_t lock;
> +
> +	wait_queue_head_t req_wait;
> +
> +	int async_req_max;
> +	int async_req_nr;
> +	pgoff_t *async_req;
> +
> +	int sync_req_max;

'int' between pointers would mean 4 bytes of structure padding on 64 bit hosts.

> +	unsigned long *sync_req_bitmap;
> +	unsigned long *sync_wait_bitmap;
> +	pgoff_t *sync_req;
> +	wait_queue_head_t *page_wait;
> +
> +	int req_list_nr;
> +	struct list_head req_list;
> +	wait_queue_head_t req_list_wait;
> +
> +	unsigned long *cached;
> +	unsigned long *faulted;
> +
> +	bool mmapped;
> +	unsigned long vm_start;
> +	unsigned int vma_nr;
> +	struct task_struct *task;
> +
> +	wait_queue_head_t ready_wait;
> +	bool ready;
> +
> +	struct file *shmem_filp;
> +	struct vm_area_struct *vma;
> +};
> +
> +#endif /* __KVM_VMEM_H__ */
> --
> 1.7.1.1
>
>
> --
> yamahata
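[Editorial aside: the padding remarks in the review above can be checked from user space. A minimal standalone sketch, assuming <stdint.h> types in place of the kernel's __u32/__u64 and struct names invented for illustration, that shows the 4-byte hole after 'nr' on an LP64 host and one possible reordering:]

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* layout as posted: on LP64 the pointer needs 8-byte alignment,
 * so the compiler inserts a 4-byte hole after 'nr' */
struct page_request_as_posted {
	uint32_t nr;
	uint64_t *pgoffs;
};

/* pointer first: no interior hole, though the struct still rounds
 * up to 16 bytes; an explicit pad field (or making 'nr' 64-bit)
 * would keep 32-bit and 64-bit userland layouts identical, which
 * matters for an ioctl ABI */
struct page_request_reordered {
	uint64_t *pgoffs;
	uint32_t nr;
};

int main(void)
{
	printf("as posted: sizeof=%zu, pgoffs at offset %zu\n",
	       sizeof(struct page_request_as_posted),
	       offsetof(struct page_request_as_posted, pgoffs));
	printf("reordered: sizeof=%zu, nr at offset %zu\n",
	       sizeof(struct page_request_reordered),
	       offsetof(struct page_request_reordered, nr));
	return 0;
}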
On 08/12/2011 04:07 AM, Isaku Yamahata wrote:
> This is a character device to hook page access.
> The page fault in the area is reported to another user process by
> this chardriver. Then, the process fills the page contents and
> resolves the page fault.

Have you considered CUSE (character device in userspace, fs/fuse/cuse.c)?

> index 55f5afb..623109e 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
>  #define KVM_CAP_PPC_SMT 64
>  #define KVM_CAP_PPC_RMA 65
>  #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
> +#define KVM_CAP_POST_COPY_MEMORY 67
>
>  #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -760,6 +761,50 @@ struct kvm_clock_data {
>  /* Available with KVM_CAP_RMA */
>  #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
>
> +struct kvm_vmem_create {
> +	__u64 size; /* in bytes */
> +	__s32 vmem_fd;
> +	__s32 shmem_fd;
> +};

Should really be outside kvm.h (and virt/kvm), since it's not kvm specific.

> +
> +struct kvm_vmem_page_request {
> +	__u32 nr;
> +	__u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_cached {
> +	__u32 nr;
> +	__u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_range {
> +	__u64 pgoff;
> +	__u64 nr_pages;
> +};
> +
> +struct kvm_vmem_make_pages_present {
> +	__u32 nr;
> +	struct kvm_vmem_page_range __user *ranges;
> +};

This is madvise(MADV_WILLNEED), is it not?

> +
> +/* Available with KVM_CAP_POST_COPY_MEMORY */
> +#define KVM_CREATE_VMEM_DEV _IO(KVMIO, 0xb0)
> +
> +/* ioctl for vmem_dev fd */
> +#define KVM_CREATE_VMEM _IOR(KVMIO, 0xb1, __u32)
> +
> +/* ioctl for vmem fd */
> +#define KVM_VMEM_WAIT_READY _IO(KVMIO, 0xb2)
> +#define KVM_VMEM_READY _IO(KVMIO, 0xb3)
> +#define KVM_VMEM_GET_PAGE_REQUEST \
> +	_IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
> +#define KVM_VMEM_MARK_PAGE_CACHED \
> +	_IOW(KVMIO, 0xb5, struct kvm_vmem_page_cached)
> +#define KVM_VMEM_MAKE_PAGES_PRESENT \
> +	_IOW(KVMIO, 0xb6, struct kvm_vmem_make_pages_present)
> +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7)

Can you explain these in some more detail?
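[Editorial aside: for reference, the madvise(2) call being compared against KVM_VMEM_MAKE_PAGES_PRESENT. A minimal sketch (the helper name prefault_range is hypothetical); the key difference, as the reply below explains, is that madvise can only hint the calling process's own mapping, while the proposed ioctl is issued by a separate daemon against the qemu process:]

#include <sys/mman.h>

/* Hint the kernel to read a range ahead so later accesses do not
 * block on major faults. Works only on the caller's own mapping. */
static int prefault_range(void *addr, size_t len)
{
	return madvise(addr, len, MADV_WILLNEED);
}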
On Mon, Aug 15, 2011 at 12:29:37PM -0700, Avi Kivity wrote:
> On 08/12/2011 04:07 AM, Isaku Yamahata wrote:
>> This is a character device to hook page access.
>> The page fault in the area is reported to another user process by
>> this chardriver. Then, the process fills the page contents and
>> resolves the page fault.
>
> Have you considered CUSE (character device in userspace, fs/fuse/cuse.c)?

By looking at dev.c and cuse.c, it doesn't seem to support mmap and
a fault handler.

>> index 55f5afb..623109e 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
>>  #define KVM_CAP_PPC_SMT 64
>>  #define KVM_CAP_PPC_RMA 65
>>  #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
>> +#define KVM_CAP_POST_COPY_MEMORY 67
>>
>>  #ifdef KVM_CAP_IRQ_ROUTING
>>
>> @@ -760,6 +761,50 @@ struct kvm_clock_data {
>>  /* Available with KVM_CAP_RMA */
>>  #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
>>
>> +struct kvm_vmem_create {
>> +	__u64 size; /* in bytes */
>> +	__s32 vmem_fd;
>> +	__s32 shmem_fd;
>> +};
>
> Should really be outside kvm.h (and virt/kvm), since it's not kvm specific.

Okay. I'll un-kvm it.

>> +
>> +struct kvm_vmem_page_request {
>> +	__u32 nr;
>> +	__u64 __user *pgoffs;
>> +};
>> +
>> +struct kvm_vmem_page_cached {
>> +	__u32 nr;
>> +	__u64 __user *pgoffs;
>> +};
>> +
>> +struct kvm_vmem_page_range {
>> +	__u64 pgoff;
>> +	__u64 nr_pages;
>> +};
>> +
>> +struct kvm_vmem_make_pages_present {
>> +	__u32 nr;
>> +	struct kvm_vmem_page_range __user *ranges;
>> +};
>
> This is madvise(MADV_WILLNEED), is it not?

Another process, not the qemu process, issues it,
and it makes the pages present in the qemu process address space.

>> +
>> +/* Available with KVM_CAP_POST_COPY_MEMORY */
>> +#define KVM_CREATE_VMEM_DEV _IO(KVMIO, 0xb0)
>> +
>> +/* ioctl for vmem_dev fd */
>> +#define KVM_CREATE_VMEM _IOR(KVMIO, 0xb1, __u32)
>> +
>> +/* ioctl for vmem fd */
>> +#define KVM_VMEM_WAIT_READY _IO(KVMIO, 0xb2)
>> +#define KVM_VMEM_READY _IO(KVMIO, 0xb3)
>> +#define KVM_VMEM_GET_PAGE_REQUEST \
>> +	_IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
>> +#define KVM_VMEM_MARK_PAGE_CACHED \
>> +	_IOW(KVMIO, 0xb5, struct kvm_vmem_page_cached)
>> +#define KVM_VMEM_MAKE_PAGES_PRESENT \
>> +	_IOW(KVMIO, 0xb6, struct kvm_vmem_make_pages_present)
>> +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7)
>
> Can you explain these in some more detail?

KVM_CREATE_VMEM_DEV: create vmem-dev device from kvm device
                     for qemu
KVM_CREATE_VMEM: create vmem device from vmem-dev device.
                 (note: qemu creates more than one memory region.)

KVM_VMEM_WAIT_READY: wait for KVM_VMEM_READY
                     for qemu
KVM_VMEM_READY: unblock KVM_VMEM_WAIT_READY
                for daemon uses
These are for qemu and the daemon to synchronise to enter the postcopy stage.

KVM_VMEM_GET_PAGE_REQUEST: retrieve page faults of the qemu process
KVM_VMEM_MARK_PAGE_CACHED: mark the specified pages pulled from the source
                           for daemon uses
KVM_VMEM_MAKE_PAGES_PRESENT: make the specified pages present in the qemu
                             virtual address space
                             for daemon uses
KVM_VMEM_MAKE_VMA_ANONYMOUS: make the specified vma in the qemu process
                             anonymous
                             I'm not sure whether this can be implemented
                             or not.

I think the following work flow on the destination helps.
qemu on the destination
    |
    V
open(/dev/kvm)
    |
    V
KVM_CREATE_VMEM_DEV
    |
    V
Here we have two file descriptors to
vmem device and shmem file
    |
    |                                  daemon on the destination
    V
fork()---------------------------------------,
    |                                        |
    V                                        |
close(socket)                                V
close(shmem)                           mmap(shmem file)
    |                                        |
    V                                        V
mmap(vmem device) for guest RAM        close(shmem file)
    |                                        |
    V                                        |
KVM_VMEM_READY_WAIT <---------------------KVM_VMEM_READY
    |                                        |
    V                                        |
close(vmem device)                     Here the daemon takes over
    |                                  the owner of the socket
entering post copy stage               to the source
start guest execution                        |
    |                                        |
    V                                        V
access guest RAM                       KVM_VMEM_GET_PAGE_REQUEST
    |                                        |
    V                                        V
page fault ------------------------------>page offset is returned
block                                        |
                                             V
                                       pull page from the source
                                       write the page contents
                                       to the shmem.
                                             |
                                             V
unblock <-----------------------------KVM_VMEM_MARK_PAGE_CACHED
the fault handler returns the page
page fault is resolved
    |
    |                                  pages can be pulled
    |                                  backgroundly
    |                                        |
    |                                        V
    |                                  KVM_VMEM_MARK_PAGE_CACHED
    |                                        |
    V                                        V
The specified pages <---------------------KVM_VMEM_MAKE_PAGES_PRESENT
are made present                             |
so future page faults are avoided.           |
    |                                        |
    V                                        V
all the pages are pulled from the source
    |                                        |
    V                                        V
the vma becomes anonymous <---------------KVM_VMEM_MAKE_VMA_ANONYMOUS
(note: I'm not sure if this can be implemented or not)
    |                                        |
    V                                        V
migration completes                    exit()

thanks,
--
yamahata
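[Editorial aside: the daemon column of this work flow corresponds to the request-service loop in the sample program at the top of the page. A condensed sketch of that loop (error handling omitted; the same my-kvm.h header as the sample program is assumed, and fill_page() is a hypothetical stand-in for pulling page contents from the migration source):]

#include <stdlib.h>
#include <sys/ioctl.h>
#define __user
#include "my-kvm.h"	/* same header as the sample program above */

extern void fill_page(void *dst);	/* hypothetical: pull one page from the source */

/* Wait for faulting page offsets, fill the backing shmem,
 * then mark those pages cached so the faulters wake up. */
void serve_pages(int vmem_fd, void *shmem, size_t page_size, int nr_pages)
{
	__u64 *pgoffs = malloc(sizeof(*pgoffs) * nr_pages);
	struct kvm_vmem_page_request req = { .pgoffs = pgoffs };
	struct kvm_vmem_page_cached cached = { .pgoffs = pgoffs };
	int served = 0;

	while (served < nr_pages) {
		req.nr = nr_pages;
		/* blocks until qemu faults on an uncached page */
		ioctl(vmem_fd, KVM_VMEM_GET_PAGE_REQUEST, &req);

		for (__u32 i = 0; i < req.nr; ++i)
			fill_page(shmem + page_size * pgoffs[i]);

		cached.nr = req.nr;
		ioctl(vmem_fd, KVM_VMEM_MARK_PAGE_CACHED, &cached);
		served += req.nr;
	}
	free(pgoffs);
}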
On 08/15/2011 06:42 PM, Isaku Yamahata wrote:
> On Mon, Aug 15, 2011 at 12:29:37PM -0700, Avi Kivity wrote:
> > On 08/12/2011 04:07 AM, Isaku Yamahata wrote:
> >> This is a character device to hook page access.
> >> The page fault in the area is reported to another user process by
> >> this chardriver. Then, the process fills the page contents and
> >> resolves the page fault.
> >
> > Have you considered CUSE (character device in userspace, fs/fuse/cuse.c)?
>
> By looking at dev.c and cuse.c, it doesn't seem to support mmap and
> a fault handler.

If performance is sufficient, this would be the preferred path. Enhance an
existing API which can be useful to others, rather than add a new one.

> >> +
> >> +struct kvm_vmem_make_pages_present {
> >> +	__u32 nr;
> >> +	struct kvm_vmem_page_range __user *ranges;
> >> +};
> >
> > This is madvise(MADV_WILLNEED), is it not?
>
> Another process, not the qemu process, issues it,
> and it makes the pages present in the qemu process address space.

That process just issues these calls in a loop until all memory is present,
yes? It seems those few lines could be easily added to qemu.

> >
> > Can you explain these in some more detail?
>
> KVM_CREATE_VMEM_DEV: create vmem-dev device from kvm device
>                      for qemu
> KVM_CREATE_VMEM: create vmem device from vmem-dev device.
>                  (note: qemu creates more than one memory region.)
>
> KVM_VMEM_WAIT_READY: wait for KVM_VMEM_READY
>                      for qemu
> KVM_VMEM_READY: unblock KVM_VMEM_WAIT_READY
>                 for daemon uses
> These are for qemu and the daemon to synchronise to enter the postcopy stage.

These are eliminated if we fold the daemon into qemu. Also, this could just
be a semaphore or other synchronization mechanism.

> KVM_VMEM_GET_PAGE_REQUEST: retrieve page faults of the qemu process

Equivalent to the fault callback of CUSE (if we add it)?

> KVM_VMEM_MARK_PAGE_CACHED: mark the specified pages pulled from the source
>                            for daemon uses

Equivalent to returning from that callback with a new page?

> KVM_VMEM_MAKE_PAGES_PRESENT: make the specified pages present in the qemu
>                              virtual address space
>                              for daemon uses
> KVM_VMEM_MAKE_VMA_ANONYMOUS: make the specified vma in the qemu process
>                              anonymous
>                              I'm not sure whether this can be implemented
>                              or not.
>
> I think the following work flow on the destination helps.
>
> qemu on the destination
>     |
>     V
> open(/dev/kvm)
>     |
>     V
> KVM_CREATE_VMEM_DEV
>     |
>     V
> Here we have two file descriptors to
> vmem device and shmem file
>     |
>     |                                  daemon on the destination
>     V
> fork()---------------------------------------,
>     |                                        |
>     V                                        |
> close(socket)                                V
> close(shmem)                           mmap(shmem file)
>     |                                        |
>     V                                        V
> mmap(vmem device) for guest RAM        close(shmem file)
>     |                                        |
>     V                                        |
> KVM_VMEM_READY_WAIT <---------------------KVM_VMEM_READY
>     |                                        |
>     V                                        |
> close(vmem device)                     Here the daemon takes over
>     |                                  the owner of the socket
> entering post copy stage               to the source
> start guest execution                        |
>     |                                        |
>     V                                        V
> access guest RAM                       KVM_VMEM_GET_PAGE_REQUEST
>     |                                        |
>     V                                        V
> page fault ------------------------------>page offset is returned
> block                                        |
>                                              V
>                                        pull page from the source
>                                        write the page contents
>                                        to the shmem.
>                                              |
>                                              V
> unblock <-----------------------------KVM_VMEM_MARK_PAGE_CACHED
> the fault handler returns the page
> page fault is resolved
>     |
>     |                                  pages can be pulled
>     |                                  backgroundly
>     |                                        |
>     |                                        V
>     |                                  KVM_VMEM_MARK_PAGE_CACHED
>     |                                        |
>     V                                        V
> The specified pages <---------------------KVM_VMEM_MAKE_PAGES_PRESENT
> are made present                             |
> so future page faults are avoided.           |
>     |                                        |
>     V                                        V
> all the pages are pulled from the source
>     |                                        |
>     V                                        V
> the vma becomes anonymous <---------------KVM_VMEM_MAKE_VMA_ANONYMOUS
> (note: I'm not sure if this can be implemented or not)
>     |                                        |
>     V                                        V
> migration completes                    exit()

Yes, thanks, this was very helpful.
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 0a09b58..dcbd52e 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select KVM_ASYNC_PF + select KVM_VMEM select USER_RETURN_NOTIFIER select KVM_MMIO select TASKSTATS diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..6125f4c 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -10,6 +10,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ assigned-dev.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) +kvm-$(CONFIG_KVM_VMEM) += $(addprefix ../../../virt/kvm/, vmem.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o timer.o diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 55f5afb..623109e 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA 65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ +#define KVM_CAP_POST_COPY_MEMORY 67 #ifdef KVM_CAP_IRQ_ROUTING @@ -760,6 +761,50 @@ struct kvm_clock_data { /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) +struct kvm_vmem_create { + __u64 size; /* in bytes */ + __s32 vmem_fd; + __s32 shmem_fd; +}; + +struct kvm_vmem_page_request { + __u32 nr; + __u64 __user *pgoffs; +}; + +struct kvm_vmem_page_cached { + __u32 nr; + __u64 __user *pgoffs; +}; + +struct kvm_vmem_page_range { + __u64 pgoff; + __u64 nr_pages; +}; + +struct kvm_vmem_make_pages_present { + __u32 nr; + struct kvm_vmem_page_range __user *ranges; +}; + +/* Available with KVM_CAP_POST_COPY_MEMORY */ +#define KVM_CREATE_VMEM_DEV _IO(KVMIO, 0xb0) + +/* ioctl for vmem_dev fd */ +#define KVM_CREATE_VMEM _IOR(KVMIO, 0xb1, __u32) + +/* ioctl for vmem fd */ +#define KVM_VMEM_WAIT_READY _IO(KVMIO, 0xb2) +#define KVM_VMEM_READY _IO(KVMIO, 0xb3) +#define KVM_VMEM_GET_PAGE_REQUEST \ + _IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request) +#define KVM_VMEM_MARK_PAGE_CACHED \ + _IOW(KVMIO, 0xb5, struct kvm_vmem_page_cached) +#define KVM_VMEM_MAKE_PAGES_PRESENT \ + _IOW(KVMIO, 0xb6, struct kvm_vmem_make_pages_present) +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7) + + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) struct kvm_assigned_pci_dev { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ff4d406..8b3dafa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -222,6 +222,8 @@ struct kvm_irq_routing_table {}; #endif +long kvm_dev_ioctl_create_vmem_dev(void); + struct kvm_memslots { int nmemslots; u64 generation; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e013b8e..7f3fc4e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2838,6 +2838,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return ret; } +EXPORT_SYMBOL_GPL(mem_cgroup_cache_charge); /* * While swap-in, try_charge -> commit or cancel, the page is locked. diff --git a/mm/shmem.c b/mm/shmem.c index fcedf54..ae7d61f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3035,6 +3035,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } +EXPORT_SYMBOL_GPL(shmem_zero_setup); /** * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index f63ccb0..d3040ea 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -18,3 +18,6 @@ config KVM_MMIO config KVM_ASYNC_PF bool + +config KVM_VMEM + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aefdda3..9e47e20 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2184,6 +2184,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_SET_BOOT_CPU_ID: #endif case KVM_CAP_INTERNAL_ERROR_DATA: + case KVM_CAP_POST_COPY_MEMORY: return 1; #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQ_ROUTING: @@ -2233,6 +2234,11 @@ static long kvm_dev_ioctl(struct file *filp, case KVM_TRACE_DISABLE: r = -EOPNOTSUPP; break; +#ifdef CONFIG_KVM_VMEM + case KVM_CREATE_VMEM_DEV: + r = kvm_dev_ioctl_create_vmem_dev(); + break; +#endif default: return kvm_arch_dev_ioctl(filp, ioctl, arg); } diff --git a/virt/kvm/vmem.c b/virt/kvm/vmem.c new file mode 100644 index 0000000..b413663 --- /dev/null +++ b/virt/kvm/vmem.c @@ -0,0 +1,847 @@ +/* + * KVM post copy vmem + * + * Copyright (c) 2011, + * National Institute of Advanced Industrial Science and Technology + * + * https://sites.google.com/site/grivonhome/quick-kvm-migration + * Author: Isaku Yamahata <yamahata at valinux co jp> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/pagemap.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include "vmem.h" + +static void kvm_vmem_release_fake_vmf(int ret, struct vm_fault *fake_vmf) +{ + if (ret & VM_FAULT_LOCKED) { + unlock_page(fake_vmf->page); + } + page_cache_release(fake_vmf->page); +} + +static int kvm_vmem_minor_fault(struct kvm_vmem *vmem, + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct vm_fault fake_vmf; + int ret; + struct page *page; + + BUG_ON(!test_bit(vmf->pgoff, vmem->cached)); + fake_vmf = *vmf; + fake_vmf.page = NULL; + ret = vmem->vma->vm_ops->fault(vmem->vma, &fake_vmf); + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)) + return ret; + + /* + * TODO: pull out fake_vmf->page from shmem file and donate it + * to this vma resolving the page fault. 
+ * vmf->page = fake_vmf->page; + */ + + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (!page) + return VM_FAULT_OOM; + if (mem_cgroup_cache_charge(page, vma->vm_mm, GFP_KERNEL)) { + kvm_vmem_release_fake_vmf(ret, &fake_vmf); + page_cache_release(page); + return VM_FAULT_OOM; + } + + copy_highpage(page, fake_vmf.page); + kvm_vmem_release_fake_vmf(ret, &fake_vmf); + + ret |= VM_FAULT_LOCKED; + SetPageUptodate(page); + vmf->page = page; + set_bit(vmf->pgoff, vmem->faulted); + + return ret; +} + +static int kvm_vmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *filp = vma->vm_file; + struct kvm_vmem *vmem = filp->private_data; + + if (vmf->pgoff >= vmem->pgoff_end) { + return VM_FAULT_SIGBUS; + } + + BUG_ON(test_bit(vmf->pgoff, vmem->faulted)); + + if (!test_bit(vmf->pgoff, vmem->cached)) { + /* major fault */ + unsigned long bit; + DEFINE_WAIT(wait); + + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) { + /* async page fault */ + spin_lock(&vmem->lock); + if (vmem->async_req_nr < vmem->async_req_max) { + vmem->async_req[vmem->async_req_nr] = + vmf->pgoff; + vmem->async_req_nr++; + } + spin_unlock(&vmem->lock); + wake_up_poll(&vmem->req_wait, POLLIN); + + if (test_bit(vmf->pgoff, vmem->cached)) + return kvm_vmem_minor_fault(vmem, vma, vmf); + return VM_FAULT_MAJOR | VM_FAULT_RETRY; + } + + spin_lock(&vmem->lock); + bit = find_first_zero_bit(vmem->sync_wait_bitmap, + vmem->sync_req_max); + if (likely(bit < vmem->sync_req_max)) { + vmem->sync_req[bit] = vmf->pgoff; + prepare_to_wait(&vmem->page_wait[bit], &wait, + TASK_UNINTERRUPTIBLE); + set_bit(bit, vmem->sync_req_bitmap); + set_bit(bit, vmem->sync_wait_bitmap); + spin_unlock(&vmem->lock); + wake_up_poll(&vmem->req_wait, POLLIN); + + if (!test_bit(vmf->pgoff, vmem->cached)) + schedule(); + finish_wait(&vmem->page_wait[bit], &wait); + clear_bit(bit, vmem->sync_wait_bitmap); + } else { + struct kvm_vmem_page_req_list page_req_list = { + .pgoff = vmf->pgoff, + }; + vmem->req_list_nr++; + list_add_tail(&page_req_list.list, &vmem->req_list); + wake_up_poll(&vmem->req_wait, POLLIN); + for (;;) { + prepare_to_wait(&vmem->req_list_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (test_bit(vmf->pgoff, vmem->cached)) { + vmem->req_list_nr--; + break; + } + spin_unlock(&vmem->lock); + schedule(); + spin_lock(&vmem->lock); + } + spin_unlock(&vmem->lock); + finish_wait(&vmem->req_list_wait, &wait); + } + + return kvm_vmem_minor_fault(vmem, vma, vmf) | VM_FAULT_MAJOR; + } + + return kvm_vmem_minor_fault(vmem, vma, vmf); +} + +/* for partial munmap */ +static void kvm_vmem_vma_open(struct vm_area_struct *vma) +{ + struct file *filp = vma->vm_file; + struct kvm_vmem *vmem = filp->private_data; + + spin_lock(&vmem->lock); + vmem->vma_nr++; + spin_unlock(&vmem->lock); +} + +static void kvm_vmem_vma_close(struct vm_area_struct *vma) +{ + struct file *filp = vma->vm_file; + struct kvm_vmem *vmem = filp->private_data; + struct task_struct *task = NULL; + + spin_lock(&vmem->lock); + vmem->vma_nr--; + if (vmem->vma_nr == 0) { + task = vmem->task; + vmem->task = NULL; + } + spin_unlock(&vmem->lock); + + if (task) + put_task_struct(task); +} + +static const struct vm_operations_struct kvm_vmem_vm_ops = { + .open = kvm_vmem_vma_open, + .close = kvm_vmem_vma_close, + .fault = kvm_vmem_fault, +}; + +static int kvm_vmem_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct kvm_vmem *vmem = filp->private_data; + int error; + + /* allow mmap() only once */ + spin_lock(&vmem->lock); + if (vmem->mmapped) { + error = -EBUSY; + 
+/* for partial munmap */
+static void kvm_vmem_vma_open(struct vm_area_struct *vma)
+{
+        struct file *filp = vma->vm_file;
+        struct kvm_vmem *vmem = filp->private_data;
+
+        spin_lock(&vmem->lock);
+        vmem->vma_nr++;
+        spin_unlock(&vmem->lock);
+}
+
+static void kvm_vmem_vma_close(struct vm_area_struct *vma)
+{
+        struct file *filp = vma->vm_file;
+        struct kvm_vmem *vmem = filp->private_data;
+        struct task_struct *task = NULL;
+
+        spin_lock(&vmem->lock);
+        vmem->vma_nr--;
+        if (vmem->vma_nr == 0) {
+                task = vmem->task;
+                vmem->task = NULL;
+        }
+        spin_unlock(&vmem->lock);
+
+        if (task)
+                put_task_struct(task);
+}
+
+static const struct vm_operations_struct kvm_vmem_vm_ops = {
+        .open = kvm_vmem_vma_open,
+        .close = kvm_vmem_vma_close,
+        .fault = kvm_vmem_fault,
+};
+
+static int kvm_vmem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+        struct kvm_vmem *vmem = filp->private_data;
+        int error;
+
+        /* allow mmap() only once */
+        spin_lock(&vmem->lock);
+        if (vmem->mmapped) {
+                error = -EBUSY;
+                goto out;
+        }
+        if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff >
+            vmem->pgoff_end) {
+                error = -EINVAL;
+                goto out;
+        }
+
+        vmem->mmapped = true;
+        vmem->vma_nr = 1;
+        vmem->vm_start = vma->vm_start;
+        get_task_struct(current);
+        vmem->task = current;
+        spin_unlock(&vmem->lock);
+
+        vma->vm_ops = &kvm_vmem_vm_ops;
+        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+        vma->vm_flags &= ~VM_SHARED;
+        return 0;
+
+out:
+        spin_unlock(&vmem->lock);
+        return error;
+}
+
+static bool kvm_vmem_req_pending(struct kvm_vmem *vmem)
+{
+        return !list_empty(&vmem->req_list) ||
+                !bitmap_empty(vmem->sync_req_bitmap, vmem->sync_req_max) ||
+                (vmem->async_req_nr > 0);
+}
+
+static unsigned int kvm_vmem_poll(struct file *filp, poll_table *wait)
+{
+        struct kvm_vmem *vmem = filp->private_data;
+        unsigned int events = 0;
+
+        poll_wait(filp, &vmem->req_wait, wait);
+
+        spin_lock(&vmem->lock);
+        if (kvm_vmem_req_pending(vmem))
+                events |= POLLIN;
+        spin_unlock(&vmem->lock);
+
+        return events;
+}
+
+/*
+ * return value
+ * true: finished
+ * false: more requests pending
+ */
+static bool kvm_vmem_copy_page_request(struct kvm_vmem *vmem,
+                                       pgoff_t *pgoffs, int req_max,
+                                       int *req_nr)
+{
+        struct kvm_vmem_page_req_list *req_list;
+        struct kvm_vmem_page_req_list *tmp;
+
+        unsigned long bit;
+
+        *req_nr = 0;
+        list_for_each_entry_safe(req_list, tmp, &vmem->req_list, list) {
+                list_del(&req_list->list);
+                pgoffs[*req_nr] = req_list->pgoff;
+                (*req_nr)++;
+                if (*req_nr >= req_max)
+                        return false;
+        }
+
+        bit = 0;
+        for (;;) {
+                bit = find_next_bit(vmem->sync_req_bitmap, vmem->sync_req_max,
+                                    bit);
+                if (bit >= vmem->sync_req_max)
+                        break;
+                pgoffs[*req_nr] = vmem->sync_req[bit];
+                (*req_nr)++;
+                clear_bit(bit, vmem->sync_req_bitmap);
+                if (*req_nr >= req_max)
+                        return false;
+                bit++;
+        }
+
+        if (vmem->async_req_nr > 0) {
+                int nr = min(req_max - *req_nr, vmem->async_req_nr);
+                memcpy(pgoffs + *req_nr, vmem->async_req,
+                       sizeof(*vmem->async_req) * nr);
+                vmem->async_req_nr -= nr;
+                *req_nr += nr;
+                /* shift the unconsumed async requests down;
+                 * the source is async_req, not sync_req */
+                memmove(vmem->async_req, vmem->async_req + nr,
+                        vmem->async_req_nr * sizeof(*vmem->async_req));
+        }
+        return vmem->async_req_nr == 0;
+}
+
+static int kvm_vmem_get_page_request(struct kvm_vmem *vmem,
+                                     struct kvm_vmem_page_request *page_req)
+{
+        DEFINE_WAIT(wait);
+#define REQ_MAX ((__u32)32)
+        pgoff_t pgoffs[REQ_MAX];
+        __u32 req_copied = 0;
+        int ret = 0;
+
+        spin_lock(&vmem->lock);
+        for (;;) {
+                prepare_to_wait(&vmem->req_wait, &wait, TASK_INTERRUPTIBLE);
+                if (kvm_vmem_req_pending(vmem)) {
+                        break;
+                }
+                if (signal_pending(current)) {
+                        ret = -ERESTARTSYS;
+                        break;
+                }
+                spin_unlock(&vmem->lock);
+                schedule();
+                spin_lock(&vmem->lock);
+        }
+        finish_wait(&vmem->req_wait, &wait);
+        if (ret)
+                goto out_unlock;
+
+        while (req_copied < page_req->nr) {
+                int req_max;
+                int req_nr;
+                bool finished;
+                req_max = min(page_req->nr - req_copied, REQ_MAX);
+                finished = kvm_vmem_copy_page_request(vmem, pgoffs, req_max,
+                                                      &req_nr);
+
+                spin_unlock(&vmem->lock);
+
+                if (req_nr > 0) {
+                        ret = 0;
+                        if (copy_to_user(page_req->pgoffs + req_copied, pgoffs,
+                                         sizeof(*pgoffs) * req_nr)) {
+                                ret = -EFAULT;
+                                goto out;
+                        }
+                }
+                req_copied += req_nr;
+                if (finished)
+                        goto out;
+
+                spin_lock(&vmem->lock);
+        }
+
+out_unlock:
+        spin_unlock(&vmem->lock);
+out:
+        page_req->nr = req_copied;
+        return ret;
+}
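
[Editor's example, not part of the patch] Because kvm_vmem_poll() reports
POLLIN whenever sync, async, or overflow-list requests are pending, the page
server can multiplex the vmem fd with its migration socket instead of parking
a thread in KVM_VMEM_GET_PAGE_REQUEST. A sketch; sock_fd is a hypothetical
descriptor for the incoming page stream:

#include <err.h>
#include <poll.h>

static void wait_for_work(int vmem_fd, int sock_fd)
{
        struct pollfd fds[2] = {
                { .fd = vmem_fd, .events = POLLIN }, /* page requests pending */
                { .fd = sock_fd, .events = POLLIN }, /* page data arriving */
        };

        if (poll(fds, 2, -1) < 0)
                err(EXIT_FAILURE, "server: poll");
        /* fds[0].revents & POLLIN means KVM_VMEM_GET_PAGE_REQUEST
         * will not block */
}
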
+static int kvm_vmem_mark_page_cached(struct kvm_vmem *vmem,
+                                     struct kvm_vmem_page_cached *page_cached)
+{
+        int ret = 0;
+#define PG_MAX ((__u32)32)
+        __u64 pgoffs[PG_MAX];
+        __u32 nr;
+        unsigned long bit;
+        bool wake_up_list = false;
+
+        nr = 0;
+        while (nr < page_cached->nr) {
+                __u32 todo = min(PG_MAX, (page_cached->nr - nr));
+                int i;
+
+                if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
+                                   sizeof(*pgoffs) * todo)) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+                for (i = 0; i < todo; ++i) {
+                        if (pgoffs[i] >= vmem->pgoff_end) {
+                                ret = -EINVAL;
+                                goto out;
+                        }
+                        set_bit(pgoffs[i], vmem->cached);
+                }
+                nr += todo;
+        }
+
+        spin_lock(&vmem->lock);
+        bit = 0;
+        for (;;) {
+                bit = find_next_bit(vmem->sync_wait_bitmap, vmem->sync_req_max,
+                                    bit);
+                if (bit >= vmem->sync_req_max)
+                        break;
+                if (test_bit(vmem->sync_req[bit], vmem->cached))
+                        wake_up(&vmem->page_wait[bit]);
+                bit++;
+        }
+
+        if (vmem->req_list_nr > 0)
+                wake_up_list = true;
+        spin_unlock(&vmem->lock);
+
+        if (wake_up_list)
+                wake_up_all(&vmem->req_list_wait);
+
+out:
+        return ret;
+}
+
+static bool kvm_vmem_is_vmem_vma(const struct kvm_vmem *vmem,
+                                 const struct vm_area_struct *vma)
+{
+        return vma->vm_file && vma->vm_file->private_data == vmem;
+}
+
+static void kvm_vmem_make_pages_present_entry(struct kvm_vmem *vmem,
+                                              struct kvm_vmem_page_range *range,
+                                              struct task_struct *task,
+                                              struct mm_struct *mm,
+                                              unsigned long vm_start)
+{
+        unsigned long pgoff = range->pgoff;
+        unsigned long range_end = range->pgoff + range->nr_pages;
+
+        down_read(&mm->mmap_sem);
+
+        while (pgoff < range_end) {
+                unsigned long pgoff_end;
+                struct vm_area_struct *vma;
+                unsigned long saddr;
+                unsigned long eaddr;
+
+                /* search unfaulted range */
+                spin_lock(&vmem->lock);
+                pgoff = find_next_zero_bit(vmem->faulted, range_end, pgoff);
+                if (pgoff >= range_end) {
+                        spin_unlock(&vmem->lock);
+                        break;
+                }
+                pgoff_end = find_next_bit(vmem->faulted, range_end, pgoff);
+                spin_unlock(&vmem->lock);
+
+                saddr = vm_start + (pgoff << PAGE_SHIFT);
+                eaddr = vm_start + (pgoff_end << PAGE_SHIFT);
+                vma = find_vma(mm, saddr);
+                if (vma == NULL) {
+                        break;
+                }
+                if (eaddr < vma->vm_start) {
+                        pgoff = (vma->vm_start - vm_start) >> PAGE_SHIFT;
+                        continue;
+                }
+
+                if (kvm_vmem_is_vmem_vma(vmem, vma)) {
+                        unsigned long start = max(vma->vm_start, saddr);
+                        unsigned long end = min(vma->vm_end, eaddr);
+                        int nr_pages = (end - start) >> PAGE_SHIFT;
+                        get_user_pages(task, mm, start, nr_pages,
+                                       1, 1, NULL, NULL);
+                        pgoff = (end - vm_start) >> PAGE_SHIFT;
+                } else {
+                        pgoff = (vma->vm_end - vm_start) >> PAGE_SHIFT;
+                }
+        }
+
+        up_read(&mm->mmap_sem);
+}
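
[Editor's example, not part of the patch] KVM_VMEM_MAKE_PAGES_PRESENT,
implemented below on top of kvm_vmem_make_pages_present_entry(), prefaults
pages that are already cached so the guest never stalls on them. A server
would typically coalesce the offsets it has just marked cached into ranges
before issuing the ioctl; a sketch of that bookkeeping, assuming the offsets
arrive sorted and that ranges[] has room for nr entries:

static int build_ranges(const __u64 *pgoffs, int nr,
                        struct kvm_vmem_page_range *ranges)
{
        int nr_ranges = 0;
        int i;

        for (i = 0; i < nr; ++i) {
                if (nr_ranges > 0 &&
                    ranges[nr_ranges - 1].pgoff +
                    ranges[nr_ranges - 1].nr_pages == pgoffs[i]) {
                        ranges[nr_ranges - 1].nr_pages++; /* extend the run */
                } else {
                        ranges[nr_ranges].pgoff = pgoffs[i];
                        ranges[nr_ranges].nr_pages = 1;
                        nr_ranges++;
                }
        }
        return nr_ranges;
}
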
+static int kvm_vmem_make_pages_present(
+        struct kvm_vmem *vmem,
+        struct kvm_vmem_make_pages_present *pages_present)
+{
+        struct task_struct *task;
+        struct mm_struct *mm;
+        pgoff_t pgoff_end;
+        unsigned long vm_start;
+        unsigned long vm_eaddr;
+
+#define NUM_ENTRIES ((__u32)32)
+        struct kvm_vmem_page_range kranges[NUM_ENTRIES];
+        __u32 nr = 0;
+        int ret;
+
+        spin_lock(&vmem->lock);
+        task = vmem->task;
+        pgoff_end = vmem->pgoff_end;
+        vm_start = vmem->vm_start;
+        vm_eaddr = vm_start + vmem->size;
+        spin_unlock(&vmem->lock);
+        if (task == NULL)
+                return 0;
+        mm = get_task_mm(task);
+        if (mm == NULL)
+                return 0;
+
+        ret = 0;
+        while (nr < pages_present->nr) {
+                int nr_ranges = min(NUM_ENTRIES, pages_present->nr - nr);
+                int i;
+
+                if (copy_from_user(kranges, pages_present->ranges + nr,
+                                   sizeof(kranges[0]) * nr_ranges)) {
+                        ret = -EFAULT;
+                        break;
+                }
+                for (i = 0; i < nr_ranges; ++i) {
+                        struct kvm_vmem_page_range *range = &kranges[i];
+                        /* a range may end exactly at pgoff_end, so reject
+                         * only when it extends past the region */
+                        if (range->pgoff >= pgoff_end ||
+                            range->nr_pages > pgoff_end ||
+                            range->pgoff + range->nr_pages > pgoff_end) {
+                                ret = -EINVAL;
+                                break;
+                        }
+                        kvm_vmem_make_pages_present_entry(vmem, range,
+                                                          task, mm, vm_start);
+                }
+                if (ret)
+                        break;
+                nr += nr_ranges;
+        }
+
+        mmput(mm);
+        return ret;
+}
+
+static int kvm_vmem_make_vma_anonymous(struct kvm_vmem *vmem)
+{
+#if 1
+        return -ENOSYS;
+#else
+        unsigned long saddr;
+        unsigned long eaddr;
+        unsigned long addr;
+        unsigned long bit;
+        struct task_struct *task;
+        struct mm_struct *mm;
+
+        spin_lock(&vmem->lock);
+        task = vmem->task;
+        saddr = vmem->vm_start;
+        eaddr = saddr + vmem->size;
+        bit = find_first_zero_bit(vmem->faulted, vmem->pgoff_end);
+        if (bit < vmem->pgoff_end) {
+                spin_unlock(&vmem->lock);
+                return -EBUSY;
+        }
+        spin_unlock(&vmem->lock);
+        if (task == NULL)
+                return 0;
+        mm = get_task_mm(task);
+        if (mm == NULL)
+                return 0;
+
+        addr = saddr;
+        down_write(&mm->mmap_sem);
+        while (addr < eaddr) {
+                struct vm_area_struct *vma;
+                vma = find_vma(mm, addr);
+                if (kvm_vmem_is_vmem_vma(vmem, vma)) {
+                        /* XXX incorrect. race/locking and more fix up */
+                        struct file *filp = vma->vm_file;
+                        vma->vm_ops->close(vma);
+                        vma->vm_ops = NULL;
+                        vma->vm_file = NULL;
+                        /* vma->vm_flags */
+                        fput(filp);
+                }
+                addr = vma->vm_end;
+        }
+        up_write(&mm->mmap_sem);
+
+        mmput(mm);
+        return 0;
+#endif
+}
+
+static void kvm_vmem_ready(struct kvm_vmem *vmem)
+{
+        spin_lock(&vmem->lock);
+        vmem->ready = true;
+        spin_unlock(&vmem->lock);
+        wake_up_interruptible(&vmem->ready_wait);
+}
+
+static int kvm_vmem_wait_ready(struct kvm_vmem *vmem)
+{
+        int ret = 0;
+        DEFINE_WAIT(wait);
+
+        spin_lock(&vmem->lock);
+        for (;;) {
+                prepare_to_wait(&vmem->ready_wait, &wait, TASK_INTERRUPTIBLE);
+                if (vmem->ready) {
+                        break;
+                }
+                if (signal_pending(current)) {
+                        ret = -ERESTARTSYS;
+                        break;
+                }
+                spin_unlock(&vmem->lock);
+                schedule();
+                spin_lock(&vmem->lock);
+        }
+        spin_unlock(&vmem->lock);
+        finish_wait(&vmem->ready_wait, &wait);
+        return ret;
+}
+
+static long kvm_vmem_ioctl(struct file *filp, unsigned int ioctl,
+                           unsigned long arg)
+{
+        struct kvm_vmem *vmem = filp->private_data;
+        void __user *argp = (void __user *) arg;
+        long ret = 0;
+
+        switch (ioctl) {
+        case KVM_VMEM_READY:
+                kvm_vmem_ready(vmem);
+                ret = 0;
+                break;
+        case KVM_VMEM_WAIT_READY:
+                ret = kvm_vmem_wait_ready(vmem);
+                break;
+        case KVM_VMEM_GET_PAGE_REQUEST: {
+                struct kvm_vmem_page_request page_request;
+                ret = -EFAULT;
+                if (copy_from_user(&page_request, argp, sizeof(page_request)))
+                        break;
+                ret = kvm_vmem_get_page_request(vmem, &page_request);
+                if (ret == 0 &&
+                    copy_to_user(argp +
+                                 offsetof(struct kvm_vmem_page_request, nr),
+                                 &page_request.nr,
+                                 sizeof(page_request.nr))) {
+                        ret = -EFAULT;
+                        break;
+                }
+                break;
+        }
+        case KVM_VMEM_MARK_PAGE_CACHED: {
+                struct kvm_vmem_page_cached page_cached;
+                ret = -EFAULT;
+                if (copy_from_user(&page_cached, argp, sizeof(page_cached)))
+                        break;
+                ret = kvm_vmem_mark_page_cached(vmem, &page_cached);
+                break;
+        }
+        case KVM_VMEM_MAKE_PAGES_PRESENT: {
+                struct kvm_vmem_make_pages_present pages_present;
+                ret = -EFAULT;
+                if (copy_from_user(&pages_present, argp,
+                                   sizeof(pages_present)))
+                        break;
+                ret = kvm_vmem_make_pages_present(vmem, &pages_present);
+                break;
+        }
+        case KVM_VMEM_MAKE_VMA_ANONYMOUS:
+                ret = kvm_vmem_make_vma_anonymous(vmem);
+                break;
+        default:
+                ret = -EINVAL;
+                break;
+        }
+        return ret;
+}
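
[Editor's example, not part of the patch] KVM_VMEM_WAIT_READY is the peer of
KVM_VMEM_READY: the destination process blocks until the page server has
attached. Since kvm_vmem_wait_ready() sleeps interruptibly and returns
-ERESTARTSYS on a signal, a client loop might look like:

#include <err.h>
#include <errno.h>
#include <sys/ioctl.h>

static void wait_for_server(int vmem_fd)
{
        /* retry if a signal interrupts the interruptible sleep */
        while (ioctl(vmem_fd, KVM_VMEM_WAIT_READY) < 0) {
                if (errno != EINTR)
                        err(EXIT_FAILURE, "client: KVM_VMEM_WAIT_READY");
        }
}
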
+static unsigned long kvm_vmem_bitmap_bytes(const struct kvm_vmem *vmem)
+{
+        return round_up(vmem->pgoff_end, BITS_PER_LONG) / 8;
+}
+
+static void kvm_vmem_free(struct kvm_vmem *vmem)
+{
+        if (vmem->task) {
+                put_task_struct(vmem->task);
+                vmem->task = NULL;
+        }
+
+        if (vmem->shmem_filp)
+                fput(vmem->shmem_filp);
+        if (kvm_vmem_bitmap_bytes(vmem) > PAGE_SIZE) {
+                vfree(vmem->cached);
+                vfree(vmem->faulted);
+        } else {
+                kfree(vmem->cached);
+                kfree(vmem->faulted);
+        }
+        kfree(vmem->vma);
+        kfree(vmem->async_req);
+        kfree(vmem->sync_req_bitmap);
+        kfree(vmem->sync_wait_bitmap);
+        kfree(vmem->page_wait);
+        kfree(vmem->sync_req);
+        kfree(vmem);
+}
+
+static int kvm_vmem_release(struct inode *inode, struct file *filp)
+{
+        struct kvm_vmem *vmem = filp->private_data;
+        kvm_vmem_free(vmem);
+        return 0;
+}
+
+static const struct file_operations kvm_vmem_fops = {
+        .release = kvm_vmem_release,
+        .unlocked_ioctl = kvm_vmem_ioctl,
+        .mmap = kvm_vmem_mmap,
+        .poll = kvm_vmem_poll,
+        .llseek = noop_llseek,
+};
+
+static int kvm_create_vmem(struct kvm_vmem_create *create)
+{
+        int error = 0;
+        struct kvm_vmem *vmem = NULL;
+        struct vm_area_struct *vma = NULL;
+        int shmem_fd;
+        unsigned long bitmap_bytes;
+        unsigned long sync_bitmap_bytes;
+        int i;
+
+        vmem = kzalloc(sizeof(*vmem), GFP_KERNEL);
+        if (!vmem)
+                return -ENOMEM;
+        vmem->task = NULL;
+        vmem->mmapped = false;
+        spin_lock_init(&vmem->lock);
+        vmem->size = roundup(create->size, PAGE_SIZE);
+        vmem->pgoff_end = vmem->size >> PAGE_SHIFT;
+        init_waitqueue_head(&vmem->req_wait);
+
+        vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+        if (!vma) {
+                error = -ENOMEM;
+                goto out;
+        }
+        vma->vm_start = 0;
+        vma->vm_end = vmem->size;
+        /* this shmem file is used as a temporary buffer for pages,
+           so it's unlikely that many pages exist in it at once */
+        vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
+                        VM_DONTEXPAND;
+        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+        vma->vm_pgoff = 0;
+        INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+        vmem->vma = vma;
+
+        shmem_fd = get_unused_fd();
+        if (shmem_fd < 0) {
+                error = shmem_fd;
+                goto out;
+        }
+        error = shmem_zero_setup(vma);
+        if (error < 0) {
+                put_unused_fd(shmem_fd);
+                goto out;
+        }
+        vmem->shmem_filp = vma->vm_file;
+        get_file(vmem->shmem_filp);
+        fd_install(shmem_fd, vma->vm_file);
+        create->shmem_fd = shmem_fd;
+
+        create->vmem_fd = anon_inode_getfd("kvm-vmem",
+                                           &kvm_vmem_fops, vmem, O_RDWR);
+        if (create->vmem_fd < 0) {
+                error = create->vmem_fd;
+                goto out;
+        }
+
+        bitmap_bytes = kvm_vmem_bitmap_bytes(vmem);
+        if (bitmap_bytes > PAGE_SIZE) {
+                vmem->cached = vzalloc(bitmap_bytes);
+                vmem->faulted = vzalloc(bitmap_bytes);
+        } else {
+                vmem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
+                vmem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
+        }
+
+#define ASYNC_REQ_MAX (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
+        vmem->async_req_max = ASYNC_REQ_MAX;
+        vmem->async_req_nr = 0;
+        /* room for async_req_max entries, not just one */
+        vmem->async_req = kzalloc(sizeof(*vmem->async_req) *
+                                  vmem->async_req_max, GFP_KERNEL);
+
+#define SYNC_REQ_MAX (KVM_MAX_VCPUS)
+        vmem->sync_req_max = round_up(SYNC_REQ_MAX, BITS_PER_LONG);
+        sync_bitmap_bytes = sizeof(unsigned long) *
+                (vmem->sync_req_max / BITS_PER_LONG);
+        vmem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
+        vmem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
+        vmem->page_wait = kzalloc(sizeof(*vmem->page_wait) *
+                                  vmem->sync_req_max, GFP_KERNEL);
+        for (i = 0; i < vmem->sync_req_max; ++i)
+                init_waitqueue_head(&vmem->page_wait[i]);
+        vmem->sync_req = kzalloc(sizeof(*vmem->sync_req) *
+                                 vmem->sync_req_max, GFP_KERNEL);
+
+        vmem->req_list_nr = 0;
+        INIT_LIST_HEAD(&vmem->req_list);
+        init_waitqueue_head(&vmem->req_list_wait);
+
+        init_waitqueue_head(&vmem->ready_wait);
+        vmem->ready = false;
+
+        return 0;
+
+out:
+        kvm_vmem_free(vmem);
+        return error;
+}
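
[Editor's example, not part of the patch] A sketch of driving KVM_CREATE_VMEM
from userland, with the struct definitions coming from a header such as the
sample program's my-kvm.h. How the "kvm-vmem-dev" fd returned by
kvm_dev_ioctl_create_vmem_dev() is exposed is not shown in this patch; the
KVM_CREATE_VMEM_DEV ioctl on the /dev/kvm fd used here is an assumed name
for illustration only:

#include <err.h>
#include <sys/ioctl.h>

static void create_vmem(int kvm_fd, size_t size,
                        int *vmem_fd, int *shmem_fd)
{
        struct kvm_vmem_create create = { .size = size, };
        /* hypothetical: obtain the kvm-vmem-dev anon fd from /dev/kvm */
        int dev_fd = ioctl(kvm_fd, KVM_CREATE_VMEM_DEV);

        if (dev_fd < 0)
                err(EXIT_FAILURE, "KVM_CREATE_VMEM_DEV");
        if (ioctl(dev_fd, KVM_CREATE_VMEM, &create) < 0)
                err(EXIT_FAILURE, "KVM_CREATE_VMEM");
        *vmem_fd = create.vmem_fd;
        *shmem_fd = create.shmem_fd;
}
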
+static long kvm_vmem_dev_ioctl(struct file *filp, unsigned int ioctl,
+                               unsigned long arg)
+{
+        void __user *argp = (void __user *) arg;
+        long ret;
+
+        switch (ioctl) {
+        case KVM_CREATE_VMEM: {
+                struct kvm_vmem_create create;
+                if (copy_from_user(&create, argp, sizeof(create))) {
+                        ret = -EFAULT;
+                        break;
+                }
+                ret = kvm_create_vmem(&create);
+                if (ret)        /* don't copy stale fds back on failure */
+                        break;
+                if (copy_to_user(argp, &create, sizeof(create))) {
+                        ret = -EFAULT;
+                        break;
+                }
+                break;
+        }
+        default:
+                ret = -EINVAL;
+                break;
+        }
+        return ret;
+}
+
+static int kvm_vmem_dev_release(struct inode *inode, struct file *filp)
+{
+        return 0;
+}
+
+static const struct file_operations kvm_vmem_dev_fops = {
+        .release = kvm_vmem_dev_release,
+        .unlocked_ioctl = kvm_vmem_dev_ioctl,
+};
+
+long kvm_dev_ioctl_create_vmem_dev(void)
+{
+        return anon_inode_getfd("kvm-vmem-dev", &kvm_vmem_dev_fops,
+                                NULL, O_RDWR);
+}
diff --git a/virt/kvm/vmem.h b/virt/kvm/vmem.h
new file mode 100644
index 0000000..bc7e8cf
--- /dev/null
+++ b/virt/kvm/vmem.h
@@ -0,0 +1,68 @@
+/*
+ * KVM post copy vmem
+ *
+ * Copyright (c) 2011,
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#ifndef __KVM_VMEM_H__
+#define __KVM_VMEM_H__
+
+struct kvm_vmem_page_req_list {
+        struct list_head list;
+        pgoff_t pgoff;
+};
+
+struct kvm_vmem {
+        loff_t size;
+        pgoff_t pgoff_end;
+        spinlock_t lock;
+
+        wait_queue_head_t req_wait;
+
+        int async_req_max;
+        int async_req_nr;
+        pgoff_t *async_req;
+
+        int sync_req_max;
+        unsigned long *sync_req_bitmap;
+        unsigned long *sync_wait_bitmap;
+        pgoff_t *sync_req;
+        wait_queue_head_t *page_wait;
+
+        int req_list_nr;
+        struct list_head req_list;
+        wait_queue_head_t req_list_wait;
+
+        unsigned long *cached;
+        unsigned long *faulted;
+
+        bool mmapped;
+        unsigned long vm_start;
+        unsigned int vma_nr;
+        struct task_struct *task;
+
+        wait_queue_head_t ready_wait;
+        bool ready;
+
+        struct file *shmem_filp;
+        struct vm_area_struct *vma;
+};
+
+#endif /* __KVM_VMEM_H__ */
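
[Editor's example, not part of the patch] How the pieces could fit together
in a self-test, reusing the server() loop from the sample program above and
the hypothetical helpers from the earlier sketches (create_vmem,
wait_for_server, touch_page):

#include <stdlib.h>
#include <unistd.h>

static void self_test(int kvm_fd, size_t size, size_t page_size)
{
        int vmem_fd, shmem_fd;

        create_vmem(kvm_fd, size, &vmem_fd, &shmem_fd);
        if (fork() == 0) {
                /* page server: fills the shmem buffer, answers
                 * KVM_VMEM_GET_PAGE_REQUEST, marks pages cached */
                server(vmem_fd, shmem_fd, size, page_size);
                _exit(EXIT_SUCCESS);
        }
        wait_for_server(vmem_fd);                /* KVM_VMEM_WAIT_READY */
        touch_page(vmem_fd, size, page_size, 1); /* faults page 1 in */
}
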