From patchwork Mon Jan 11 17:23:32 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Michael S. Tsirkin" X-Patchwork-Id: 42646 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [199.232.76.165]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id 7CE01B6ED0 for ; Tue, 12 Jan 2010 04:51:47 +1100 (EST) Received: from localhost ([127.0.0.1]:47847 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1NUONX-0002Jc-S3 for incoming@patchwork.ozlabs.org; Mon, 11 Jan 2010 12:48:35 -0500 Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43) id 1NUO2F-0005eK-N8 for qemu-devel@nongnu.org; Mon, 11 Jan 2010 12:26:35 -0500 Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43) id 1NUO2A-0005d8-N2 for qemu-devel@nongnu.org; Mon, 11 Jan 2010 12:26:34 -0500 Received: from [199.232.76.173] (port=34540 helo=monty-python.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1NUO2A-0005d4-FE for qemu-devel@nongnu.org; Mon, 11 Jan 2010 12:26:30 -0500 Received: from mx1.redhat.com ([209.132.183.28]:26253) by monty-python.gnu.org with esmtp (Exim 4.60) (envelope-from ) id 1NUO29-0001Mh-R1 for qemu-devel@nongnu.org; Mon, 11 Jan 2010 12:26:30 -0500 Received: from int-mx03.intmail.prod.int.phx2.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.16]) by mx1.redhat.com (8.13.8/8.13.8) with ESMTP id o0BHQSAI023825 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK); Mon, 11 Jan 2010 12:26:28 -0500 Received: from redhat.com (vpn1-7-216.ams2.redhat.com [10.36.7.216]) by int-mx03.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with SMTP id o0BHQP7M026963; Mon, 11 Jan 2010 12:26:26 -0500 Date: Mon, 11 Jan 2010 19:23:32 +0200 From: "Michael S. 
Tsirkin" To: Anthony Liguori , qemu-devel@nongnu.org Message-ID: <20100111172332.GC12084@redhat.com> References: MIME-Version: 1.0 Content-Disposition: inline In-Reply-To: User-Agent: Mutt/1.5.19 (2009-01-05) X-Scanned-By: MIMEDefang 2.67 on 10.5.11.16 X-detected-operating-system: by monty-python.gnu.org: Genre and OS details not recognized. Cc: Subject: [Qemu-devel] [PATCH-RFC 11/13] vhost net support X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org This adds vhost net support in qemu. Will be tied to tap device and virtio later. Raw backend is currently missing, will be worked on/submitted separately. Signed-off-by: Michael S. Tsirkin --- Makefile.target | 1 + hw/vhost.c | 349 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vhost.h | 33 +++++ hw/vhost_net.c | 145 +++++++++++++++++++++++ hw/vhost_net.h | 20 +++ 5 files changed, 548 insertions(+), 0 deletions(-) create mode 100644 hw/vhost.c create mode 100644 hw/vhost.h create mode 100644 hw/vhost_net.c create mode 100644 hw/vhost_net.h diff --git a/Makefile.target b/Makefile.target index 7c1f30c..61b7148 100644 --- a/Makefile.target +++ b/Makefile.target @@ -157,6 +157,7 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o # virtio has to be here due to weird dependency between PCI and virtio-net. 
# need to fix this properly obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o virtio-pci.o +obj-y += vhost_net.o vhost.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o obj-$(CONFIG_ISA_MMIO) += isa_mmio.o LIBS+=-lz diff --git a/hw/vhost.c b/hw/vhost.c new file mode 100644 index 0000000..d23d94c --- /dev/null +++ b/hw/vhost.c @@ -0,0 +1,349 @@ +#include "linux/vhost.h" +#include +#include +#include "vhost.h" +#include "hw/hw.h" +/* For range_get_last */ +#include "pci.h" + +static void vhost_dev_unassign_memory(struct vhost_dev *dev, + struct vhost_memory *mem, + uint64_t start_addr, + uint64_t size) +{ + int from, to; + for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) { + struct vhost_memory_region *reg = mem->regions + to; + uint64_t reglast; + uint64_t memlast; + uint64_t change; + + /* clone old region from dev->mem into mem; trimmed below on overlap */ + memcpy(reg, dev->mem->regions + from, sizeof *reg); + + /* No overlap is simple */ + if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size, + start_addr, size)) { + continue; + } + reglast = range_get_last(reg->guest_phys_addr, reg->memory_size); + memlast = range_get_last(start_addr, size); + + /* Remove whole region */ + if (start_addr <= reg->guest_phys_addr && memlast >= reglast) { + --to; + continue; + } + + /* Shrink region */ + if (memlast >= reglast) { + reg->memory_size = start_addr - reg->guest_phys_addr; + continue; + } + + /* Shift region */ + if (start_addr <= reg->guest_phys_addr) { + change = memlast + 1 - reg->guest_phys_addr; + reg->memory_size -= change; + reg->guest_phys_addr += change; + reg->userspace_addr += change; + continue; + } + + /* Split region: shrink first part, shift second part.
*/ + memcpy(reg + 1, reg, sizeof *reg); + reg[0].memory_size = start_addr - reg->guest_phys_addr; + change = memlast + 1 - reg->guest_phys_addr; + reg[1].memory_size -= change; + reg[1].guest_phys_addr += change; + reg[1].userspace_addr += change; + ++to; + } + mem->nregions = to; +} + +/* Runs in place on mem after unassign, so no region overlaps the range. */ +static void vhost_dev_assign_memory(struct vhost_dev *dev, + struct vhost_memory *mem, + uint64_t start_addr, + uint64_t size, + uint64_t uaddr) +{ + int from, to; + struct vhost_memory_region *merged = NULL; + for (from = 0, to = 0; from < mem->nregions; ++from, ++to) { + struct vhost_memory_region *reg = mem->regions + to; + uint64_t prlast, urlast; + uint64_t pmlast, umlast; + uint64_t s, e, u; + + /* slide region down over slots freed by earlier merges */ + memmove(reg, mem->regions + from, sizeof *reg); + prlast = range_get_last(reg->guest_phys_addr, reg->memory_size); + pmlast = range_get_last(start_addr, size); + urlast = range_get_last(reg->userspace_addr, reg->memory_size); + umlast = range_get_last(uaddr, size); + + /* Not an adjacent region - do not merge.
*/ + if ((prlast + 1 != start_addr || urlast + 1 != uaddr) && + (pmlast + 1 != reg->guest_phys_addr || + umlast + 1 != reg->userspace_addr)) { + continue; + } + + if (merged) { + --to; + } else { + merged = reg; + } + u = MIN(uaddr, reg->userspace_addr); + s = MIN(start_addr, reg->guest_phys_addr); + e = MAX(pmlast, prlast); + uaddr = merged->userspace_addr = u; + start_addr = merged->guest_phys_addr = s; + size = merged->memory_size = e - s + 1; + } + + if (!merged) { + struct vhost_memory_region *reg = mem->regions + to; + reg->memory_size = size; + reg->guest_phys_addr = start_addr; + reg->userspace_addr = uaddr; + ++to; + } + mem->nregions = to; +} + +static void vhost_client_set_memory(CPUPhysMemoryClient *client, + target_phys_addr_t start_addr, + ram_addr_t size, + ram_addr_t phys_offset) +{ + struct vhost_dev *dev = container_of(client, struct vhost_dev, client); + ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK; + int s = offsetof(struct vhost_memory, regions) + + (dev->mem->nregions + 2) * sizeof dev->mem->regions[0]; /* worst case: one split plus one appended region */ + struct vhost_memory *mem = qemu_malloc(s); + memcpy(mem, dev->mem, offsetof(struct vhost_memory, regions)); /* header only; unassign clones the regions */ + + /* First, remove old mapping for this memory, if any.
*/ + vhost_dev_unassign_memory(dev, mem, start_addr, size); + if (flags == IO_MEM_RAM) { + /* Add given mapping, merging adjacent regions if any */ + vhost_dev_assign_memory(dev, mem, start_addr, size, + (uintptr_t)qemu_get_ram_ptr(phys_offset)); + } + qemu_free(dev->mem); + dev->mem = mem; +} + +static int vhost_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client, + target_phys_addr_t start_addr, + target_phys_addr_t end_addr) +{ + /* TODO: migration */ + return 0; +} + +static int vhost_client_migration_log(struct CPUPhysMemoryClient *client, + int enable) +{ + /* TODO: migration */ + return 0; +} + +static int vhost_virtqueue_init(struct vhost_dev *dev, + struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, + struct VirtQueue *q, + unsigned idx) +{ + target_phys_addr_t s, l; + int r; + struct vhost_vring_addr addr = { + .index = idx, + }; + struct vhost_vring_file file = { + .index = idx, + }; + struct vhost_vring_state size = { + .index = idx, + }; + + size.num = q->vring.num; + r = ioctl(dev->control, VHOST_SET_VRING_NUM, &size); + if (r) { + return -errno; + } + + file.fd = vq->kick = eventfd(0, 0); + r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); + if (r) { + r = -errno; + goto fail_call; + } + file.fd = vq->call = eventfd(0, 0); + r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file); + if (r) { + r = -errno; + goto fail_alloc; + } + + s = l = sizeof(struct vring_desc) * q->vring.num; + vq->desc = cpu_physical_memory_map(q->vring.desc, &l, 0); + if (!vq->desc || l != s) { + r = -ENOMEM; + goto fail_alloc; + } + s = l = offsetof(struct vring_avail, ring) + + sizeof(uint16_t) * q->vring.num; + vq->avail = cpu_physical_memory_map(q->vring.avail, &l, 0); + if (!vq->avail || l != s) { + r = -ENOMEM; + goto fail_alloc; + } + s = l = offsetof(struct vring_used, ring) + + sizeof(struct vring_used_elem) * q->vring.num; + vq->used = cpu_physical_memory_map(q->vring.used, &l, 1); + if (!vq->used || l != s) { + r = -ENOMEM; + goto fail_alloc; + } + + 
addr.desc_user_addr = (u_int64_t)(unsigned long)vq->desc; + addr.avail_user_addr = (u_int64_t)(unsigned long)vq->avail; + addr.used_user_addr = (u_int64_t)(unsigned long)vq->used; + r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); + if (r < 0) { + r = -errno; + goto fail_alloc; + } + if (!vdev->binding->set_irqfd || !vdev->binding->set_queuefd) { + fprintf(stderr, "binding does not support irqfd/queuefd\n"); + r = -ENOSYS; + goto fail_alloc; + } + r = vdev->binding->set_irqfd(vdev->binding_opaque, idx, vq->call, true); + if (r < 0) { + goto fail_alloc; + } + + r = vdev->binding->set_queuefd(vdev->binding_opaque, idx, vq->kick, true); + if (r < 0) { + goto fail_queuefd; + } + return 0; + +fail_queuefd: + vdev->binding->set_irqfd(vdev->binding_opaque, idx, vq->call, false); +fail_alloc: /* both eventfds exist here. NOTE(review): rings mapped above are not unmapped on this path */ + close(vq->call); +fail_call: /* only the kick eventfd exists at this point */ + close(vq->kick); + return r; +} + +static void vhost_virtqueue_cleanup(struct vhost_dev *dev, + struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, + struct VirtQueue *q, + unsigned idx) +{ + int r; + r = vdev->binding->set_irqfd(vdev->binding_opaque, idx, vq->call, false); + if (r < 0) { + fprintf(stderr, "VQ cleanup failed: %d\n", r); + } + + r = vdev->binding->set_queuefd(vdev->binding_opaque, idx, vq->kick, false); + if (r < 0) { + fprintf(stderr, "VQ cleanup failed: %d\n", r); + } +} + +int vhost_dev_init(struct vhost_dev *hdev, int devfd) +{ + uint64_t features; + int r; + if (devfd >= 0) { + hdev->control = devfd; + } else { + hdev->control = open("/dev/vhost-net", O_RDWR); + if (hdev->control < 0) + return -errno; + } + r = ioctl(hdev->control, VHOST_SET_OWNER, NULL); + if (r < 0) + goto fail; + + r = ioctl(hdev->control, VHOST_GET_FEATURES, &features); + if (r < 0) + goto fail; + hdev->features = features; + + hdev->client.set_memory = vhost_client_set_memory; + hdev->client.sync_dirty_bitmap = vhost_client_sync_dirty_bitmap; + hdev->client.migration_log = vhost_client_migration_log; + hdev->mem = qemu_mallocz(offsetof(struct 
vhost_memory, regions)); + cpu_register_phys_memory_client(&hdev->client); + return 0; +fail: + r = -errno; + close(hdev->control); + return r; +} + +void vhost_dev_cleanup(struct vhost_dev *hdev) +{ + cpu_unregister_phys_memory_client(&hdev->client); + qemu_free(hdev->mem); + close(hdev->control); +} + +int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + int i, r; + + r = ioctl(hdev->control, VHOST_SET_FEATURES, &hdev->acked_features); + if (r < 0) + goto fail; + r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem); + if (r < 0) + goto fail; + + for (i = 0; i < hdev->nvqs; ++i) { + r = vhost_virtqueue_init(hdev, + vdev, + hdev->vqs + i, + vdev->vq + i, + i); + if (r < 0) + goto fail_vq; + } + + return 0; +fail_vq: + while (--i >= 0) { + vhost_virtqueue_cleanup(hdev, + vdev, + hdev->vqs + i, + vdev->vq + i, + i); + } +fail: + return r; +} + +void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + int i; + + for (i = 0; i < hdev->nvqs; ++i) { + vhost_virtqueue_cleanup(hdev, + vdev, + hdev->vqs + i, + vdev->vq + i, + i); + } +} + diff --git a/hw/vhost.h b/hw/vhost.h new file mode 100644 index 0000000..9f82b42 --- /dev/null +++ b/hw/vhost.h @@ -0,0 +1,33 @@ +#ifndef VHOST_H +#define VHOST_H + +#include "hw/hw.h" +#include "hw/virtio.h" + +/* Generic structures common for any vhost based device.
*/ +struct vhost_virtqueue { + int kick; /* eventfd handed to VHOST_SET_VRING_KICK */ + int call; /* eventfd handed to VHOST_SET_VRING_CALL */ + void *desc; /* host mapping of the descriptor table */ + void *avail; /* host mapping of the avail ring */ + void *used; /* host mapping of the used ring */ +}; + +struct vhost_memory; +struct vhost_dev { + CPUPhysMemoryClient client; + int control; + struct vhost_memory *mem; + struct vhost_virtqueue *vqs; + int nvqs; + unsigned long long features; + unsigned long long acked_features; + unsigned long long backend_features; +}; + +int vhost_dev_init(struct vhost_dev *hdev, int devfd); +void vhost_dev_cleanup(struct vhost_dev *hdev); +int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev); +void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev); + +#endif diff --git a/hw/vhost_net.c b/hw/vhost_net.c new file mode 100644 index 0000000..e2c97c0 --- /dev/null +++ b/hw/vhost_net.c @@ -0,0 +1,145 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "net.h" +#include "net/tap.h" + +#include "virtio-net.h" +#include "vhost.h" +#include "vhost_net.h" + +struct vhost_net { + struct vhost_dev dev; + struct vhost_virtqueue vqs[2]; + int backend; + VLANClientState *vc; +}; + +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features) +{ + /* Clear features not supported by host kernel.
*/ + if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))) + features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY); + if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC))) + features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); + if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF))) + features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); + return features; +} + +void vhost_net_ack_features(struct vhost_net *net, unsigned features) +{ + net->dev.acked_features = net->dev.backend_features; + if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) + net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY); + if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) + net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC); +} + +static int vhost_net_get_fd(VLANClientState *backend) +{ + switch (backend->info->type) { + case NET_CLIENT_TYPE_TAP: + return tap_get_fd(backend); + default: + fprintf(stderr, "vhost-net requires tap backend\n"); + return -EBADFD; + } +} + +struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd) +{ + int r; + struct vhost_net *net = qemu_malloc(sizeof *net); + if (!backend) { + fprintf(stderr, "vhost-net requires backend to be setup\n"); + goto fail; + } + r = vhost_net_get_fd(backend); + if (r < 0) + goto fail; + net->vc = backend; + net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 : + (1 << VHOST_NET_F_VIRTIO_NET_HDR); + net->backend = r; + + r = vhost_dev_init(&net->dev, devfd); + if (r < 0) + goto fail; + if (~net->dev.features & net->dev.backend_features) { + fprintf(stderr, "vhost lacks feature mask %llu for backend\n", + ~net->dev.features & net->dev.backend_features); + vhost_dev_cleanup(&net->dev); + goto fail; + } + + /* Set sane init value. Override when guest acks.
*/ + vhost_net_ack_features(net, 0); + return net; +fail: + qemu_free(net); + return NULL; +} + +int vhost_net_start(struct vhost_net *net, + VirtIODevice *dev) +{ + struct vhost_vring_file file = { }; + int r; + + net->dev.nvqs = 2; + net->dev.vqs = net->vqs; + r = vhost_dev_start(&net->dev, dev); + if (r < 0) + return r; + + net->vc->info->poll(net->vc, false); + qemu_set_fd_handler(net->backend, NULL, NULL, NULL); + file.fd = net->backend; + for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { + r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + if (r < 0) { + r = -errno; + goto fail; + } + } + return 0; +fail: + file.fd = -1; + while (file.index-- > 0) { /* index is unsigned: "--index >= 0" would never terminate */ + ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + } + net->vc->info->poll(net->vc, true); + vhost_dev_stop(&net->dev, dev); + return r; +} + +void vhost_net_stop(struct vhost_net *net, + VirtIODevice *dev) +{ + struct vhost_vring_file file = { .fd = -1 }; + + for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { + ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + } + net->vc->info->poll(net->vc, true); + vhost_dev_stop(&net->dev, dev); +} + +void vhost_net_cleanup(struct vhost_net *net) +{ + vhost_dev_cleanup(&net->dev); + qemu_free(net); +} +/* TODO: log */ diff --git a/hw/vhost_net.h b/hw/vhost_net.h new file mode 100644 index 0000000..21f0277 --- /dev/null +++ b/hw/vhost_net.h @@ -0,0 +1,20 @@ +#ifndef VHOST_NET_H +#define VHOST_NET_H + +#include "net.h" + +struct vhost_net; + +struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd); + +int vhost_net_start(struct vhost_net *net, + VirtIODevice *dev); +void vhost_net_stop(struct vhost_net *net, + VirtIODevice *dev); + +void vhost_net_cleanup(struct vhost_net *net); + +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features); +void vhost_net_ack_features(struct vhost_net *net, unsigned features); + +#endif