From patchwork Thu Oct 18 21:22:42 2012
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Alex Williamson <alex.williamson@redhat.com>
X-Patchwork-Id: 192472
Return-Path: <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Received: from lists.gnu.org (lists.gnu.org [208.118.235.17])
	(using TLSv1 with cipher AES256-SHA (256/256 bits))
	(Client did not present a certificate)
	by ozlabs.org (Postfix) with ESMTPS id 33FD22C008F
	for <incoming@patchwork.ozlabs.org>;
	Fri, 19 Oct 2012 08:22:55 +1100 (EST)
Received: from localhost ([::1]:58472 helo=lists.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.71) (envelope-from
	<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>)
	id 1TOxYL-0000LQ-8m
	for incoming@patchwork.ozlabs.org; Thu, 18 Oct 2012 17:22:53 -0400
Received: from eggs.gnu.org ([208.118.235.92]:55097)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <alex.williamson@redhat.com>) id 1TOxYE-0000IO-Bl
	for qemu-devel@nongnu.org; Thu, 18 Oct 2012 17:22:47 -0400
Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
	(envelope-from <alex.williamson@redhat.com>) id 1TOxYC-0001Xi-RH
	for qemu-devel@nongnu.org; Thu, 18 Oct 2012 17:22:46 -0400
Received: from mx1.redhat.com ([209.132.183.28]:65191)
	by eggs.gnu.org with esmtp (Exim 4.71)
	(envelope-from <alex.williamson@redhat.com>) id 1TOxYC-0001Vr-Ht
	for qemu-devel@nongnu.org; Thu, 18 Oct 2012 17:22:44 -0400
Received: from int-mx11.intmail.prod.int.phx2.redhat.com
	(int-mx11.intmail.prod.int.phx2.redhat.com [10.5.11.24])
	by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id q9ILMhus020583
	(version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK);
	Thu, 18 Oct 2012 17:22:43 -0400
Received: from bling.home (ovpn-113-170.phx2.redhat.com [10.3.113.170])
	by int-mx11.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with
	ESMTP id q9ILMgKT032058; Thu, 18 Oct 2012 17:22:42 -0400
From: Alex Williamson <alex.williamson@redhat.com>
To: alex.williamson@redhat.com
Date: Thu, 18 Oct 2012 15:22:42 -0600
Message-ID: <20121018211643.1825.82696.stgit@bling.home>
User-Agent: StGIT/0.14.3
MIME-Version: 1.0
X-Scanned-By: MIMEDefang 2.68 on 10.5.11.24
X-detected-operating-system: by eggs.gnu.org: Genre and OS details not
	recognized.
X-Received-From: 209.132.183.28
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: [Qemu-devel] [RFC PATCH] vfio-pci: Add KVM INTx acceleration
X-BeenThere: qemu-devel@nongnu.org
X-Mailman-Version: 2.1.14
Precedence: list
List-Id: <qemu-devel.nongnu.org>
List-Unsubscribe: <https://lists.nongnu.org/mailman/options/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.nongnu.org/archive/html/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <https://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org
Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org

This makes use of the new level irqfd support enabling bypass of qemu
userspace both on INTx injection and unmask.  This significantly
boosts the performance of devices making use of legacy interrupts (ex.
~60% better netperf TCP_RR scores for an e1000e assigned to a Linux
guest and booted with pci=nomsi).  This also avoids flipping mmaps on
and off to simulate EOIs, so greatly improves performance of device
access in addition to interrupt latency.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

This patch depends on a pending linux header update as well as some
patches in Michael's tree, but I'm going to be offline for a couple
weeks and want to make it available for anyone to try once the
pre-reqs are available.  I intend to include this in a future pull
request for vfio-pci to get it in before the 1.3 hard freeze.
Comments welcome, though I might be slow to reply.  Thanks,

Alex

 hw/vfio_pci.c |  186 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index d5ff367..dc58cdc 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -185,6 +185,21 @@ static void vfio_unmask_intx(VFIODevice *vdev)
     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 }
 
+#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
+static void vfio_mask_intx(VFIODevice *vdev)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+        .index = VFIO_PCI_INTX_IRQ_INDEX,
+        .start = 0,
+        .count = 1,
+    };
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+#endif
+
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
  * also be a huge overhead.  We try to get the best of both worlds by
@@ -248,6 +263,161 @@ static void vfio_eoi(VFIODevice *vdev)
     vfio_unmask_intx(vdev);
 }
 
+static void vfio_enable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    struct kvm_irqfd irqfd = {
+        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
+        .gsi = vdev->intx.route.irq,
+        .flags = KVM_IRQFD_FLAG_RESAMPLE,
+    };
+    struct vfio_irq_set *irq_set;
+    int ret, argsz;
+    int32_t *pfd;
+
+    if (!kvm_irqchip_in_kernel() ||
+        vdev->intx.route.mode != PCI_INTX_ENABLED ||
+        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
+        return;
+    }
+
+    /* Get to a known interrupt state */
+    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
+    vfio_mask_intx(vdev);
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+
+    /* Get an eventfd for resample/unmask */
+    if (event_notifier_init(&vdev->intx.unmask, 0)) {
+        error_report("vfio: Error: event_notifier_init failed eoi\n");
+        goto fail;
+    }
+
+    /* KVM triggers it, VFIO listens for it */
+    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
+
+    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
+        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
+        goto fail_irqfd;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
+    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = irqfd.resamplefd;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    g_free(irq_set);
+    if (ret) {
+        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
+        goto fail_vfio;
+    }
+
+    /* Let'em rip */
+    vfio_unmask_intx(vdev);
+
+    vdev->intx.kvm_accel = true;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
+            __func__, vdev->host.domain, vdev->host.bus,
+            vdev->host.slot, vdev->host.function);
+
+    return;
+
+fail_vfio:
+    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
+    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
+fail_irqfd:
+    event_notifier_cleanup(&vdev->intx.unmask);
+fail:
+    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
+    vfio_unmask_intx(vdev);
+#endif
+}
+
+static void vfio_disable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    struct kvm_irqfd irqfd = {
+        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
+        .gsi = vdev->intx.route.irq,
+        .flags = KVM_IRQFD_FLAG_DEASSIGN,
+    };
+
+    if (!vdev->intx.kvm_accel) {
+        return;
+    }
+
+    /*
+     * Get to a known state, hardware masked, QEMU ready to accept new
+     * interrupts, QEMU IRQ de-asserted.
+     */
+    vfio_mask_intx(vdev);
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+
+    /* Tell KVM to stop listening for an INTx irqfd */
+    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
+        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
+    }
+
+    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
+    event_notifier_cleanup(&vdev->intx.unmask);
+
+    /* QEMU starts listening for interrupt events. */
+    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
+
+    vdev->intx.kvm_accel = false;
+
+    /* If we've missed an event, let it re-fire through QEMU */
+    vfio_unmask_intx(vdev);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
+            __func__, vdev->host.domain, vdev->host.bus,
+            vdev->host.slot, vdev->host.function);
+#endif
+}
+
+static void vfio_update_irq(PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIINTxRoute route;
+
+    if (vdev->interrupt != VFIO_INT_INTx) {
+        return;
+    }
+
+    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
+
+    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
+        return; /* Nothing changed */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vdev->intx.route.irq, route.irq);
+
+    vfio_disable_intx_kvm(vdev);
+
+    vdev->intx.route = route;
+
+    if (route.mode != PCI_INTX_ENABLED) {
+        return;
+    }
+
+    vfio_enable_intx_kvm(vdev);
+
+    /* Re-enable the interrupt in cased we missed an EOI */
+    vfio_eoi(vdev);
+}
+
 static int vfio_enable_intx(VFIODevice *vdev)
 {
     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
@@ -262,6 +432,18 @@ static int vfio_enable_intx(VFIODevice *vdev)
     vfio_disable_interrupts(vdev);
 
     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+
+#ifdef CONFIG_KVM
+    /*
+     * Only conditional to avoid generating error messages on platforms
+     * where we won't actually use the result anyway.
+     */
+    if (kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
+        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
+                                                        vdev->intx.pin);
+    }
+#endif
+
     ret = event_notifier_init(&vdev->intx.interrupt, 0);
     if (ret) {
         error_report("vfio: Error: event_notifier_init failed\n");
@@ -290,6 +472,8 @@ static int vfio_enable_intx(VFIODevice *vdev)
         return -errno;
     }
 
+    vfio_enable_intx_kvm(vdev);
+
     vdev->interrupt = VFIO_INT_INTx;
 
     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
@@ -303,6 +487,7 @@ static void vfio_disable_intx(VFIODevice *vdev)
     int fd;
 
     qemu_del_timer(vdev->intx.mmap_timer);
+    vfio_disable_intx_kvm(vdev);
     vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
     vdev->intx.pending = false;
     qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
@@ -1869,6 +2054,7 @@ static int vfio_initfn(PCIDevice *pdev)
     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
         vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
                                                   vfio_intx_mmap_enable, vdev);
+        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
         ret = vfio_enable_intx(vdev);
         if (ret) {
             goto out_teardown;