Message ID | 1466495274-5011-17-git-send-email-peterx@redhat.com |
---|---|
State | New |
Headers | show |
On 2016-06-21 09:47, Peter Xu wrote: > In split irqchip mode, IOAPIC is working in user space, only update > kernel irq routes when entry changed. When IR is enabled, we directly > update the kernel with translated messages. It works just like a kernel > cache for the remapping entries. > > Since KVM irqfd is using kernel gsi routes to deliver interrupts, as > long as we can support split irqchip, we will support irqfd as > well. Also, since kernel gsi routes will cache translated interrupts, > irqfd delivery will not suffer from any performance impact due to IR. > > And, since we supported irqfd, vhost devices will be able to work > seamlessly with IR now. Logically this should contain both vhost-net and > vhost-user case. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > hw/i386/intel_iommu.c | 7 +++++++ > include/hw/i386/intel_iommu.h | 1 + > include/hw/i386/x86-iommu.h | 4 ++++ > target-i386/kvm.c | 27 +++++++++++++++++++++++++++ > trace-events | 3 +++ > 5 files changed, 42 insertions(+) > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index d874596..0eaffc6 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -2149,6 +2149,12 @@ do_not_translate: > return 0; > } > > +static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, > + MSIMessage *dst, uint16_t sid) > +{ > + return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), src, dst); > +} > + > static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, > uint64_t *data, unsigned size, > MemTxAttrs attrs) > @@ -2393,6 +2399,7 @@ static void vtd_class_init(ObjectClass *klass, void *data) > dc->props = vtd_properties; > x86_class->realize = vtd_realize; > x86_class->find_add_as = vtd_find_add_as; > + x86_class->int_remap = vtd_int_remap; > } > > static const TypeInfo vtd_info = { > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h > index b3f17d7..3bca390 100644 > --- a/include/hw/i386/intel_iommu.h > +++ b/include/hw/i386/intel_iommu.h > @@ -26,6 +26,7 @@ > #include "hw/i386/x86-iommu.h" > #include "hw/i386/ioapic.h" > #include "hw/pci/msi.h" > +#include "hw/sysbus.h" > > #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" > #define INTEL_IOMMU_DEVICE(obj) \ > diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h > index 07199be..b419ae5 100644 > --- a/include/hw/i386/x86-iommu.h > +++ b/include/hw/i386/x86-iommu.h > @@ -22,6 +22,7 @@ > > #include "hw/sysbus.h" > #include "exec/memory.h" > +#include "hw/pci/pci.h" > > #define TYPE_X86_IOMMU_DEVICE ("x86-iommu") > #define X86_IOMMU_DEVICE(obj) \ > @@ -43,6 +44,9 @@ struct X86IOMMUClass { > DeviceRealize realize; > /* Find/Add IOMMU address space for specific PCI device */ > AddressSpace *(*find_add_as)(X86IOMMUState *s, PCIBus *bus, int devfn); > + /* MSI-based interrupt remapping */ > + int (*int_remap)(X86IOMMUState *iommu, MSIMessage *src, > + MSIMessage *dst, uint16_t sid); > }; > > struct X86IOMMUState { > diff --git a/target-i386/kvm.c b/target-i386/kvm.c > index f3698f1..bfa40b2 100644 > --- a/target-i386/kvm.c > +++ b/target-i386/kvm.c > @@ -35,6 +35,7 @@ > #include "hw/i386/apic.h" > #include "hw/i386/apic_internal.h" > #include "hw/i386/apic-msidef.h" > +#include "hw/i386/intel_iommu.h" > > #include "exec/ioport.h" > #include "standard-headers/asm-x86/hyperv.h" > @@ -42,6 +43,7 @@ > #include "hw/pci/msi.h" > #include "migration/migration.h" > #include "exec/memattrs.h" > +#include "trace.h" > > //#define DEBUG_KVM > > @@ -3323,6 +3325,31 @@ int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id) > int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, > uint64_t address, uint32_t data, PCIDevice *dev) > { > + X86IOMMUState *iommu = x86_iommu_get_default(); > + > + if (iommu) { > + int ret; > + MSIMessage src, dst; > + X86IOMMUClass *class = X86_IOMMU_GET_CLASS(iommu); > + > + src.address = route->u.msi.address_hi; > + src.address <<= VTD_MSI_ADDR_HI_SHIFT; > + src.address |= route->u.msi.address_lo; > + src.data = route->u.msi.data; > + > + ret = class->int_remap(iommu, &src, &dst, dev ? \ > + pci_requester_id(dev) : \ > + X86_IOMMU_SID_INVALID); > + if (ret) { > + trace_kvm_x86_fixup_msi_error(route->gsi); > + return 1; > + } > + > + route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT; > + route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK; > + route->u.msi.data = dst.data; > + } > + > return 0; > } > > diff --git a/trace-events b/trace-events > index da0d060..2982f64 100644 > --- a/trace-events > +++ b/trace-events > @@ -2206,3 +2206,6 @@ gicv3_redist_write(uint32_t cpu, uint64_t offset, uint64_t data, unsigned size, > gicv3_redist_badwrite(uint32_t cpu, uint64_t offset, uint64_t data, unsigned size, bool secure) "GICv3 redistributor %x write: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u secure %d: error" > gicv3_redist_set_irq(uint32_t cpu, int irq, int level) "GICv3 redistributor %x interrupt %d level changed to %d" > gicv3_redist_send_sgi(uint32_t cpu, int irq) "GICv3 redistributor %x pending SGI %d" > + > +# target-i386/kvm.c > +kvm_x86_fixup_msi_error(uint32_t gsi) "VT-d failed to remap interrupt for GSI %" PRIu32 > For successful remappings, this is fine - it just caches the result in an interrupt route. But what will happen with invalid interrupts? My current understanding is that, because the translation happens on activation of that interrupt source, not on actual signalling, the IOMMU will report an error too early and none when the interrupt is actually sent. That will lead to unwanted results, in the worst case false-positiv IR error reports to the guest, no? I think we need to do this: - silently remap broken sources to an error sink - hook up the error sink with the actual IOMMU model (Intel or AMD) - when that source actually fires, let the sink report an IR translation error to the guest Am I right? Jan
On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: [...] > For successful remappings, this is fine - it just caches the result in > an interrupt route. But what will happen with invalid interrupts? > > My current understanding is that, because the translation happens on > activation of that interrupt source, not on actual signalling, the IOMMU > will report an error too early and none when the interrupt is actually > sent. That will lead to unwanted results, in the worst case > false-positiv IR error reports to the guest, no? > > I think we need to do this: > - silently remap broken sources to an error sink > - hook up the error sink with the actual IOMMU model (Intel or AMD) > - when that source actually fires, let the sink report an IR > translation error to the guest > > Am I right? Right. I totally missed this one. :( Currently when split irqchip is specified, IOAPIC interrupts are cached in kernel with type KVM_IRQ_ROUTING_MSI (which is the same as irqfds). When guest specify a fault interrupt entry, it is possible that we silently fail the update, and all further interrupts are still the old and correct one. I agree with your solution on this. First of all we update the interrupt even if it's faulty, but we should mark it out. After that, we should fire QEMU from kernel side when the fault interrupt is triggered, so that QEMU IOMMU can still generate corresponding fault report interrupt to guest (though for Intel IOMMU IR, we still haven't handled any fault report yet, but we should be prepared for it). So it seems that finally we cannot avoid touching KVM this time. I have a thought on how to implement the "sink" you have mentioned: First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe called: KVM_IRQ_ROUTING_EVENTFD When KVM got this kind of interrupt, KVM does not trigger any real interrupt to guest. Instead, it just do eventfd_signal() to a pre-defined fd (maybe also with some data along with the notification, so that we can put the error inside?), which is set during KVM_SET_GSI_ROUTING ioctl(). After that, QEMU register all fault interrupts using this new KVM_IRQ_ROUTING_EVENTFD type (rather than original KVM_IRQ_ROUTING_MSI), assign a specific handler to handle the events from these interrupts, and trigger IOMMU fault report path in that handler. (Here I used KVM_IRQ_ROUTING_EVENTFD rather than something like KVM_IRQ_ROUTING_FAULT_MSI to make the API a more general one, in case we can leverage it in other cases in the future) Do you think the above workable? No matter which solution we will have, I would still suggest we add this as an "enhancement" after this series, since: - there are works that depend on this series, so I would appreciate if this series can be merged first, so that other people can have a good basement (Radim's x2apic, David's AMD IOMMU). Though this is based on the assumption that the basic design of this series is workable... - this problem will only exist for guest driver developers and should not happen for generic users (right?), so only a small subset of users might be affected. Thanks, -- peterx
On 2016-06-25 15:18, Peter Xu wrote: > On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: > > [...] > >> For successful remappings, this is fine - it just caches the result in >> an interrupt route. But what will happen with invalid interrupts? >> >> My current understanding is that, because the translation happens on >> activation of that interrupt source, not on actual signalling, the IOMMU >> will report an error too early and none when the interrupt is actually >> sent. That will lead to unwanted results, in the worst case >> false-positiv IR error reports to the guest, no? >> >> I think we need to do this: >> - silently remap broken sources to an error sink >> - hook up the error sink with the actual IOMMU model (Intel or AMD) >> - when that source actually fires, let the sink report an IR >> translation error to the guest >> >> Am I right? > > Right. I totally missed this one. :( > > Currently when split irqchip is specified, IOAPIC interrupts are > cached in kernel with type KVM_IRQ_ROUTING_MSI (which is the same as > irqfds). When guest specify a fault interrupt entry, it is possible > that we silently fail the update, and all further interrupts are still > the old and correct one. > > I agree with your solution on this. First of all we update the > interrupt even if it's faulty, but we should mark it out. After that, > we should fire QEMU from kernel side when the fault interrupt is > triggered, so that QEMU IOMMU can still generate corresponding fault > report interrupt to guest (though for Intel IOMMU IR, we still haven't > handled any fault report yet, but we should be prepared for it). > > So it seems that finally we cannot avoid touching KVM this time. > > I have a thought on how to implement the "sink" you have mentioned: > > First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe > called: > > KVM_IRQ_ROUTING_EVENTFD Not really, because all sources are either using eventfds, which you can also terminate in user space (already done for vhost and vfio in certain scenarios - IIRC) or originate there anyway (IOAPIC). > > When KVM got this kind of interrupt, KVM does not trigger any real > interrupt to guest. Instead, it just do eventfd_signal() to a > pre-defined fd (maybe also with some data along with the notification, > so that we can put the error inside?), which is set during > KVM_SET_GSI_ROUTING ioctl(). > > After that, QEMU register all fault interrupts using this new > KVM_IRQ_ROUTING_EVENTFD type (rather than original > KVM_IRQ_ROUTING_MSI), assign a specific handler to handle the events > from these interrupts, and trigger IOMMU fault report path in that > handler. > > (Here I used KVM_IRQ_ROUTING_EVENTFD rather than something like > KVM_IRQ_ROUTING_FAULT_MSI to make the API a more general one, in case > we can leverage it in other cases in the future) > > Do you think the above workable? > > No matter which solution we will have, I would still suggest we add > this as an "enhancement" after this series, since: > > - there are works that depend on this series, so I would appreciate if > this series can be merged first, so that other people can have a > good basement (Radim's x2apic, David's AMD IOMMU). Though this is > based on the assumption that the basic design of this series is > workable... I understand, and it is probably safe... > > - this problem will only exist for guest driver developers and should > not happen for generic users (right?), so only a small subset of > users might be affected. ...provided there is only little risk that some guest programs some half-backed or stale message that would be rejected prematurely. But that risk is most likely low. Jan
On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote: > On 2016-06-25 15:18, Peter Xu wrote: > > On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: [...] > > I have a thought on how to implement the "sink" you have mentioned: > > > > First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe > > called: > > > > KVM_IRQ_ROUTING_EVENTFD > > Not really, because all sources are either using eventfds, which you can > also terminate in user space (already done for vhost and vfio in certain > scenarios - IIRC) or originate there anyway (IOAPIC). But how should we handle the cases when the interrupt path are all in kernel? For vhost, data should be transfered all inside kernel when split irqchip and irqfd are used: when vhost got data, it triggers irqfd to deliver the interrupt to KVM. Along the way, we should all in kernel. For vfio, we have vfio_msihandler() who handles the hardware IRQ and then triggers irqfd as well to KVM. Again, it seems all in kernel space, no chance to stop that as well. Please correct me if I was wrong. [...] > > - there are works that depend on this series, so I would appreciate if > > this series can be merged first, so that other people can have a > > good basement (Radim's x2apic, David's AMD IOMMU). Though this is > > based on the assumption that the basic design of this series is > > workable... > > I understand, and it is probably safe... > > > > > - this problem will only exist for guest driver developers and should > > not happen for generic users (right?), so only a small subset of > > users might be affected. > > ...provided there is only little risk that some guest programs some > half-backed or stale message that would be rejected prematurely. But > that risk is most likely low. Yes, thanks! -- peterx
On 2016-06-26 03:48, Peter Xu wrote: > On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote: >> On 2016-06-25 15:18, Peter Xu wrote: >>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: > > [...] > >>> I have a thought on how to implement the "sink" you have mentioned: >>> >>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe >>> called: >>> >>> KVM_IRQ_ROUTING_EVENTFD >> >> Not really, because all sources are either using eventfds, which you can >> also terminate in user space (already done for vhost and vfio in certain >> scenarios - IIRC) or originate there anyway (IOAPIC). > > But how should we handle the cases when the interrupt path are all in > kernel? There are none which we can't redirect (only full in-kernel irqchip would have, but that's unsupported anyway). > > For vhost, data should be transfered all inside kernel when split > irqchip and irqfd are used: when vhost got data, it triggers irqfd to > deliver the interrupt to KVM. Along the way, we should all in kernel. > > For vfio, we have vfio_msihandler() who handles the hardware IRQ and > then triggers irqfd as well to KVM. Again, it seems all in kernel > space, no chance to stop that as well. > > Please correct me if I was wrong. Look at what vhost is doing e.g.: when a virtqueue is masked, it installs an event notifier that records incoming events in a pending state field. When it's unmasked, the corresponding KVM irqfd is installed. Jan
On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote: > On 2016-06-26 03:48, Peter Xu wrote: > > On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote: > >> On 2016-06-25 15:18, Peter Xu wrote: > >>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: > > > > [...] > > > >>> I have a thought on how to implement the "sink" you have mentioned: > >>> > >>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe > >>> called: > >>> > >>> KVM_IRQ_ROUTING_EVENTFD > >> > >> Not really, because all sources are either using eventfds, which you can > >> also terminate in user space (already done for vhost and vfio in certain > >> scenarios - IIRC) or originate there anyway (IOAPIC). > > > > But how should we handle the cases when the interrupt path are all in > > kernel? > > There are none which we can't redirect (only full in-kernel irqchip > would have, but that's unsupported anyway). I agree but I kind of feel it's ok to work on this as a patch on top. Additionally, some kind of test would have to be written for these error cases, which is non-negligeable amount of worl. So I'm inlined to merge this patchset - I feel it'll help things make progress. Thoughts? Jan - if you agree it's a good idea, acks would be appreciated. > > > > For vhost, data should be transfered all inside kernel when split > > irqchip and irqfd are used: when vhost got data, it triggers irqfd to > > deliver the interrupt to KVM. Along the way, we should all in kernel. > > > > For vfio, we have vfio_msihandler() who handles the hardware IRQ and > > then triggers irqfd as well to KVM. Again, it seems all in kernel > > space, no chance to stop that as well. > > > > Please correct me if I was wrong. > > Look at what vhost is doing e.g.: when a virtqueue is masked, it > installs an event notifier that records incoming events in a pending > state field. When it's unmasked, the corresponding KVM irqfd is installed. > > Jan > >
On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote: > On 2016-06-26 03:48, Peter Xu wrote: > > On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote: > >> On 2016-06-25 15:18, Peter Xu wrote: > >>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: > > > > [...] > > > >>> I have a thought on how to implement the "sink" you have mentioned: > >>> > >>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe > >>> called: > >>> > >>> KVM_IRQ_ROUTING_EVENTFD > >> > >> Not really, because all sources are either using eventfds, which you can > >> also terminate in user space (already done for vhost and vfio in certain > >> scenarios - IIRC) or originate there anyway (IOAPIC). > > > > But how should we handle the cases when the interrupt path are all in > > kernel? > > There are none which we can't redirect (only full in-kernel irqchip > would have, but that's unsupported anyway). > > > > > For vhost, data should be transfered all inside kernel when split > > irqchip and irqfd are used: when vhost got data, it triggers irqfd to > > deliver the interrupt to KVM. Along the way, we should all in kernel. > > > > For vfio, we have vfio_msihandler() who handles the hardware IRQ and > > then triggers irqfd as well to KVM. Again, it seems all in kernel > > space, no chance to stop that as well. > > > > Please correct me if I was wrong. > > Look at what vhost is doing e.g.: when a virtqueue is masked, it > installs an event notifier that records incoming events in a pending > state field. When it's unmasked, the corresponding KVM irqfd is installed. You are right. Thanks for the explaination. -- peterx
On 21/06/2016 09:47, Peter Xu wrote: > @@ -3323,6 +3325,31 @@ int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id) > int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, > uint64_t address, uint32_t data, PCIDevice *dev) > { > + X86IOMMUState *iommu = x86_iommu_get_default(); > + > + if (iommu) { > + int ret; > + MSIMessage src, dst; > + X86IOMMUClass *class = X86_IOMMU_GET_CLASS(iommu); > + > + src.address = route->u.msi.address_hi; > + src.address <<= VTD_MSI_ADDR_HI_SHIFT; > + src.address |= route->u.msi.address_lo; > + src.data = route->u.msi.data; > + > + ret = class->int_remap(iommu, &src, &dst, dev ? \ > + pci_requester_id(dev) : \ > + X86_IOMMU_SID_INVALID); > + if (ret) { > + trace_kvm_x86_fixup_msi_error(route->gsi); > + return 1; > + } > + > + route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT; > + route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK; > + route->u.msi.data = dst.data; > + } > + > return 0; > } I don't like this particularly. Instead, I think the X86 IOMMU class should implement a new interface "MSIRemapper", and PCIBus should have a pointer to MSIRemapper*. Then this can become: if (dev) { PCIBus *bus = PCI_BUS(qdev_get_parent_bus(DEVICE(dev)); if (bus->remapper) { msi_remapper_fixup_route(bus->remapper, &src, &dst, pci_requester_id(dev)); } } That said, I'm okay with the patch as is because the issue is not with the x86 implementation but with kvm_arch_fixup_msi_route. S390 should be able to do the same, by implementing MSIRemapper in S390pciState (I think). Paolo
On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote: > On 2016-06-26 03:48, Peter Xu wrote: > > On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote: > >> On 2016-06-25 15:18, Peter Xu wrote: > >>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: > > > > [...] > > > >>> I have a thought on how to implement the "sink" you have mentioned: > >>> > >>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe > >>> called: > >>> > >>> KVM_IRQ_ROUTING_EVENTFD > >> > >> Not really, because all sources are either using eventfds, which you can > >> also terminate in user space (already done for vhost and vfio in certain > >> scenarios - IIRC) or originate there anyway (IOAPIC). > > > > But how should we handle the cases when the interrupt path are all in > > kernel? > > There are none which we can't redirect (only full in-kernel irqchip > would have, but that's unsupported anyway). > > > > > For vhost, data should be transfered all inside kernel when split > > irqchip and irqfd are used: when vhost got data, it triggers irqfd to > > deliver the interrupt to KVM. Along the way, we should all in kernel. > > > > For vfio, we have vfio_msihandler() who handles the hardware IRQ and > > then triggers irqfd as well to KVM. Again, it seems all in kernel > > space, no chance to stop that as well. > > > > Please correct me if I was wrong. > > Look at what vhost is doing e.g.: when a virtqueue is masked, it > installs an event notifier that records incoming events in a pending > state field. When it's unmasked, the corresponding KVM irqfd is installed. Hmm I think it's time I pick up this topic up again... :) Since it's been half a year from the last post of this thread (I believe this thread is the so-called "cold data" and should be stored on tapes already... and sorry fot the long delay), I'd like to do a quick summary on this: interrupt remap still cannot work well when we install fault interrupts - when that happens, we should inject VT-d fault, rather than keeping silence. The suggestion from Jan above should be a good solution that only need to touch qemu part - that's the most benefit AFAIU. However, OTOH IMO we need to modify all the kvm irqfd users with this fix (pci-assign, ioapic, ivshmem, vfio-pci, virtio) - we need to have all these devices init with an "fault sink" eventfd, then when we detected specific irqfd install error, we install the "fault sink". What's worse, if we add new devices with irqfd support, we need to implement the same error handling logic as well. Am I understanding it correctly? If so, isn't that awkward? Now I am re-thinking about my KVM_IRQ_ROUTING_EVENTFD proposal to do it - in that case, we should not need to worry about the users of kvm irqfd, and the error handling is done automatically even with new irqfd users coming in. The disadvantage is of course we need to touch both qemu and kvm, also we need to touch KVM API for it (though I think it'll only need very small change in KVM). And not sure whether that would worth it. Or, any better way to do it? Hope I didn't miss anything. Comments are welcomed! Regards, -- peterx
On 2017-01-03 07:15, Peter Xu wrote: > On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote: >> On 2016-06-26 03:48, Peter Xu wrote: >>> On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote: >>>> On 2016-06-25 15:18, Peter Xu wrote: >>>>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: >>> >>> [...] >>> >>>>> I have a thought on how to implement the "sink" you have mentioned: >>>>> >>>>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe >>>>> called: >>>>> >>>>> KVM_IRQ_ROUTING_EVENTFD >>>> >>>> Not really, because all sources are either using eventfds, which you can >>>> also terminate in user space (already done for vhost and vfio in certain >>>> scenarios - IIRC) or originate there anyway (IOAPIC). >>> >>> But how should we handle the cases when the interrupt path are all in >>> kernel? >> >> There are none which we can't redirect (only full in-kernel irqchip >> would have, but that's unsupported anyway). >> >>> >>> For vhost, data should be transfered all inside kernel when split >>> irqchip and irqfd are used: when vhost got data, it triggers irqfd to >>> deliver the interrupt to KVM. Along the way, we should all in kernel. >>> >>> For vfio, we have vfio_msihandler() who handles the hardware IRQ and >>> then triggers irqfd as well to KVM. Again, it seems all in kernel >>> space, no chance to stop that as well. >>> >>> Please correct me if I was wrong. >> >> Look at what vhost is doing e.g.: when a virtqueue is masked, it >> installs an event notifier that records incoming events in a pending >> state field. When it's unmasked, the corresponding KVM irqfd is installed. > > Hmm I think it's time I pick up this topic up again... :) > > Since it's been half a year from the last post of this thread (I > believe this thread is the so-called "cold data" and should be stored > on tapes already... and sorry fot the long delay), I'd like to do a > quick summary on this: interrupt remap still cannot work well when we > install fault interrupts - when that happens, we should inject VT-d > fault, rather than keeping silence. > > The suggestion from Jan above should be a good solution that only need > to touch qemu part - that's the most benefit AFAIU. However, OTOH IMO > we need to modify all the kvm irqfd users with this fix (pci-assign, > ioapic, ivshmem, vfio-pci, virtio) - we need to have all these devices > init with an "fault sink" eventfd, then when we detected specific > irqfd install error, we install the "fault sink". What's worse, if we > add new devices with irqfd support, we need to implement the same > error handling logic as well. Am I understanding it correctly? If so, > isn't that awkward? > > Now I am re-thinking about my KVM_IRQ_ROUTING_EVENTFD proposal to do > it - in that case, we should not need to worry about the users of kvm > irqfd, and the error handling is done automatically even with new > irqfd users coming in. The disadvantage is of course we need to touch > both qemu and kvm, also we need to touch KVM API for it (though I > think it'll only need very small change in KVM). And not sure whether > that would worth it. > > Or, any better way to do it? > > Hope I didn't miss anything. Comments are welcomed! > I don't have the details in mind again, but I suppose the only alternative to fixing a QEMU boilerplate code issue with new KVM kernel interface is abstracting the common patterns in QEMU that all the irqfd users share and solve solve that topic once. Might turn out, though, that the exiting kernel interface prevents this... Jan
On Wed, Jan 04, 2017 at 11:33:36AM +0100, Jan Kiszka wrote: > On 2017-01-03 07:15, Peter Xu wrote: > > On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote: > >> On 2016-06-26 03:48, Peter Xu wrote: > >>> On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote: > >>>> On 2016-06-25 15:18, Peter Xu wrote: > >>>>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote: > >>> > >>> [...] > >>> > >>>>> I have a thought on how to implement the "sink" you have mentioned: > >>>>> > >>>>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe > >>>>> called: > >>>>> > >>>>> KVM_IRQ_ROUTING_EVENTFD > >>>> > >>>> Not really, because all sources are either using eventfds, which you can > >>>> also terminate in user space (already done for vhost and vfio in certain > >>>> scenarios - IIRC) or originate there anyway (IOAPIC). > >>> > >>> But how should we handle the cases when the interrupt path are all in > >>> kernel? > >> > >> There are none which we can't redirect (only full in-kernel irqchip > >> would have, but that's unsupported anyway). > >> > >>> > >>> For vhost, data should be transfered all inside kernel when split > >>> irqchip and irqfd are used: when vhost got data, it triggers irqfd to > >>> deliver the interrupt to KVM. Along the way, we should all in kernel. > >>> > >>> For vfio, we have vfio_msihandler() who handles the hardware IRQ and > >>> then triggers irqfd as well to KVM. Again, it seems all in kernel > >>> space, no chance to stop that as well. > >>> > >>> Please correct me if I was wrong. > >> > >> Look at what vhost is doing e.g.: when a virtqueue is masked, it > >> installs an event notifier that records incoming events in a pending > >> state field. When it's unmasked, the corresponding KVM irqfd is installed. > > > > Hmm I think it's time I pick up this topic up again... :) > > > > Since it's been half a year from the last post of this thread (I > > believe this thread is the so-called "cold data" and should be stored > > on tapes already... and sorry fot the long delay), I'd like to do a > > quick summary on this: interrupt remap still cannot work well when we > > install fault interrupts - when that happens, we should inject VT-d > > fault, rather than keeping silence. > > > > The suggestion from Jan above should be a good solution that only need > > to touch qemu part - that's the most benefit AFAIU. However, OTOH IMO > > we need to modify all the kvm irqfd users with this fix (pci-assign, > > ioapic, ivshmem, vfio-pci, virtio) - we need to have all these devices > > init with an "fault sink" eventfd, then when we detected specific > > irqfd install error, we install the "fault sink". What's worse, if we > > add new devices with irqfd support, we need to implement the same > > error handling logic as well. Am I understanding it correctly? If so, > > isn't that awkward? > > > > Now I am re-thinking about my KVM_IRQ_ROUTING_EVENTFD proposal to do > > it - in that case, we should not need to worry about the users of kvm > > irqfd, and the error handling is done automatically even with new > > irqfd users coming in. The disadvantage is of course we need to touch > > both qemu and kvm, also we need to touch KVM API for it (though I > > think it'll only need very small change in KVM). And not sure whether > > that would worth it. > > > > Or, any better way to do it? > > > > Hope I didn't miss anything. Comments are welcomed! > > > > I don't have the details in mind again, but I suppose the only > alternative to fixing a QEMU boilerplate code issue with new KVM kernel > interface is abstracting the common patterns in QEMU that all the irqfd > users share and solve solve that topic once. Might turn out, though, > that the exiting kernel interface prevents this... Hmm, (after a quick glance) I was just afraid that I might need to touch lots of codes in QEMU even to provide such a common layer for this single fault tolerance feature. Then let me think it over again... Thanks Jan! -- peterx
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index d874596..0eaffc6 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2149,6 +2149,12 @@ do_not_translate: return 0; } +static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, + MSIMessage *dst, uint16_t sid) +{ + return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), src, dst); +} + static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, uint64_t *data, unsigned size, MemTxAttrs attrs) @@ -2393,6 +2399,7 @@ static void vtd_class_init(ObjectClass *klass, void *data) dc->props = vtd_properties; x86_class->realize = vtd_realize; x86_class->find_add_as = vtd_find_add_as; + x86_class->int_remap = vtd_int_remap; } static const TypeInfo vtd_info = { diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index b3f17d7..3bca390 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -26,6 +26,7 @@ #include "hw/i386/x86-iommu.h" #include "hw/i386/ioapic.h" #include "hw/pci/msi.h" +#include "hw/sysbus.h" #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" #define INTEL_IOMMU_DEVICE(obj) \ diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h index 07199be..b419ae5 100644 --- a/include/hw/i386/x86-iommu.h +++ b/include/hw/i386/x86-iommu.h @@ -22,6 +22,7 @@ #include "hw/sysbus.h" #include "exec/memory.h" +#include "hw/pci/pci.h" #define TYPE_X86_IOMMU_DEVICE ("x86-iommu") #define X86_IOMMU_DEVICE(obj) \ @@ -43,6 +44,9 @@ struct X86IOMMUClass { DeviceRealize realize; /* Find/Add IOMMU address space for specific PCI device */ AddressSpace *(*find_add_as)(X86IOMMUState *s, PCIBus *bus, int devfn); + /* MSI-based interrupt remapping */ + int (*int_remap)(X86IOMMUState *iommu, MSIMessage *src, + MSIMessage *dst, uint16_t sid); }; struct X86IOMMUState { diff --git a/target-i386/kvm.c b/target-i386/kvm.c index f3698f1..bfa40b2 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -35,6 +35,7 @@ #include "hw/i386/apic.h" #include "hw/i386/apic_internal.h" #include "hw/i386/apic-msidef.h" +#include "hw/i386/intel_iommu.h" #include "exec/ioport.h" #include "standard-headers/asm-x86/hyperv.h" @@ -42,6 +43,7 @@ #include "hw/pci/msi.h" #include "migration/migration.h" #include "exec/memattrs.h" +#include "trace.h" //#define DEBUG_KVM @@ -3323,6 +3325,31 @@ int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id) int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, uint64_t address, uint32_t data, PCIDevice *dev) { + X86IOMMUState *iommu = x86_iommu_get_default(); + + if (iommu) { + int ret; + MSIMessage src, dst; + X86IOMMUClass *class = X86_IOMMU_GET_CLASS(iommu); + + src.address = route->u.msi.address_hi; + src.address <<= VTD_MSI_ADDR_HI_SHIFT; + src.address |= route->u.msi.address_lo; + src.data = route->u.msi.data; + + ret = class->int_remap(iommu, &src, &dst, dev ? \ + pci_requester_id(dev) : \ + X86_IOMMU_SID_INVALID); + if (ret) { + trace_kvm_x86_fixup_msi_error(route->gsi); + return 1; + } + + route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT; + route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK; + route->u.msi.data = dst.data; + } + return 0; } diff --git a/trace-events b/trace-events index da0d060..2982f64 100644 --- a/trace-events +++ b/trace-events @@ -2206,3 +2206,6 @@ gicv3_redist_write(uint32_t cpu, uint64_t offset, uint64_t data, unsigned size, gicv3_redist_badwrite(uint32_t cpu, uint64_t offset, uint64_t data, unsigned size, bool secure) "GICv3 redistributor %x write: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u secure %d: error" gicv3_redist_set_irq(uint32_t cpu, int irq, int level) "GICv3 redistributor %x interrupt %d level changed to %d" gicv3_redist_send_sgi(uint32_t cpu, int irq) "GICv3 redistributor %x pending SGI %d" + +# target-i386/kvm.c +kvm_x86_fixup_msi_error(uint32_t gsi) "VT-d failed to remap interrupt for GSI %" PRIu32
In split irqchip mode, IOAPIC is working in user space, only update kernel irq routes when entry changed. When IR is enabled, we directly update the kernel with translated messages. It works just like a kernel cache for the remapping entries. Since KVM irqfd is using kernel gsi routes to deliver interrupts, as long as we can support split irqchip, we will support irqfd as well. Also, since kernel gsi routes will cache translated interrupts, irqfd delivery will not suffer from any performance impact due to IR. And, since we supported irqfd, vhost devices will be able to work seamlessly with IR now. Logically this should contain both vhost-net and vhost-user case. Signed-off-by: Peter Xu <peterx@redhat.com> --- hw/i386/intel_iommu.c | 7 +++++++ include/hw/i386/intel_iommu.h | 1 + include/hw/i386/x86-iommu.h | 4 ++++ target-i386/kvm.c | 27 +++++++++++++++++++++++++++ trace-events | 3 +++ 5 files changed, 42 insertions(+)