Message ID | 20230727072410.135743-2-jing2.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Support dynamic MSI-X allocation | expand |
Hello Jing, On 7/27/23 09:24, Jing Liu wrote: > From: Reinette Chatre <reinette.chatre@intel.com> > > Kernel provides the guidance of dynamic MSI-X allocation support of > passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to > guide user space. > > Fetch and store the flags from host for later use to determine if > specific flags are set. > > Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> > Signed-off-by: Jing Liu <jing2.liu@intel.com> > --- > hw/vfio/pci.c | 12 ++++++++++++ > hw/vfio/pci.h | 1 + > hw/vfio/trace-events | 2 ++ > 3 files changed, 15 insertions(+) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index a205c6b1130f..0c4ac0873d40 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) > > static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) > { > + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; > int ret; > Error *err = NULL; > > @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) > memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); > } > > + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > + if (ret) { > + /* This can fail for an old kernel or legacy PCI dev */ > + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno)); Is it possible to detect the error reported by a kernel (< 6.4) when dynamic MSI-X are not supported. Looking at vfio_pci_ioctl_get_irq_info() in the kernel, could info.flags be tested against VFIO_IRQ_INFO_NORESIZE ? In that case, QEMU should report an error and the trace event is not needed. Thanks, C. > + } else { > + vdev->msix->irq_info_flags = irq_info.flags; > + } > + trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name, > + vdev->msix->irq_info_flags); > + > return 0; > } > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index a2771b9ff3cc..ad34ec56d0ae 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { > uint32_t table_offset; > uint32_t pba_offset; > unsigned long *pending; > + uint32_t irq_info_flags; > } VFIOMSIXInfo; > > #define TYPE_VFIO_PCI "vfio-pci" > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index ee7509e68e4f..7d4a398f044d 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, @0x%x, > vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, @0x%x, 0x%x, len=0x%x)" > vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x" > vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d" > +vfio_msix_setup_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" > +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X irq info flags 0x%x" > vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap" > vfio_check_pm_reset(const char *name) "%s Supports PM reset" > vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
On Thu, 27 Jul 2023 03:24:08 -0400 Jing Liu <jing2.liu@intel.com> wrote: > From: Reinette Chatre <reinette.chatre@intel.com> > > Kernel provides the guidance of dynamic MSI-X allocation support of > passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to > guide user space. > > Fetch and store the flags from host for later use to determine if > specific flags are set. > > Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> > Signed-off-by: Jing Liu <jing2.liu@intel.com> > --- > hw/vfio/pci.c | 12 ++++++++++++ > hw/vfio/pci.h | 1 + > hw/vfio/trace-events | 2 ++ > 3 files changed, 15 insertions(+) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index a205c6b1130f..0c4ac0873d40 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) > > static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) > { > + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; > int ret; > Error *err = NULL; > > @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) > memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); > } > > + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > + if (ret) { > + /* This can fail for an old kernel or legacy PCI dev */ > + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno)); We only call vfio_msix_setup() if the device has an MSI-X capability, so the "legacy PCI" portion of this comment seems unjustified. Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also question the "old kernel" part of this comment. We don't currently sanity test the device exposed MSI-X info versus that reported by GET_IRQ_INFO, but it seems valid to do so. I'd expect this to happen in vfio_msix_early_setup() though, especially since that's where the remainder of VFIOMSIXInfo is setup. > + } else { > + vdev->msix->irq_info_flags = irq_info.flags; > + } > + trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name, > + vdev->msix->irq_info_flags); > + > return 0; > } > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index a2771b9ff3cc..ad34ec56d0ae 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { > uint32_t table_offset; > uint32_t pba_offset; > unsigned long *pending; > + uint32_t irq_info_flags; Why not simply pull out a "noresize" bool? Thanks, Alex > } VFIOMSIXInfo; > > #define TYPE_VFIO_PCI "vfio-pci" > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index ee7509e68e4f..7d4a398f044d 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, @0x%x, > vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, @0x%x, 0x%x, len=0x%x)" > vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x" > vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d" > +vfio_msix_setup_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" > +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X irq info flags 0x%x" > vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap" > vfio_check_pm_reset(const char *name) "%s Supports PM reset" > vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
Hi Alex, Thanks very much for reviewing the patches. > On July 28, 2023 1:25 AM, Alex Williamson <alex.williamson@redhat.com> wrote: > > On Thu, 27 Jul 2023 03:24:08 -0400 > Jing Liu <jing2.liu@intel.com> wrote: > > > From: Reinette Chatre <reinette.chatre@intel.com> > > > > Kernel provides the guidance of dynamic MSI-X allocation support of > > passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to > > guide user space. > > > > Fetch and store the flags from host for later use to determine if > > specific flags are set. > > > > Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> > > Signed-off-by: Jing Liu <jing2.liu@intel.com> > > --- > > hw/vfio/pci.c | 12 ++++++++++++ > > hw/vfio/pci.h | 1 + > > hw/vfio/trace-events | 2 ++ > > 3 files changed, 15 insertions(+) > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index > > a205c6b1130f..0c4ac0873d40 100644 > > --- a/hw/vfio/pci.c > > +++ b/hw/vfio/pci.c > > @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice > > *vdev, Error **errp) > > > > static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error > > **errp) { > > + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; > > int ret; > > Error *err = NULL; > > > > @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int > pos, Error **errp) > > memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); > > } > > > > + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; > > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > > + if (ret) { > > + /* This can fail for an old kernel or legacy PCI dev */ > > + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno)); > > We only call vfio_msix_setup() if the device has an MSI-X capability, so the > "legacy PCI" portion of this comment seems unjustified. > Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also question the > "old kernel" part of this comment. Oh, yes, I just realize that only VFIO_PCI_ERR_IRQ_INDEX and VFIO_PCI_REQ_IRQ_INDEX were added later in include/uapi/linux/vfio.h. Thus, this ioctl() with MSIX index would not fail by the old-kernel or legacy-PCI reason. Thanks for pointing out this to me. We don't currently sanity test the device > exposed MSI-X info versus that reported by GET_IRQ_INFO, but it seems valid to > do so. Do we want to keep the check of possible failure from kernel (e.g., -EFAULT) and report the error code back to caller? Maybe like this, static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) { .... msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); if (ret < 0) { error_setg_errno(errp, -ret, "failed to get MSI-X IRQ INFO"); return; } else { vdev->msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE); } ... trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar, msix->table_offset, msix->entries, vdev->msix->noresize); .... } > I'd expect this to happen in vfio_msix_early_setup() though, especially > since that's where the remainder of VFIOMSIXInfo is setup. > > > + } else { > > + vdev->msix->irq_info_flags = irq_info.flags; > > + } > > + trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name, > > + vdev->msix->irq_info_flags); > > + > > return 0; > > } > > > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index > > a2771b9ff3cc..ad34ec56d0ae 100644 > > --- a/hw/vfio/pci.h > > +++ b/hw/vfio/pci.h > > @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { > > uint32_t table_offset; > > uint32_t pba_offset; > > unsigned long *pending; > > + uint32_t irq_info_flags; > > Why not simply pull out a "noresize" bool? Thanks, > Will change to a bool type. Thanks, Jing > Alex > > > } VFIOMSIXInfo; > > > > #define TYPE_VFIO_PCI "vfio-pci" > > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index > > ee7509e68e4f..7d4a398f044d 100644 > > --- a/hw/vfio/trace-events > > +++ b/hw/vfio/trace-events > > @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int > > len, int val) " (%s, @0x%x, vfio_pci_write_config(const char *name, int addr, int > val, int len) " (%s, @0x%x, 0x%x, len=0x%x)" > > vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x" > > vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int > entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d" > > +vfio_msix_setup_get_irq_info_failure(const char *errstr) > "VFIO_DEVICE_GET_IRQ_INFO failure: %s" > > +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X > irq info flags 0x%x" > > vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap" > > vfio_check_pm_reset(const char *name) "%s Supports PM reset" > > vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
On 7/28/23 10:09, Liu, Jing2 wrote: > Hi Alex, > > Thanks very much for reviewing the patches. > >> On July 28, 2023 1:25 AM, Alex Williamson <alex.williamson@redhat.com> wrote: >> >> On Thu, 27 Jul 2023 03:24:08 -0400 >> Jing Liu <jing2.liu@intel.com> wrote: >> >>> From: Reinette Chatre <reinette.chatre@intel.com> >>> >>> Kernel provides the guidance of dynamic MSI-X allocation support of >>> passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to >>> guide user space. >>> >>> Fetch and store the flags from host for later use to determine if >>> specific flags are set. >>> >>> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> >>> Signed-off-by: Jing Liu <jing2.liu@intel.com> >>> --- >>> hw/vfio/pci.c | 12 ++++++++++++ >>> hw/vfio/pci.h | 1 + >>> hw/vfio/trace-events | 2 ++ >>> 3 files changed, 15 insertions(+) >>> >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index >>> a205c6b1130f..0c4ac0873d40 100644 >>> --- a/hw/vfio/pci.c >>> +++ b/hw/vfio/pci.c >>> @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice >>> *vdev, Error **errp) >>> >>> static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error >>> **errp) { >>> + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; >>> int ret; >>> Error *err = NULL; >>> >>> @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int >> pos, Error **errp) >>> memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); >>> } >>> >>> + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; >>> + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); >>> + if (ret) { >>> + /* This can fail for an old kernel or legacy PCI dev */ >>> + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno)); >> >> We only call vfio_msix_setup() if the device has an MSI-X capability, so the >> "legacy PCI" portion of this comment seems unjustified. >> Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also question the >> "old kernel" part of this comment. > > Oh, yes, I just realize that only VFIO_PCI_ERR_IRQ_INDEX and > VFIO_PCI_REQ_IRQ_INDEX were added later in include/uapi/linux/vfio.h. Thus, > this ioctl() with MSIX index would not fail by the old-kernel or legacy-PCI reason. > Thanks for pointing out this to me. > > We don't currently sanity test the device >> exposed MSI-X info versus that reported by GET_IRQ_INFO, but it seems valid to >> do so. > > Do we want to keep the check of possible failure from kernel (e.g., -EFAULT) and report > the error code back to caller? Maybe like this, > > static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) > { > .... > msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; > > ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > if (ret < 0) { > error_setg_errno(errp, -ret, "failed to get MSI-X IRQ INFO"); > return; > } else { > vdev->msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE); > } > ... > trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar, > msix->table_offset, msix->entries, vdev->msix->noresize); In the trace event, please ouput irq_info.flags since it gives more information on the value returned by the kernel. > .... > } > >> I'd expect this to happen in vfio_msix_early_setup() though, especially >> since that's where the remainder of VFIOMSIXInfo is setup. > >> >>> + } else { >>> + vdev->msix->irq_info_flags = irq_info.flags; >>> + } >>> + trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name, >>> + vdev->msix->irq_info_flags); >>> + >>> return 0; >>> } >>> >>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index >>> a2771b9ff3cc..ad34ec56d0ae 100644 >>> --- a/hw/vfio/pci.h >>> +++ b/hw/vfio/pci.h >>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { >>> uint32_t table_offset; >>> uint32_t pba_offset; >>> unsigned long *pending; >>> + uint32_t irq_info_flags; >> >> Why not simply pull out a "noresize" bool? Thanks, >> > Will change to a bool type. I would simply cache the KVM flags value under VFIOMSIXInfo as you did and add an helper. Both work the same but the intial proposal keeps more information. This is minor. Thanks, C. > > Thanks, > Jing > >> Alex >> >>> } VFIOMSIXInfo; >>> >>> #define TYPE_VFIO_PCI "vfio-pci" >>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index >>> ee7509e68e4f..7d4a398f044d 100644 >>> --- a/hw/vfio/trace-events >>> +++ b/hw/vfio/trace-events >>> @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int >>> len, int val) " (%s, @0x%x, vfio_pci_write_config(const char *name, int addr, int >> val, int len) " (%s, @0x%x, 0x%x, len=0x%x)" >>> vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x" >>> vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int >> entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d" >>> +vfio_msix_setup_get_irq_info_failure(const char *errstr) >> "VFIO_DEVICE_GET_IRQ_INFO failure: %s" >>> +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X >> irq info flags 0x%x" >>> vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap" >>> vfio_check_pm_reset(const char *name) "%s Supports PM reset" >>> vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap" >
Hi C., Thanks very much for reviewing the patches. > On July 28, 2023 12:58 AM, Cédric Le Goater <clg@redhat.com> wrote: > > Hello Jing, > > On 7/27/23 09:24, Jing Liu wrote: > > From: Reinette Chatre <reinette.chatre@intel.com> > > > > Kernel provides the guidance of dynamic MSI-X allocation support of > > passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to > > guide user space. > > > > Fetch and store the flags from host for later use to determine if > > specific flags are set. > > > > Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> > > Signed-off-by: Jing Liu <jing2.liu@intel.com> > > --- > > hw/vfio/pci.c | 12 ++++++++++++ > > hw/vfio/pci.h | 1 + > > hw/vfio/trace-events | 2 ++ > > 3 files changed, 15 insertions(+) > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index > > a205c6b1130f..0c4ac0873d40 100644 > > --- a/hw/vfio/pci.c > > +++ b/hw/vfio/pci.c > > @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice > > *vdev, Error **errp) > > > > static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) > > { > > + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; > > int ret; > > Error *err = NULL; > > > > @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int > pos, Error **errp) > > memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); > > } > > > > + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; > > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > > + if (ret) { > > + /* This can fail for an old kernel or legacy PCI dev */ > > + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno)); > > Is it possible to detect the error reported by a kernel (< 6.4) when dynamic MSI-X > are not supported. I just realized that ioctl(VFIO_DEVICE_GET_IRQ_INFO) with VFIO_PCI_MSIX_IRQ_INDEX should always exists and would not cause failure for older kernel reason, after reading Alex's comment on this patch. (If my understanding is correct) So the possible failure might be other reason except old kernel or legacy PCI dev. Looking at vfio_pci_ioctl_get_irq_info() in the kernel, could > info.flags be tested against VFIO_IRQ_INFO_NORESIZE ? > Sorry I didn't quite understand "info.flags be tested against VFIO_IRQ_INFO_NORESIZE". I saw kernel < 6.4 simply added NORESIZE to info.flags and latest kernel adds if has_dyn_msix. Would you please kindly describe more on your point? > In that case, QEMU should report an error and the trace event is not needed. I replied an email with new error handling draft code based on my understanding, which reports the error and need no trace. Could you please help review if that is what we want? Thanks, Jing > > Thanks, > > C. > > > + } else { > > + vdev->msix->irq_info_flags = irq_info.flags; > > + } > > + trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name, > > + vdev->msix->irq_info_flags); > > + > > return 0; > > } > > > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index > > a2771b9ff3cc..ad34ec56d0ae 100644 > > --- a/hw/vfio/pci.h > > +++ b/hw/vfio/pci.h > > @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { > > uint32_t table_offset; > > uint32_t pba_offset; > > unsigned long *pending; > > + uint32_t irq_info_flags; > > } VFIOMSIXInfo; > > > > #define TYPE_VFIO_PCI "vfio-pci" > > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index > > ee7509e68e4f..7d4a398f044d 100644 > > --- a/hw/vfio/trace-events > > +++ b/hw/vfio/trace-events > > @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int len, > int val) " (%s, @0x%x, > > vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, > @0x%x, 0x%x, len=0x%x)" > > vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x" > > vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int > entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d" > > +vfio_msix_setup_get_irq_info_failure(const char *errstr) > "VFIO_DEVICE_GET_IRQ_INFO failure: %s" > > +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X > irq info flags 0x%x" > > vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap" > > vfio_check_pm_reset(const char *name) "%s Supports PM reset" > > vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
[ ... ] > Sorry I didn't quite understand "info.flags be tested against VFIO_IRQ_INFO_NORESIZE". > I saw kernel < 6.4 simply added NORESIZE to info.flags and latest kernel adds if has_dyn_msix. > Would you please kindly describe more on your point? I was trying to find the conditions to detect safely that the kernel didn't have dynamic MSI-X support. Testing VFIO_IRQ_INFO_NORESIZE seems enough. >> In that case, QEMU should report an error and the trace event is not needed. > > I replied an email with new error handling draft code based on my understanding, which > reports the error and need no trace. Could you please help review if that is what we want? yes. It looked good. Please send a v1 ! Thanks, Cédric.
On Fri, 28 Jul 2023 10:27:17 +0200 Cédric Le Goater <clg@redhat.com> wrote: > On 7/28/23 10:09, Liu, Jing2 wrote: > > Hi Alex, > > > > Thanks very much for reviewing the patches. > > > >> On July 28, 2023 1:25 AM, Alex Williamson <alex.williamson@redhat.com> wrote: > >> > >> On Thu, 27 Jul 2023 03:24:08 -0400 > >> Jing Liu <jing2.liu@intel.com> wrote: > >> > >>> From: Reinette Chatre <reinette.chatre@intel.com> > >>> > >>> Kernel provides the guidance of dynamic MSI-X allocation support of > >>> passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to > >>> guide user space. > >>> > >>> Fetch and store the flags from host for later use to determine if > >>> specific flags are set. > >>> > >>> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> > >>> Signed-off-by: Jing Liu <jing2.liu@intel.com> > >>> --- > >>> hw/vfio/pci.c | 12 ++++++++++++ > >>> hw/vfio/pci.h | 1 + > >>> hw/vfio/trace-events | 2 ++ > >>> 3 files changed, 15 insertions(+) > >>> > >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index > >>> a205c6b1130f..0c4ac0873d40 100644 > >>> --- a/hw/vfio/pci.c > >>> +++ b/hw/vfio/pci.c > >>> @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice > >>> *vdev, Error **errp) > >>> > >>> static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error > >>> **errp) { > >>> + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; > >>> int ret; > >>> Error *err = NULL; > >>> > >>> @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int > >> pos, Error **errp) > >>> memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); > >>> } > >>> > >>> + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; > >>> + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > >>> + if (ret) { > >>> + /* This can fail for an old kernel or legacy PCI dev */ > >>> + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno)); > >> > >> We only call vfio_msix_setup() if the device has an MSI-X capability, so the > >> "legacy PCI" portion of this comment seems unjustified. > >> Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also question the > >> "old kernel" part of this comment. > > > > Oh, yes, I just realize that only VFIO_PCI_ERR_IRQ_INDEX and > > VFIO_PCI_REQ_IRQ_INDEX were added later in include/uapi/linux/vfio.h. Thus, > > this ioctl() with MSIX index would not fail by the old-kernel or legacy-PCI reason. > > Thanks for pointing out this to me. > > > > We don't currently sanity test the device > >> exposed MSI-X info versus that reported by GET_IRQ_INFO, but it seems valid to > >> do so. > > > > Do we want to keep the check of possible failure from kernel (e.g., -EFAULT) and report > > the error code back to caller? Maybe like this, > > > > static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) > > { > > .... > > msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; > > > > ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > > if (ret < 0) { > > error_setg_errno(errp, -ret, "failed to get MSI-X IRQ INFO"); > > return; Yes. > > } else { > > vdev->msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE); > > } No 'else' required here since the error branch returns. > > ... > > trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar, > > msix->table_offset, msix->entries, vdev->msix->noresize); > > In the trace event, please ouput irq_info.flags since it gives more > information on the value returned by the kernel. > > > .... > > } > > > >> I'd expect this to happen in vfio_msix_early_setup() though, especially > >> since that's where the remainder of VFIOMSIXInfo is setup. > > > >> > >>> + } else { > >>> + vdev->msix->irq_info_flags = irq_info.flags; > >>> + } > >>> + trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name, > >>> + vdev->msix->irq_info_flags); > >>> + > >>> return 0; > >>> } > >>> > >>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index > >>> a2771b9ff3cc..ad34ec56d0ae 100644 > >>> --- a/hw/vfio/pci.h > >>> +++ b/hw/vfio/pci.h > >>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { > >>> uint32_t table_offset; > >>> uint32_t pba_offset; > >>> unsigned long *pending; > >>> + uint32_t irq_info_flags; > >> > >> Why not simply pull out a "noresize" bool? Thanks, > >> > > Will change to a bool type. > > I would simply cache the KVM flags value under VFIOMSIXInfo as you > did and add an helper. Both work the same but the intial proposal > keeps more information. This is minor. TBH, I'd still prefer that we only store the one field we care about and avoid an extra helper, regardless of how simple it might be. Even if we eventually add masking for MSI-X, we can store it in less space and more accessibly decoded in the VFIOMSIXInfo struct vs helpers to access a cached flags value. Thanks, Alex
[ ... ] >>>>> --- a/hw/vfio/pci.h >>>>> +++ b/hw/vfio/pci.h >>>>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { >>>>> uint32_t table_offset; >>>>> uint32_t pba_offset; >>>>> unsigned long *pending; >>>>> + uint32_t irq_info_flags; >>>> >>>> Why not simply pull out a "noresize" bool? Thanks, >>>> >>> Will change to a bool type. >> >> I would simply cache the KVM flags value under VFIOMSIXInfo as you >> did and add an helper. Both work the same but the intial proposal >> keeps more information. This is minor. > > TBH, I'd still prefer that we only store the one field we care about > and avoid an extra helper, regardless of how simple it might be. Even > if we eventually add masking for MSI-X, we can store it in less space > and more accessibly decoded in the VFIOMSIXInfo struct vs helpers to > access a cached flags value. Thanks, Let's use a bool then. np. Thanks, C.
Hi Alex, > On July 28, 2023 11:41 PM, Alex Williamson <alex.williamson@redhat.com> wrote: > > On Fri, 28 Jul 2023 10:27:17 +0200 > Cédric Le Goater <clg@redhat.com> wrote: > > > On 7/28/23 10:09, Liu, Jing2 wrote: > > > Hi Alex, > > > > > > Thanks very much for reviewing the patches. > > > > > >> On July 28, 2023 1:25 AM, Alex Williamson > <alex.williamson@redhat.com> wrote: > > >> > > >> On Thu, 27 Jul 2023 03:24:08 -0400 > > >> Jing Liu <jing2.liu@intel.com> wrote: > > >> > > >>> From: Reinette Chatre <reinette.chatre@intel.com> > > >>> > > >>> Kernel provides the guidance of dynamic MSI-X allocation support > > >>> of passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag > > >>> to guide user space. > > >>> > > >>> Fetch and store the flags from host for later use to determine if > > >>> specific flags are set. > > >>> > > >>> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> > > >>> Signed-off-by: Jing Liu <jing2.liu@intel.com> > > >>> --- > > >>> hw/vfio/pci.c | 12 ++++++++++++ > > >>> hw/vfio/pci.h | 1 + > > >>> hw/vfio/trace-events | 2 ++ > > >>> 3 files changed, 15 insertions(+) > > >>> > > >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index > > >>> a205c6b1130f..0c4ac0873d40 100644 > > >>> --- a/hw/vfio/pci.c > > >>> +++ b/hw/vfio/pci.c > > >>> @@ -1572,6 +1572,7 @@ static void > > >>> vfio_msix_early_setup(VFIOPCIDevice > > >>> *vdev, Error **errp) > > >>> > > >>> static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error > > >>> **errp) { > > >>> + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) > > >>> + }; > > >>> int ret; > > >>> Error *err = NULL; > > >>> > > >>> @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice > > >>> *vdev, int > > >> pos, Error **errp) > > >>> memory_region_set_enabled(&vdev->pdev.msix_table_mmio, > false); > > >>> } > > >>> > > >>> + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; > > >>> + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, > &irq_info); > > >>> + if (ret) { > > >>> + /* This can fail for an old kernel or legacy PCI dev */ > > >>> + > > >>> + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno)); > > >> > > >> We only call vfio_msix_setup() if the device has an MSI-X > > >> capability, so the "legacy PCI" portion of this comment seems unjustified. > > >> Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also > > >> question the "old kernel" part of this comment. > > > > > > Oh, yes, I just realize that only VFIO_PCI_ERR_IRQ_INDEX and > > > VFIO_PCI_REQ_IRQ_INDEX were added later in > > > include/uapi/linux/vfio.h. Thus, this ioctl() with MSIX index would not fail by > the old-kernel or legacy-PCI reason. > > > Thanks for pointing out this to me. > > > > > > We don't currently sanity test the device > > >> exposed MSI-X info versus that reported by GET_IRQ_INFO, but it > > >> seems valid to do so. > > > > > > Do we want to keep the check of possible failure from kernel (e.g., > > > -EFAULT) and report the error code back to caller? Maybe like this, > > > > > > static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) > > > { > > > .... > > > msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; > > > > > > ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > > > if (ret < 0) { > > > error_setg_errno(errp, -ret, "failed to get MSI-X IRQ INFO"); > > > return; > > Yes. > > > > } else { > > > vdev->msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE); > > > } > > No 'else' required here since the error branch returns. Oh, yes. Will remove the "else" and just set the ”noresize“ value in next version. > > > > ... > > > trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix- > >table_bar, > > > msix->table_offset, msix->entries, > > > vdev->msix->noresize); > > > > In the trace event, please ouput irq_info.flags since it gives more > > information on the value returned by the kernel. > > > > > .... > > > } > > > > > >> I'd expect this to happen in vfio_msix_early_setup() though, > > >> especially since that's where the remainder of VFIOMSIXInfo is setup. > > > > > >> > > >>> + } else { > > >>> + vdev->msix->irq_info_flags = irq_info.flags; > > >>> + } > > >>> + trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name, > > >>> + > > >>> + vdev->msix->irq_info_flags); > > >>> + > > >>> return 0; > > >>> } > > >>> > > >>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index > > >>> a2771b9ff3cc..ad34ec56d0ae 100644 > > >>> --- a/hw/vfio/pci.h > > >>> +++ b/hw/vfio/pci.h > > >>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { > > >>> uint32_t table_offset; > > >>> uint32_t pba_offset; > > >>> unsigned long *pending; > > >>> + uint32_t irq_info_flags; > > >> > > >> Why not simply pull out a "noresize" bool? Thanks, > > >> > > > Will change to a bool type. > > > > I would simply cache the KVM flags value under VFIOMSIXInfo as you did > > and add an helper. Both work the same but the intial proposal keeps > > more information. This is minor. > > TBH, I'd still prefer that we only store the one field we care about and avoid an > extra helper, regardless of how simple it might be. Even if we eventually add > masking for MSI-X, we can store it in less space and more accessibly decoded in > the VFIOMSIXInfo struct vs helpers to access a cached flags value. Thanks, > Got it, thanks, Jing > Alex
Hi C. > On July 28, 2023 4:44 PM, Cédric Le Goater <clg@redhat.com> wrote: > > [ ... ] > > > Sorry I didn't quite understand "info.flags be tested against > VFIO_IRQ_INFO_NORESIZE". > > I saw kernel < 6.4 simply added NORESIZE to info.flags and latest kernel adds > if has_dyn_msix. > > Would you please kindly describe more on your point? > > I was trying to find the conditions to detect safely that the kernel didn't have > dynamic MSI-X support. Testing VFIO_IRQ_INFO_NORESIZE seems enough. > Oh, I see. > >> In that case, QEMU should report an error and the trace event is not > needed. > > > > I replied an email with new error handling draft code based on my > > understanding, which reports the error and need no trace. Could you please > help review if that is what we want? > > yes. It looked good. Please send a v1 ! Thanks for reviewing that. I guess you mean v2 for next version 😊 Jing > > Thanks, > > Cédric.
On 7/31/23 05:57, Liu, Jing2 wrote: > Hi C. > >> On July 28, 2023 4:44 PM, Cédric Le Goater <clg@redhat.com> wrote: >> >> [ ... ] >> >>> Sorry I didn't quite understand "info.flags be tested against >> VFIO_IRQ_INFO_NORESIZE". >>> I saw kernel < 6.4 simply added NORESIZE to info.flags and latest kernel adds >> if has_dyn_msix. >>> Would you please kindly describe more on your point? >> >> I was trying to find the conditions to detect safely that the kernel didn't have >> dynamic MSI-X support. Testing VFIO_IRQ_INFO_NORESIZE seems enough. >> > Oh, I see. > >>>> In that case, QEMU should report an error and the trace event is not >> needed. >>> >>> I replied an email with new error handling draft code based on my >>> understanding, which reports the error and need no trace. Could you please >> help review if that is what we want? >> >> yes. It looked good. Please send a v1 ! > > Thanks for reviewing that. I guess you mean v2 for next version 😊 Well, if you remove the RFC status, I think you should, this would still be a v1. Thanks, C.
Hi C. > On July 31, 2023 3:26 PM, Cédric Le Goater <clg@redhat.com> wrote: > > On 7/31/23 05:57, Liu, Jing2 wrote: > > Hi C. > > > >> On July 28, 2023 4:44 PM, Cédric Le Goater <clg@redhat.com> wrote: > >> > >> [ ... ] > >> > >>> Sorry I didn't quite understand "info.flags be tested against > >> VFIO_IRQ_INFO_NORESIZE". > >>> I saw kernel < 6.4 simply added NORESIZE to info.flags and latest > >>> kernel adds > >> if has_dyn_msix. > >>> Would you please kindly describe more on your point? > >> > >> I was trying to find the conditions to detect safely that the kernel > >> didn't have dynamic MSI-X support. Testing VFIO_IRQ_INFO_NORESIZE > seems enough. > >> > > Oh, I see. > > > >>>> In that case, QEMU should report an error and the trace event is > >>>> not > >> needed. > >>> > >>> I replied an email with new error handling draft code based on my > >>> understanding, which reports the error and need no trace. Could you > >>> please > >> help review if that is what we want? > >> > >> yes. It looked good. Please send a v1 ! > > > > Thanks for reviewing that. I guess you mean v2 for next version 😊 > > Well, if you remove the RFC status, I think you should, this would still be a v1. > Oh, got it. Thanks for telling this. Jing > Thanks, > > C.
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index a205c6b1130f..0c4ac0873d40 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) { + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; int ret; Error *err = NULL; @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); } + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + if (ret) { + /* This can fail for an old kernel or legacy PCI dev */ + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno)); + } else { + vdev->msix->irq_info_flags = irq_info.flags; + } + trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name, + vdev->msix->irq_info_flags); + return 0; } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index a2771b9ff3cc..ad34ec56d0ae 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo { uint32_t table_offset; uint32_t pba_offset; unsigned long *pending; + uint32_t irq_info_flags; } VFIOMSIXInfo; #define TYPE_VFIO_PCI "vfio-pci" diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index ee7509e68e4f..7d4a398f044d 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, @0x%x, vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, @0x%x, 0x%x, len=0x%x)" vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x" vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d" +vfio_msix_setup_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X irq info flags 0x%x" vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap" vfio_check_pm_reset(const char *name) "%s Supports PM reset" vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"