diff mbox series

[RFC,v1,1/3] vfio/pci: detect the support of dynamic MSI-X allocation

Message ID 20230727072410.135743-2-jing2.liu@intel.com
State New
Headers show
Series Support dynamic MSI-X allocation | expand

Commit Message

Jing Liu July 27, 2023, 7:24 a.m. UTC
From: Reinette Chatre <reinette.chatre@intel.com>

Kernel provides the guidance of dynamic MSI-X allocation support of
passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
guide user space.

Fetch and store the flags from host for later use to determine if
specific flags are set.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Jing Liu <jing2.liu@intel.com>
---
 hw/vfio/pci.c        | 12 ++++++++++++
 hw/vfio/pci.h        |  1 +
 hw/vfio/trace-events |  2 ++
 3 files changed, 15 insertions(+)

Comments

Cédric Le Goater July 27, 2023, 4:58 p.m. UTC | #1
Hello Jing,

On 7/27/23 09:24, Jing Liu wrote:
> From: Reinette Chatre <reinette.chatre@intel.com>
> 
> Kernel provides the guidance of dynamic MSI-X allocation support of
> passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
> guide user space.
> 
> Fetch and store the flags from host for later use to determine if
> specific flags are set.
> 
> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
> Signed-off-by: Jing Liu <jing2.liu@intel.com>
> ---
>   hw/vfio/pci.c        | 12 ++++++++++++
>   hw/vfio/pci.h        |  1 +
>   hw/vfio/trace-events |  2 ++
>   3 files changed, 15 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a205c6b1130f..0c4ac0873d40 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>   
>   static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>   {
> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
>       int ret;
>       Error *err = NULL;
>   
> @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>           memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
>       }
>   
> +    irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> +    if (ret) {
> +        /* This can fail for an old kernel or legacy PCI dev */
> +        trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));

Is it possible to detect the error reported by a kernel (< 6.4) when
dynamic MSI-X are not supported. Looking at vfio_pci_ioctl_get_irq_info()
in the kernel, could info.flags be tested against VFIO_IRQ_INFO_NORESIZE ?

In that case, QEMU should report an error and the trace event is not
needed.

Thanks,

C.

> +    } else {
> +        vdev->msix->irq_info_flags = irq_info.flags;
> +    }
> +    trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
> +                                         vdev->msix->irq_info_flags);
> +
>       return 0;
>   }
>   
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index a2771b9ff3cc..ad34ec56d0ae 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
>       uint32_t table_offset;
>       uint32_t pba_offset;
>       unsigned long *pending;
> +    uint32_t irq_info_flags;
>   } VFIOMSIXInfo;
>   
>   #define TYPE_VFIO_PCI "vfio-pci"
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index ee7509e68e4f..7d4a398f044d 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, @0x%x,
>   vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, @0x%x, 0x%x, len=0x%x)"
>   vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
>   vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
> +vfio_msix_setup_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
> +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X irq info flags 0x%x"
>   vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
>   vfio_check_pm_reset(const char *name) "%s Supports PM reset"
>   vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
Alex Williamson July 27, 2023, 5:24 p.m. UTC | #2
On Thu, 27 Jul 2023 03:24:08 -0400
Jing Liu <jing2.liu@intel.com> wrote:

> From: Reinette Chatre <reinette.chatre@intel.com>
> 
> Kernel provides the guidance of dynamic MSI-X allocation support of
> passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
> guide user space.
> 
> Fetch and store the flags from host for later use to determine if
> specific flags are set.
> 
> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
> Signed-off-by: Jing Liu <jing2.liu@intel.com>
> ---
>  hw/vfio/pci.c        | 12 ++++++++++++
>  hw/vfio/pci.h        |  1 +
>  hw/vfio/trace-events |  2 ++
>  3 files changed, 15 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a205c6b1130f..0c4ac0873d40 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>  
>  static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>  {
> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
>      int ret;
>      Error *err = NULL;
>  
> @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>          memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
>      }
>  
> +    irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> +    if (ret) {
> +        /* This can fail for an old kernel or legacy PCI dev */
> +        trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));

We only call vfio_msix_setup() if the device has an MSI-X capability,
so the "legacy PCI" portion of this comment seems unjustified.
Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also
question the "old kernel" part of this comment.  We don't currently
sanity test the device exposed MSI-X info versus that reported by
GET_IRQ_INFO, but it seems valid to do so.  I'd expect this to happen
in vfio_msix_early_setup() though, especially since that's where the
remainder of VFIOMSIXInfo is setup.

> +    } else {
> +        vdev->msix->irq_info_flags = irq_info.flags;
> +    }
> +    trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
> +                                         vdev->msix->irq_info_flags);
> +
>      return 0;
>  }
>  
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index a2771b9ff3cc..ad34ec56d0ae 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
>      uint32_t table_offset;
>      uint32_t pba_offset;
>      unsigned long *pending;
> +    uint32_t irq_info_flags;

Why not simply pull out a "noresize" bool?  Thanks,

Alex

>  } VFIOMSIXInfo;
>  
>  #define TYPE_VFIO_PCI "vfio-pci"
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index ee7509e68e4f..7d4a398f044d 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, @0x%x,
>  vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, @0x%x, 0x%x, len=0x%x)"
>  vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
>  vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
> +vfio_msix_setup_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
> +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X irq info flags 0x%x"
>  vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
>  vfio_check_pm_reset(const char *name) "%s Supports PM reset"
>  vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
Jing Liu July 28, 2023, 8:09 a.m. UTC | #3
Hi Alex,

Thanks very much for reviewing the patches.

> On July 28, 2023 1:25 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Thu, 27 Jul 2023 03:24:08 -0400
> Jing Liu <jing2.liu@intel.com> wrote:
> 
> > From: Reinette Chatre <reinette.chatre@intel.com>
> >
> > Kernel provides the guidance of dynamic MSI-X allocation support of
> > passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
> > guide user space.
> >
> > Fetch and store the flags from host for later use to determine if
> > specific flags are set.
> >
> > Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
> > Signed-off-by: Jing Liu <jing2.liu@intel.com>
> > ---
> >  hw/vfio/pci.c        | 12 ++++++++++++
> >  hw/vfio/pci.h        |  1 +
> >  hw/vfio/trace-events |  2 ++
> >  3 files changed, 15 insertions(+)
> >
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index
> > a205c6b1130f..0c4ac0873d40 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice
> > *vdev, Error **errp)
> >
> >  static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error
> > **errp)  {
> > +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
> >      int ret;
> >      Error *err = NULL;
> >
> > @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int
> pos, Error **errp)
> >          memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
> >      }
> >
> > +    irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
> > +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> > +    if (ret) {
> > +        /* This can fail for an old kernel or legacy PCI dev */
> > +        trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));
> 
> We only call vfio_msix_setup() if the device has an MSI-X capability, so the
> "legacy PCI" portion of this comment seems unjustified.
> Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also question the
> "old kernel" part of this comment.  

Oh, yes, I just realize that only VFIO_PCI_ERR_IRQ_INDEX and 
VFIO_PCI_REQ_IRQ_INDEX were added later in include/uapi/linux/vfio.h. Thus,
this ioctl() with MSIX index would not fail by the old-kernel or legacy-PCI reason.
Thanks for pointing out this to me.

We don't currently sanity test the device
> exposed MSI-X info versus that reported by GET_IRQ_INFO, but it seems valid to
> do so.  

Do we want to keep the check of possible failure from kernel (e.g., -EFAULT) and report 
the error code back to caller? Maybe like this,

static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
{
    ....
    msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get MSI-X IRQ INFO");
        return;
    } else {
        vdev->msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
    }
    ...
    trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
                                msix->table_offset, msix->entries, vdev->msix->noresize);
    ....
}

> I'd expect this to happen in vfio_msix_early_setup() though, especially
> since that's where the remainder of VFIOMSIXInfo is setup.

> 
> > +    } else {
> > +        vdev->msix->irq_info_flags = irq_info.flags;
> > +    }
> > +    trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
> > +                                         vdev->msix->irq_info_flags);
> > +
> >      return 0;
> >  }
> >
> > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index
> > a2771b9ff3cc..ad34ec56d0ae 100644
> > --- a/hw/vfio/pci.h
> > +++ b/hw/vfio/pci.h
> > @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
> >      uint32_t table_offset;
> >      uint32_t pba_offset;
> >      unsigned long *pending;
> > +    uint32_t irq_info_flags;
> 
> Why not simply pull out a "noresize" bool?  Thanks,
> 
Will change to a bool type.

Thanks,
Jing

> Alex
> 
> >  } VFIOMSIXInfo;
> >
> >  #define TYPE_VFIO_PCI "vfio-pci"
> > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index
> > ee7509e68e4f..7d4a398f044d 100644
> > --- a/hw/vfio/trace-events
> > +++ b/hw/vfio/trace-events
> > @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int
> > len, int val) " (%s, @0x%x,  vfio_pci_write_config(const char *name, int addr, int
> val, int len) " (%s, @0x%x, 0x%x, len=0x%x)"
> >  vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
> >  vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int
> entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
> > +vfio_msix_setup_get_irq_info_failure(const char *errstr)
> "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
> > +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X
> irq info flags 0x%x"
> >  vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
> >  vfio_check_pm_reset(const char *name) "%s Supports PM reset"
> >  vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
Cédric Le Goater July 28, 2023, 8:27 a.m. UTC | #4
On 7/28/23 10:09, Liu, Jing2 wrote:
> Hi Alex,
> 
> Thanks very much for reviewing the patches.
> 
>> On July 28, 2023 1:25 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>
>> On Thu, 27 Jul 2023 03:24:08 -0400
>> Jing Liu <jing2.liu@intel.com> wrote:
>>
>>> From: Reinette Chatre <reinette.chatre@intel.com>
>>>
>>> Kernel provides the guidance of dynamic MSI-X allocation support of
>>> passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
>>> guide user space.
>>>
>>> Fetch and store the flags from host for later use to determine if
>>> specific flags are set.
>>>
>>> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
>>> Signed-off-by: Jing Liu <jing2.liu@intel.com>
>>> ---
>>>   hw/vfio/pci.c        | 12 ++++++++++++
>>>   hw/vfio/pci.h        |  1 +
>>>   hw/vfio/trace-events |  2 ++
>>>   3 files changed, 15 insertions(+)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index
>>> a205c6b1130f..0c4ac0873d40 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice
>>> *vdev, Error **errp)
>>>
>>>   static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error
>>> **errp)  {
>>> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
>>>       int ret;
>>>       Error *err = NULL;
>>>
>>> @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int
>> pos, Error **errp)
>>>           memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
>>>       }
>>>
>>> +    irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
>>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
>>> +    if (ret) {
>>> +        /* This can fail for an old kernel or legacy PCI dev */
>>> +        trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));
>>
>> We only call vfio_msix_setup() if the device has an MSI-X capability, so the
>> "legacy PCI" portion of this comment seems unjustified.
>> Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also question the
>> "old kernel" part of this comment.
> 
> Oh, yes, I just realize that only VFIO_PCI_ERR_IRQ_INDEX and
> VFIO_PCI_REQ_IRQ_INDEX were added later in include/uapi/linux/vfio.h. Thus,
> this ioctl() with MSIX index would not fail by the old-kernel or legacy-PCI reason.
> Thanks for pointing out this to me.
> 
> We don't currently sanity test the device
>> exposed MSI-X info versus that reported by GET_IRQ_INFO, but it seems valid to
>> do so.
> 
> Do we want to keep the check of possible failure from kernel (e.g., -EFAULT) and report
> the error code back to caller? Maybe like this,
> 
> static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> {
>      ....
>      msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
> 
>      ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
>      if (ret < 0) {
>          error_setg_errno(errp, -ret, "failed to get MSI-X IRQ INFO");
>          return;
>      } else {
>          vdev->msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
>      }
>      ...
>      trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
>                                  msix->table_offset, msix->entries, vdev->msix->noresize);

In the trace event, please ouput irq_info.flags since it gives more
information on the value returned by the kernel.

>      ....
> }
> 
>> I'd expect this to happen in vfio_msix_early_setup() though, especially
>> since that's where the remainder of VFIOMSIXInfo is setup.
> 
>>
>>> +    } else {
>>> +        vdev->msix->irq_info_flags = irq_info.flags;
>>> +    }
>>> +    trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
>>> +                                         vdev->msix->irq_info_flags);
>>> +
>>>       return 0;
>>>   }
>>>
>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index
>>> a2771b9ff3cc..ad34ec56d0ae 100644
>>> --- a/hw/vfio/pci.h
>>> +++ b/hw/vfio/pci.h
>>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
>>>       uint32_t table_offset;
>>>       uint32_t pba_offset;
>>>       unsigned long *pending;
>>> +    uint32_t irq_info_flags;
>>
>> Why not simply pull out a "noresize" bool?  Thanks,
>>
> Will change to a bool type.

I would simply cache the KVM flags value under VFIOMSIXInfo as you
did and add an helper. Both work the same but the intial proposal
keeps more information. This is minor.

Thanks,

C.
  
> 
> Thanks,
> Jing
> 
>> Alex
>>
>>>   } VFIOMSIXInfo;
>>>
>>>   #define TYPE_VFIO_PCI "vfio-pci"
>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index
>>> ee7509e68e4f..7d4a398f044d 100644
>>> --- a/hw/vfio/trace-events
>>> +++ b/hw/vfio/trace-events
>>> @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int
>>> len, int val) " (%s, @0x%x,  vfio_pci_write_config(const char *name, int addr, int
>> val, int len) " (%s, @0x%x, 0x%x, len=0x%x)"
>>>   vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
>>>   vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int
>> entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
>>> +vfio_msix_setup_get_irq_info_failure(const char *errstr)
>> "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
>>> +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X
>> irq info flags 0x%x"
>>>   vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
>>>   vfio_check_pm_reset(const char *name) "%s Supports PM reset"
>>>   vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
>
Jing Liu July 28, 2023, 8:34 a.m. UTC | #5
Hi C.,

Thanks very much for reviewing the patches.

> On July 28, 2023 12:58 AM, Cédric Le Goater <clg@redhat.com> wrote:
> 
> Hello Jing,
> 
> On 7/27/23 09:24, Jing Liu wrote:
> > From: Reinette Chatre <reinette.chatre@intel.com>
> >
> > Kernel provides the guidance of dynamic MSI-X allocation support of
> > passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
> > guide user space.
> >
> > Fetch and store the flags from host for later use to determine if
> > specific flags are set.
> >
> > Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
> > Signed-off-by: Jing Liu <jing2.liu@intel.com>
> > ---
> >   hw/vfio/pci.c        | 12 ++++++++++++
> >   hw/vfio/pci.h        |  1 +
> >   hw/vfio/trace-events |  2 ++
> >   3 files changed, 15 insertions(+)
> >
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index
> > a205c6b1130f..0c4ac0873d40 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice
> > *vdev, Error **errp)
> >
> >   static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> >   {
> > +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
> >       int ret;
> >       Error *err = NULL;
> >
> > @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int
> pos, Error **errp)
> >           memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
> >       }
> >
> > +    irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
> > +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> > +    if (ret) {
> > +        /* This can fail for an old kernel or legacy PCI dev */
> > +        trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));
> 
> Is it possible to detect the error reported by a kernel (< 6.4) when dynamic MSI-X
> are not supported. 

I just realized that ioctl(VFIO_DEVICE_GET_IRQ_INFO) with VFIO_PCI_MSIX_IRQ_INDEX
should always exists and would not cause failure for older kernel reason, after reading
Alex's comment on this patch. (If my understanding is correct)

So the possible failure might be other reason except old kernel or legacy PCI dev.

Looking at vfio_pci_ioctl_get_irq_info() in the kernel, could
> info.flags be tested against VFIO_IRQ_INFO_NORESIZE ?
> 
Sorry I didn't quite understand "info.flags be tested against VFIO_IRQ_INFO_NORESIZE".
I saw kernel < 6.4 simply added NORESIZE to info.flags and latest kernel adds if has_dyn_msix.
Would you please kindly describe more on your point?

> In that case, QEMU should report an error and the trace event is not needed.

I replied an email with new error handling draft code based on my understanding, which 
reports the error and need no trace. Could you please help review if that is what we want?

Thanks,
Jing

> 
> Thanks,
> 
> C.
> 
> > +    } else {
> > +        vdev->msix->irq_info_flags = irq_info.flags;
> > +    }
> > +    trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
> > +                                         vdev->msix->irq_info_flags);
> > +
> >       return 0;
> >   }
> >
> > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index
> > a2771b9ff3cc..ad34ec56d0ae 100644
> > --- a/hw/vfio/pci.h
> > +++ b/hw/vfio/pci.h
> > @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
> >       uint32_t table_offset;
> >       uint32_t pba_offset;
> >       unsigned long *pending;
> > +    uint32_t irq_info_flags;
> >   } VFIOMSIXInfo;
> >
> >   #define TYPE_VFIO_PCI "vfio-pci"
> > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index
> > ee7509e68e4f..7d4a398f044d 100644
> > --- a/hw/vfio/trace-events
> > +++ b/hw/vfio/trace-events
> > @@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int len,
> int val) " (%s, @0x%x,
> >   vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s,
> @0x%x, 0x%x, len=0x%x)"
> >   vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
> >   vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int
> entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
> > +vfio_msix_setup_get_irq_info_failure(const char *errstr)
> "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
> > +vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X
> irq info flags 0x%x"
> >   vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
> >   vfio_check_pm_reset(const char *name) "%s Supports PM reset"
> >   vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
Cédric Le Goater July 28, 2023, 8:43 a.m. UTC | #6
[ ... ]

> Sorry I didn't quite understand "info.flags be tested against VFIO_IRQ_INFO_NORESIZE".
> I saw kernel < 6.4 simply added NORESIZE to info.flags and latest kernel adds if has_dyn_msix.
> Would you please kindly describe more on your point?

I was trying to find the conditions to detect safely that the kernel didn't
have dynamic MSI-X support. Testing VFIO_IRQ_INFO_NORESIZE seems enough.

>> In that case, QEMU should report an error and the trace event is not needed.
> 
> I replied an email with new error handling draft code based on my understanding, which
> reports the error and need no trace. Could you please help review if that is what we want?

yes. It looked good. Please send a v1 !

Thanks,

Cédric.
Alex Williamson July 28, 2023, 3:41 p.m. UTC | #7
On Fri, 28 Jul 2023 10:27:17 +0200
Cédric Le Goater <clg@redhat.com> wrote:

> On 7/28/23 10:09, Liu, Jing2 wrote:
> > Hi Alex,
> > 
> > Thanks very much for reviewing the patches.
> >   
> >> On July 28, 2023 1:25 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> >>
> >> On Thu, 27 Jul 2023 03:24:08 -0400
> >> Jing Liu <jing2.liu@intel.com> wrote:
> >>  
> >>> From: Reinette Chatre <reinette.chatre@intel.com>
> >>>
> >>> Kernel provides the guidance of dynamic MSI-X allocation support of
> >>> passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
> >>> guide user space.
> >>>
> >>> Fetch and store the flags from host for later use to determine if
> >>> specific flags are set.
> >>>
> >>> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
> >>> Signed-off-by: Jing Liu <jing2.liu@intel.com>
> >>> ---
> >>>   hw/vfio/pci.c        | 12 ++++++++++++
> >>>   hw/vfio/pci.h        |  1 +
> >>>   hw/vfio/trace-events |  2 ++
> >>>   3 files changed, 15 insertions(+)
> >>>
> >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index
> >>> a205c6b1130f..0c4ac0873d40 100644
> >>> --- a/hw/vfio/pci.c
> >>> +++ b/hw/vfio/pci.c
> >>> @@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice
> >>> *vdev, Error **errp)
> >>>
> >>>   static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error
> >>> **errp)  {
> >>> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
> >>>       int ret;
> >>>       Error *err = NULL;
> >>>
> >>> @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int  
> >> pos, Error **errp)  
> >>>           memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
> >>>       }
> >>>
> >>> +    irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
> >>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> >>> +    if (ret) {
> >>> +        /* This can fail for an old kernel or legacy PCI dev */
> >>> +        trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));  
> >>
> >> We only call vfio_msix_setup() if the device has an MSI-X capability, so the
> >> "legacy PCI" portion of this comment seems unjustified.
> >> Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also question the
> >> "old kernel" part of this comment.  
> > 
> > Oh, yes, I just realize that only VFIO_PCI_ERR_IRQ_INDEX and
> > VFIO_PCI_REQ_IRQ_INDEX were added later in include/uapi/linux/vfio.h. Thus,
> > this ioctl() with MSIX index would not fail by the old-kernel or legacy-PCI reason.
> > Thanks for pointing out this to me.
> > 
> > We don't currently sanity test the device  
> >> exposed MSI-X info versus that reported by GET_IRQ_INFO, but it seems valid to
> >> do so.  
> > 
> > Do we want to keep the check of possible failure from kernel (e.g., -EFAULT) and report
> > the error code back to caller? Maybe like this,
> > 
> > static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> > {
> >      ....
> >      msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
> > 
> >      ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> >      if (ret < 0) {
> >          error_setg_errno(errp, -ret, "failed to get MSI-X IRQ INFO");
> >          return;

Yes.

> >      } else {
> >          vdev->msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
> >      }

No 'else' required here since the error branch returns.

> >      ...
> >      trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
> >                                  msix->table_offset, msix->entries, vdev->msix->noresize);  
> 
> In the trace event, please ouput irq_info.flags since it gives more
> information on the value returned by the kernel.
> 
> >      ....
> > }
> >   
> >> I'd expect this to happen in vfio_msix_early_setup() though, especially
> >> since that's where the remainder of VFIOMSIXInfo is setup.  
> >   
> >>  
> >>> +    } else {
> >>> +        vdev->msix->irq_info_flags = irq_info.flags;
> >>> +    }
> >>> +    trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
> >>> +                                         vdev->msix->irq_info_flags);
> >>> +
> >>>       return 0;
> >>>   }
> >>>
> >>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index
> >>> a2771b9ff3cc..ad34ec56d0ae 100644
> >>> --- a/hw/vfio/pci.h
> >>> +++ b/hw/vfio/pci.h
> >>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
> >>>       uint32_t table_offset;
> >>>       uint32_t pba_offset;
> >>>       unsigned long *pending;
> >>> +    uint32_t irq_info_flags;  
> >>
> >> Why not simply pull out a "noresize" bool?  Thanks,
> >>  
> > Will change to a bool type.  
> 
> I would simply cache the KVM flags value under VFIOMSIXInfo as you
> did and add an helper. Both work the same but the intial proposal
> keeps more information. This is minor.

TBH, I'd still prefer that we only store the one field we care about
and avoid an extra helper, regardless of how simple it might be.  Even
if we eventually add masking for MSI-X, we can store it in less space
and more accessibly decoded in the VFIOMSIXInfo struct vs helpers to
access a cached flags value.  Thanks,

Alex
Cédric Le Goater July 28, 2023, 3:51 p.m. UTC | #8
[ ... ]

>>>>> --- a/hw/vfio/pci.h
>>>>> +++ b/hw/vfio/pci.h
>>>>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
>>>>>        uint32_t table_offset;
>>>>>        uint32_t pba_offset;
>>>>>        unsigned long *pending;
>>>>> +    uint32_t irq_info_flags;
>>>>
>>>> Why not simply pull out a "noresize" bool?  Thanks,
>>>>   
>>> Will change to a bool type.
>>
>> I would simply cache the KVM flags value under VFIOMSIXInfo as you
>> did and add an helper. Both work the same but the intial proposal
>> keeps more information. This is minor.
> 
> TBH, I'd still prefer that we only store the one field we care about
> and avoid an extra helper, regardless of how simple it might be.  Even
> if we eventually add masking for MSI-X, we can store it in less space
> and more accessibly decoded in the VFIOMSIXInfo struct vs helpers to
> access a cached flags value.  Thanks,

Let's use a bool then. np.

Thanks,

C.
Jing Liu July 31, 2023, 3:51 a.m. UTC | #9
Hi Alex,

> On July 28, 2023 11:41 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Fri, 28 Jul 2023 10:27:17 +0200
> Cédric Le Goater <clg@redhat.com> wrote:
> 
> > On 7/28/23 10:09, Liu, Jing2 wrote:
> > > Hi Alex,
> > >
> > > Thanks very much for reviewing the patches.
> > >
> > >> On July 28, 2023 1:25 AM, Alex Williamson
> <alex.williamson@redhat.com> wrote:
> > >>
> > >> On Thu, 27 Jul 2023 03:24:08 -0400
> > >> Jing Liu <jing2.liu@intel.com> wrote:
> > >>
> > >>> From: Reinette Chatre <reinette.chatre@intel.com>
> > >>>
> > >>> Kernel provides the guidance of dynamic MSI-X allocation support
> > >>> of passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag
> > >>> to guide user space.
> > >>>
> > >>> Fetch and store the flags from host for later use to determine if
> > >>> specific flags are set.
> > >>>
> > >>> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
> > >>> Signed-off-by: Jing Liu <jing2.liu@intel.com>
> > >>> ---
> > >>>   hw/vfio/pci.c        | 12 ++++++++++++
> > >>>   hw/vfio/pci.h        |  1 +
> > >>>   hw/vfio/trace-events |  2 ++
> > >>>   3 files changed, 15 insertions(+)
> > >>>
> > >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index
> > >>> a205c6b1130f..0c4ac0873d40 100644
> > >>> --- a/hw/vfio/pci.c
> > >>> +++ b/hw/vfio/pci.c
> > >>> @@ -1572,6 +1572,7 @@ static void
> > >>> vfio_msix_early_setup(VFIOPCIDevice
> > >>> *vdev, Error **errp)
> > >>>
> > >>>   static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error
> > >>> **errp)  {
> > >>> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info)
> > >>> + };
> > >>>       int ret;
> > >>>       Error *err = NULL;
> > >>>
> > >>> @@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice
> > >>> *vdev, int
> > >> pos, Error **errp)
> > >>>           memory_region_set_enabled(&vdev->pdev.msix_table_mmio,
> false);
> > >>>       }
> > >>>
> > >>> +    irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
> > >>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO,
> &irq_info);
> > >>> +    if (ret) {
> > >>> +        /* This can fail for an old kernel or legacy PCI dev */
> > >>> +
> > >>> + trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));
> > >>
> > >> We only call vfio_msix_setup() if the device has an MSI-X
> > >> capability, so the "legacy PCI" portion of this comment seems unjustified.
> > >> Otherwise the GET_IRQ_INFO ioctl has always existed, so I'd also
> > >> question the "old kernel" part of this comment.
> > >
> > > Oh, yes, I just realize that only VFIO_PCI_ERR_IRQ_INDEX and
> > > VFIO_PCI_REQ_IRQ_INDEX were added later in
> > > include/uapi/linux/vfio.h. Thus, this ioctl() with MSIX index would not fail by
> the old-kernel or legacy-PCI reason.
> > > Thanks for pointing out this to me.
> > >
> > > We don't currently sanity test the device
> > >> exposed MSI-X info versus that reported by GET_IRQ_INFO, but it
> > >> seems valid to do so.
> > >
> > > Do we want to keep the check of possible failure from kernel (e.g.,
> > > -EFAULT) and report the error code back to caller? Maybe like this,
> > >
> > > static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> > > {
> > >      ....
> > >      msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
> > >
> > >      ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> > >      if (ret < 0) {
> > >          error_setg_errno(errp, -ret, "failed to get MSI-X IRQ INFO");
> > >          return;
> 
> Yes.
> 
> > >      } else {
> > >          vdev->msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
> > >      }
> 
> No 'else' required here since the error branch returns.

Oh, yes. Will remove the "else" and just set the ”noresize“ value in next version.

> 
> > >      ...
> > >      trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix-
> >table_bar,
> > >                                  msix->table_offset, msix->entries,
> > > vdev->msix->noresize);
> >
> > In the trace event, please ouput irq_info.flags since it gives more
> > information on the value returned by the kernel.
> >
> > >      ....
> > > }
> > >
> > >> I'd expect this to happen in vfio_msix_early_setup() though,
> > >> especially since that's where the remainder of VFIOMSIXInfo is setup.
> > >
> > >>
> > >>> +    } else {
> > >>> +        vdev->msix->irq_info_flags = irq_info.flags;
> > >>> +    }
> > >>> +    trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
> > >>> +
> > >>> + vdev->msix->irq_info_flags);
> > >>> +
> > >>>       return 0;
> > >>>   }
> > >>>
> > >>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index
> > >>> a2771b9ff3cc..ad34ec56d0ae 100644
> > >>> --- a/hw/vfio/pci.h
> > >>> +++ b/hw/vfio/pci.h
> > >>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
> > >>>       uint32_t table_offset;
> > >>>       uint32_t pba_offset;
> > >>>       unsigned long *pending;
> > >>> +    uint32_t irq_info_flags;
> > >>
> > >> Why not simply pull out a "noresize" bool?  Thanks,
> > >>
> > > Will change to a bool type.
> >
> > I would simply cache the KVM flags value under VFIOMSIXInfo as you did
> > and add an helper. Both work the same but the intial proposal keeps
> > more information. This is minor.
> 
> TBH, I'd still prefer that we only store the one field we care about and avoid an
> extra helper, regardless of how simple it might be.  Even if we eventually add
> masking for MSI-X, we can store it in less space and more accessibly decoded in
> the VFIOMSIXInfo struct vs helpers to access a cached flags value.  Thanks,
> 

Got it, thanks,

Jing

> Alex
Jing Liu July 31, 2023, 3:57 a.m. UTC | #10
Hi C.

> On July 28, 2023 4:44 PM, Cédric Le Goater <clg@redhat.com> wrote:
> 
> [ ... ]
> 
> > Sorry I didn't quite understand "info.flags be tested against
> VFIO_IRQ_INFO_NORESIZE".
> > I saw kernel < 6.4 simply added NORESIZE to info.flags and latest kernel adds
> if has_dyn_msix.
> > Would you please kindly describe more on your point?
> 
> I was trying to find the conditions to detect safely that the kernel didn't have
> dynamic MSI-X support. Testing VFIO_IRQ_INFO_NORESIZE seems enough.
> 
Oh, I see.

> >> In that case, QEMU should report an error and the trace event is not
> needed.
> >
> > I replied an email with new error handling draft code based on my
> > understanding, which reports the error and need no trace. Could you please
> help review if that is what we want?
> 
> yes. It looked good. Please send a v1 !

Thanks for reviewing that. I guess you mean v2 for next version 😊

Jing
> 
> Thanks,
> 
> Cédric.
Cédric Le Goater July 31, 2023, 7:25 a.m. UTC | #11
On 7/31/23 05:57, Liu, Jing2 wrote:
> Hi C.
> 
>> On July 28, 2023 4:44 PM, Cédric Le Goater <clg@redhat.com> wrote:
>>
>> [ ... ]
>>
>>> Sorry I didn't quite understand "info.flags be tested against
>> VFIO_IRQ_INFO_NORESIZE".
>>> I saw kernel < 6.4 simply added NORESIZE to info.flags and latest kernel adds
>> if has_dyn_msix.
>>> Would you please kindly describe more on your point?
>>
>> I was trying to find the conditions to detect safely that the kernel didn't have
>> dynamic MSI-X support. Testing VFIO_IRQ_INFO_NORESIZE seems enough.
>>
> Oh, I see.
> 
>>>> In that case, QEMU should report an error and the trace event is not
>> needed.
>>>
>>> I replied an email with new error handling draft code based on my
>>> understanding, which reports the error and need no trace. Could you please
>> help review if that is what we want?
>>
>> yes. It looked good. Please send a v1 !
> 
> Thanks for reviewing that. I guess you mean v2 for next version 😊

Well, if you remove the RFC status, I think you should, this would
still be a v1.

Thanks,

C.
Jing Liu July 31, 2023, 8:40 a.m. UTC | #12
Hi C.

> On July 31, 2023 3:26 PM, Cédric Le Goater <clg@redhat.com> wrote:
> 
> On 7/31/23 05:57, Liu, Jing2 wrote:
> > Hi C.
> >
> >> On July 28, 2023 4:44 PM, Cédric Le Goater <clg@redhat.com> wrote:
> >>
> >> [ ... ]
> >>
> >>> Sorry I didn't quite understand "info.flags be tested against
> >> VFIO_IRQ_INFO_NORESIZE".
> >>> I saw kernel < 6.4 simply added NORESIZE to info.flags and latest
> >>> kernel adds
> >> if has_dyn_msix.
> >>> Would you please kindly describe more on your point?
> >>
> >> I was trying to find the conditions to detect safely that the kernel
> >> didn't have dynamic MSI-X support. Testing VFIO_IRQ_INFO_NORESIZE
> seems enough.
> >>
> > Oh, I see.
> >
> >>>> In that case, QEMU should report an error and the trace event is
> >>>> not
> >> needed.
> >>>
> >>> I replied an email with new error handling draft code based on my
> >>> understanding, which reports the error and need no trace. Could you
> >>> please
> >> help review if that is what we want?
> >>
> >> yes. It looked good. Please send a v1 !
> >
> > Thanks for reviewing that. I guess you mean v2 for next version 😊
> 
> Well, if you remove the RFC status, I think you should, this would still be a v1.
> 
Oh, got it. Thanks for telling this.

Jing

> Thanks,
> 
> C.
diff mbox series

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a205c6b1130f..0c4ac0873d40 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1572,6 +1572,7 @@  static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
 
 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
 {
+    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
     int ret;
     Error *err = NULL;
 
@@ -1624,6 +1625,17 @@  static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
         memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
     }
 
+    irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+    if (ret) {
+        /* This can fail for an old kernel or legacy PCI dev */
+        trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));
+    } else {
+        vdev->msix->irq_info_flags = irq_info.flags;
+    }
+    trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
+                                         vdev->msix->irq_info_flags);
+
     return 0;
 }
 
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a2771b9ff3cc..ad34ec56d0ae 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -113,6 +113,7 @@  typedef struct VFIOMSIXInfo {
     uint32_t table_offset;
     uint32_t pba_offset;
     unsigned long *pending;
+    uint32_t irq_info_flags;
 } VFIOMSIXInfo;
 
 #define TYPE_VFIO_PCI "vfio-pci"
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index ee7509e68e4f..7d4a398f044d 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -28,6 +28,8 @@  vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, @0x%x,
 vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, @0x%x, 0x%x, len=0x%x)"
 vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
 vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
+vfio_msix_setup_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
+vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X irq info flags 0x%x"
 vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
 vfio_check_pm_reset(const char *name) "%s Supports PM reset"
 vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"