diff mbox

[v2,3/3] vfio-pci: process non fatal error of AER

Message ID 1490179012-14990-4-git-send-email-caoj.fnst@cn.fujitsu.com
State New
Headers show

Commit Message

Cao jin March 22, 2017, 10:36 a.m. UTC
Make use of the non fatal error eventfd that the kernel module provide
to process the AER non fatal error. Fatal error still goes into the
legacy way which results in VM stop.

Register the handler, wait for notification. Construct aer message and
pass it to root port on notification. Root port will trigger an interrupt
to signal guest, then guest driver will do the recovery.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c              | 247 +++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/pci.h              |   4 +
 linux-headers/linux/vfio.h |   2 +
 3 files changed, 253 insertions(+)

Comments

Michael S. Tsirkin March 22, 2017, 1:27 p.m. UTC | #1
On Wed, Mar 22, 2017 at 06:36:52PM +0800, Cao jin wrote:
> Make use of the non fatal error eventfd that the kernel module provide
> to process the AER non fatal error. Fatal error still goes into the
> legacy way which results in VM stop.
> 
> Register the handler, wait for notification. Construct aer message and
> pass it to root port on notification. Root port will trigger an interrupt
> to signal guest, then guest driver will do the recovery.
> 
> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c              | 247 +++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/pci.h              |   4 +
>  linux-headers/linux/vfio.h |   2 +
>  3 files changed, 253 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 3d0d005..4912bc6 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2422,6 +2422,34 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
>                       "Could not enable error recovery for the device",
>                       vbasedev->name);
>      }
> +
> +    irq_info.index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
> +    irq_info.count = 0; /* clear */
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> +    if (ret) {
> +        /* This can fail for an old kernel or legacy PCI dev */
> +        trace_vfio_populate_device_get_irq_info_failure();
> +    } else if (irq_info.count == 1) {
> +        vdev->pci_aer_non_fatal = true;
> +    } else {
> +        error_report(WARN_PREFIX
> +                     "Couldn't enable non fatal error recovery for the device",
> +                     vbasedev->name);

when does this trigger?

> +    }
> +
> +    irq_info.index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
> +    irq_info.count = 0; /* clear */
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> +    if (ret) {
> +        /* This can fail for an old kernel or legacy PCI dev */
> +        trace_vfio_populate_device_get_irq_info_failure();
> +    } else if (irq_info.count == 1) {
> +        vdev->passive_reset = true;
> +    } else {
> +        error_report(WARN_PREFIX
> +                     "Don't support passive reset notification",
> +                     vbasedev->name);

when does this happen?
what does this message mean?

> +    }
>  }
>  
>  static void vfio_put_device(VFIOPCIDevice *vdev)
> @@ -2432,6 +2460,221 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>      vfio_put_base_device(&vdev->vbasedev);
>  }
>  
> +static void vfio_non_fatal_err_notifier_handler(void *opaque)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIEAERMsg msg = {
> +        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
> +    };
> +

Should this just use pci_requester_id?


At least Peter thought so.

> +    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
> +        return;
> +    }
> +
> +    /* Populate the aer msg and send it to root port */
> +    if (dev->exp.aer_cap) {
> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +        uint32_t uncor_status;
> +        bool isfatal;
> +
> +        uncor_status = vfio_pci_read_config(dev,
> +                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> +        if (!uncor_status) {
> +            return;
> +        }
> +
> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +        if (isfatal) {
> +            goto stop;
> +        }
> +
> +        error_report("%s sending non fatal event to root port. uncor status = "
> +                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
> +        pcie_aer_msg(dev, &msg);
> +        return;
> +    }
> +
> +stop:
> +    /* Terminate the guest in case of fatal error */
> +    error_report("%s(%s) fatal error detected. Please collect any data"
> +            " possible and then kill the guest", __func__, vdev->vbasedev.name);

"Device detected a fatal error. VM stopped".

would be better IMO.


> +    vm_stop(RUN_STATE_INTERNAL_ERROR);
> +}
> +
> +/*
> + * Register non fatal error notifier for devices supporting error recovery.
> + * If we encounter a failure in this function, we report an error
> + * and continue after disabling error recovery support for the device.
> + */
> +static void vfio_register_non_fatal_err_notifier(VFIOPCIDevice *vdev)
> +{
> +    int ret;
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +
> +    if (!vdev->pci_aer_non_fatal) {
> +        return;
> +    }
> +
> +    if (event_notifier_init(&vdev->non_fatal_err_notifier, 0)) {
> +        error_report("vfio: Unable to init event notifier for non-fatal error detection");
> +        vdev->pci_aer_non_fatal = false;
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = event_notifier_get_fd(&vdev->non_fatal_err_notifier);
> +    qemu_set_fd_handler(*pfd, vfio_non_fatal_err_notifier_handler, NULL, vdev);
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to set up non-fatal error notification");
> +        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
> +        event_notifier_cleanup(&vdev->non_fatal_err_notifier);
> +        vdev->pci_aer_non_fatal = false;
> +    }
> +    g_free(irq_set);
> +}
> +
> +static void vfio_unregister_non_fatal_err_notifier(VFIOPCIDevice *vdev)
> +{
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +    int ret;
> +
> +    if (!vdev->pci_aer_non_fatal) {
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +    *pfd = -1;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to de-assign error fd: %m");
> +    }
> +    g_free(irq_set);
> +    qemu_set_fd_handler(event_notifier_get_fd(&vdev->non_fatal_err_notifier),
> +                        NULL, NULL, vdev);
> +    event_notifier_cleanup(&vdev->non_fatal_err_notifier);
> +}
> +
> +static void vfio_passive_reset_notifier_handler(void *opaque)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +
> +    if (!event_notifier_test_and_clear(&vdev->passive_reset_notifier)) {
> +        return;
> +    }
> +
> +    error_report("device %s is forced to be reset. Please collect any data"
> +            " possible and then kill the guest", vdev->vbasedev.name);

I think we should just tell what happened in detail. Don't tell user what to do:

"Device lost state due to host device reset. VM stopped".


> +    vm_stop(RUN_STATE_INTERNAL_ERROR);
> +}
> +
> +/*
> + * Register passive reset notifier, in case of certain function of a
> + * multifunction device is passthroughed,  while other functions are still
> + * controlled by device driver.
> + */
> +static void vfio_register_passive_reset_notifier(VFIOPCIDevice *vdev)
> +{
> +    int ret;
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +
> +    error_report("Registering event notifier for passive reset");
> +    if (!vdev->passive_reset) {
> +        return;
> +    }
> +
> +    if (event_notifier_init(&vdev->passive_reset_notifier, 0)) {
> +        error_report("vfio: Unable to init event notifier for passive reset");
> +        vdev->passive_reset = false;
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = event_notifier_get_fd(&vdev->passive_reset_notifier);
> +    qemu_set_fd_handler(*pfd, vfio_passive_reset_notifier_handler, NULL, vdev);
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to set up passive reset notification");
> +        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
> +        event_notifier_cleanup(&vdev->passive_reset_notifier);
> +        vdev->passive_reset = false;
> +    }
> +    g_free(irq_set);
> +}
> +
> +static void vfio_unregister_passive_reset_notifier(VFIOPCIDevice *vdev)
> +{
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +    int ret;
> +
> +    if (!vdev->passive_reset) {
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +    *pfd = -1;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to de-assign error fd: %m");
> +    }
> +    g_free(irq_set);
> +    qemu_set_fd_handler(event_notifier_get_fd(&vdev->passive_reset_notifier),
> +                        NULL, NULL, vdev);
> +    event_notifier_cleanup(&vdev->passive_reset_notifier);
> +}
> +
>  static void vfio_err_notifier_handler(void *opaque)
>  {
>      VFIOPCIDevice *vdev = opaque;
> @@ -2860,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>          }
>      }
>  
> +    vfio_register_passive_reset_notifier(vdev);
> +    vfio_register_non_fatal_err_notifier(vdev);
>      vfio_register_err_notifier(vdev);
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
> @@ -2900,6 +3145,8 @@ static void vfio_exitfn(PCIDevice *pdev)
>  
>      vfio_unregister_req_notifier(vdev);
>      vfio_unregister_err_notifier(vdev);
> +    vfio_unregister_non_fatal_err_notifier(vdev);
> +    vfio_unregister_passive_reset_notifier(vdev);
>      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
>      vfio_disable_interrupts(vdev);
>      if (vdev->intx.mmap_timer) {
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 34e8b04..dcb0e0a 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -119,6 +119,8 @@ typedef struct VFIOPCIDevice {
>      void *igd_opregion;
>      PCIHostDeviceAddress host;
>      EventNotifier err_notifier;
> +    EventNotifier non_fatal_err_notifier;
> +    EventNotifier passive_reset_notifier;
>      EventNotifier req_notifier;
>      int (*resetfn)(struct VFIOPCIDevice *);
>      uint32_t vendor_id;
> @@ -137,6 +139,8 @@ typedef struct VFIOPCIDevice {
>      uint32_t igd_gms;
>      uint8_t pm_cap;
>      bool pci_aer;
> +    bool pci_aer_non_fatal;
> +    bool passive_reset;
>      bool req_enabled;
>      bool has_flr;
>      bool has_pm_reset;

Why do you need these flags? Why isn't the notifier sufficient?

> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 759b850..726ddbe 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -433,6 +433,8 @@ enum {
>  	VFIO_PCI_MSIX_IRQ_INDEX,
>  	VFIO_PCI_ERR_IRQ_INDEX,
>  	VFIO_PCI_REQ_IRQ_INDEX,
> +	VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX,
> +	VFIO_PCI_PASSIVE_RESET_IRQ_INDEX,
>  	VFIO_PCI_NUM_IRQS
>  };
>  
> -- 
> 1.8.3.1
> 
>
Cao jin March 23, 2017, 2:58 a.m. UTC | #2
On 03/22/2017 09:27 PM, Michael S. Tsirkin wrote:
> On Wed, Mar 22, 2017 at 06:36:52PM +0800, Cao jin wrote:
>> Make use of the non fatal error eventfd that the kernel module provide
>> to process the AER non fatal error. Fatal error still goes into the
>> legacy way which results in VM stop.
>>
>> Register the handler, wait for notification. Construct aer message and
>> pass it to root port on notification. Root port will trigger an interrupt
>> to signal guest, then guest driver will do the recovery.
>>
>> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
>> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
>> ---
>>  hw/vfio/pci.c              | 247 +++++++++++++++++++++++++++++++++++++++++++++
>>  hw/vfio/pci.h              |   4 +
>>  linux-headers/linux/vfio.h |   2 +
>>  3 files changed, 253 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 3d0d005..4912bc6 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2422,6 +2422,34 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
>>                       "Could not enable error recovery for the device",
>>                       vbasedev->name);
>>      }
>> +
>> +    irq_info.index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
>> +    irq_info.count = 0; /* clear */
>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
>> +    if (ret) {
>> +        /* This can fail for an old kernel or legacy PCI dev */
>> +        trace_vfio_populate_device_get_irq_info_failure();
>> +    } else if (irq_info.count == 1) {
>> +        vdev->pci_aer_non_fatal = true;
>> +    } else {
>> +        error_report(WARN_PREFIX
>> +                     "Couldn't enable non fatal error recovery for the device",
>> +                     vbasedev->name);
> 
> when does this trigger?
> 
>> +    }
>> +
>> +    irq_info.index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
>> +    irq_info.count = 0; /* clear */
>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
>> +    if (ret) {
>> +        /* This can fail for an old kernel or legacy PCI dev */
>> +        trace_vfio_populate_device_get_irq_info_failure();
>> +    } else if (irq_info.count == 1) {
>> +        vdev->passive_reset = true;
>> +    } else {
>> +        error_report(WARN_PREFIX
>> +                     "Don't support passive reset notification",
>> +                     vbasedev->name);
> 
> when does this happen?
> what does this message mean?
> 

Both are boilerplate code as err_notifier. They will be triggered by
running latest QEMU on older kernel.  Will drop these code & the flags

>> +    }
>>  }
>>  
>>  static void vfio_put_device(VFIOPCIDevice *vdev)
>> @@ -2432,6 +2460,221 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>>      vfio_put_base_device(&vdev->vbasedev);
>>  }
>>  
>> +static void vfio_non_fatal_err_notifier_handler(void *opaque)
>> +{
>> +    VFIOPCIDevice *vdev = opaque;
>> +    PCIDevice *dev = &vdev->pdev;
>> +    PCIEAERMsg msg = {
>> +        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
>> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>> +    };
>> +
> 
> Should this just use pci_requester_id?
> 
> 
> At least Peter thought so.
> 
>> +    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
>> +        return;
>> +    }
>> +
>> +    /* Populate the aer msg and send it to root port */
>> +    if (dev->exp.aer_cap) {
>> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>> +        uint32_t uncor_status;
>> +        bool isfatal;
>> +
>> +        uncor_status = vfio_pci_read_config(dev,
>> +                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
>> +        if (!uncor_status) {
>> +            return;
>> +        }
>> +
>> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
>> +        if (isfatal) {
>> +            goto stop;
>> +        }
>> +
>> +        error_report("%s sending non fatal event to root port. uncor status = "
>> +                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
>> +        pcie_aer_msg(dev, &msg);
>> +        return;
>> +    }
>> +
>> +stop:
>> +    /* Terminate the guest in case of fatal error */
>> +    error_report("%s(%s) fatal error detected. Please collect any data"
>> +            " possible and then kill the guest", __func__, vdev->vbasedev.name);
> 
> "Device detected a fatal error. VM stopped".
> 
> would be better IMO.
> 

Yes, user has no way to collect any data.

>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 34e8b04..dcb0e0a 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -119,6 +119,8 @@ typedef struct VFIOPCIDevice {
>>      void *igd_opregion;
>>      PCIHostDeviceAddress host;
>>      EventNotifier err_notifier;
>> +    EventNotifier non_fatal_err_notifier;
>> +    EventNotifier passive_reset_notifier;
>>      EventNotifier req_notifier;
>>      int (*resetfn)(struct VFIOPCIDevice *);
>>      uint32_t vendor_id;
>> @@ -137,6 +139,8 @@ typedef struct VFIOPCIDevice {
>>      uint32_t igd_gms;
>>      uint8_t pm_cap;
>>      bool pci_aer;
>> +    bool pci_aer_non_fatal;
>> +    bool passive_reset;
>>      bool req_enabled;
>>      bool has_flr;
>>      bool has_pm_reset;
> 
> Why do you need these flags? Why isn't the notifier sufficient?
> 

sounds good. Will remove pci_aer also
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 3d0d005..4912bc6 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2422,6 +2422,34 @@  static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
                      "Could not enable error recovery for the device",
                      vbasedev->name);
     }
+
+    irq_info.index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
+    irq_info.count = 0; /* clear */
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+    if (ret) {
+        /* This can fail for an old kernel or legacy PCI dev */
+        trace_vfio_populate_device_get_irq_info_failure();
+    } else if (irq_info.count == 1) {
+        vdev->pci_aer_non_fatal = true;
+    } else {
+        error_report(WARN_PREFIX
+                     "Couldn't enable non fatal error recovery for the device",
+                     vbasedev->name);
+    }
+
+    irq_info.index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
+    irq_info.count = 0; /* clear */
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+    if (ret) {
+        /* This can fail for an old kernel or legacy PCI dev */
+        trace_vfio_populate_device_get_irq_info_failure();
+    } else if (irq_info.count == 1) {
+        vdev->passive_reset = true;
+    } else {
+        error_report(WARN_PREFIX
+                     "Don't support passive reset notification",
+                     vbasedev->name);
+    }
 }
 
 static void vfio_put_device(VFIOPCIDevice *vdev)
@@ -2432,6 +2460,221 @@  static void vfio_put_device(VFIOPCIDevice *vdev)
     vfio_put_base_device(&vdev->vbasedev);
 }
 
+static void vfio_non_fatal_err_notifier_handler(void *opaque)
+{
+    VFIOPCIDevice *vdev = opaque;
+    PCIDevice *dev = &vdev->pdev;
+    PCIEAERMsg msg = {
+        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
+        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+    };
+
+    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
+        return;
+    }
+
+    /* Populate the aer msg and send it to root port */
+    if (dev->exp.aer_cap) {
+        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+        uint32_t uncor_status;
+        bool isfatal;
+
+        uncor_status = vfio_pci_read_config(dev,
+                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+        if (!uncor_status) {
+            return;
+        }
+
+        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+        if (isfatal) {
+            goto stop;
+        }
+
+        error_report("%s sending non fatal event to root port. uncor status = "
+                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
+        pcie_aer_msg(dev, &msg);
+        return;
+    }
+
+stop:
+    /* Terminate the guest in case of fatal error */
+    error_report("%s(%s) fatal error detected. Please collect any data"
+            " possible and then kill the guest", __func__, vdev->vbasedev.name);
+    vm_stop(RUN_STATE_INTERNAL_ERROR);
+}
+
+/*
+ * Register non fatal error notifier for devices supporting error recovery.
+ * If we encounter a failure in this function, we report an error
+ * and continue after disabling error recovery support for the device.
+ */
+static void vfio_register_non_fatal_err_notifier(VFIOPCIDevice *vdev)
+{
+    int ret;
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+
+    if (!vdev->pci_aer_non_fatal) {
+        return;
+    }
+
+    if (event_notifier_init(&vdev->non_fatal_err_notifier, 0)) {
+        error_report("vfio: Unable to init event notifier for non-fatal error detection");
+        vdev->pci_aer_non_fatal = false;
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = event_notifier_get_fd(&vdev->non_fatal_err_notifier);
+    qemu_set_fd_handler(*pfd, vfio_non_fatal_err_notifier_handler, NULL, vdev);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to set up non-fatal error notification");
+        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
+        event_notifier_cleanup(&vdev->non_fatal_err_notifier);
+        vdev->pci_aer_non_fatal = false;
+    }
+    g_free(irq_set);
+}
+
+static void vfio_unregister_non_fatal_err_notifier(VFIOPCIDevice *vdev)
+{
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret;
+
+    if (!vdev->pci_aer_non_fatal) {
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = -1;
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to de-assign error fd: %m");
+    }
+    g_free(irq_set);
+    qemu_set_fd_handler(event_notifier_get_fd(&vdev->non_fatal_err_notifier),
+                        NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->non_fatal_err_notifier);
+}
+
+static void vfio_passive_reset_notifier_handler(void *opaque)
+{
+    VFIOPCIDevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->passive_reset_notifier)) {
+        return;
+    }
+
+    error_report("device %s is forced to be reset. Please collect any data"
+            " possible and then kill the guest", vdev->vbasedev.name);
+    vm_stop(RUN_STATE_INTERNAL_ERROR);
+}
+
+/*
+ * Register passive reset notifier, in case of certain function of a
+ * multifunction device is passthroughed,  while other functions are still
+ * controlled by device driver.
+ */
+static void vfio_register_passive_reset_notifier(VFIOPCIDevice *vdev)
+{
+    int ret;
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+
+    error_report("Registering event notifier for passive reset");
+    if (!vdev->passive_reset) {
+        return;
+    }
+
+    if (event_notifier_init(&vdev->passive_reset_notifier, 0)) {
+        error_report("vfio: Unable to init event notifier for passive reset");
+        vdev->passive_reset = false;
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = event_notifier_get_fd(&vdev->passive_reset_notifier);
+    qemu_set_fd_handler(*pfd, vfio_passive_reset_notifier_handler, NULL, vdev);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to set up passive reset notification");
+        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
+        event_notifier_cleanup(&vdev->passive_reset_notifier);
+        vdev->passive_reset = false;
+    }
+    g_free(irq_set);
+}
+
+static void vfio_unregister_passive_reset_notifier(VFIOPCIDevice *vdev)
+{
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret;
+
+    if (!vdev->passive_reset) {
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = -1;
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to de-assign error fd: %m");
+    }
+    g_free(irq_set);
+    qemu_set_fd_handler(event_notifier_get_fd(&vdev->passive_reset_notifier),
+                        NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->passive_reset_notifier);
+}
+
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
@@ -2860,6 +3103,8 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
         }
     }
 
+    vfio_register_passive_reset_notifier(vdev);
+    vfio_register_non_fatal_err_notifier(vdev);
     vfio_register_err_notifier(vdev);
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
@@ -2900,6 +3145,8 @@  static void vfio_exitfn(PCIDevice *pdev)
 
     vfio_unregister_req_notifier(vdev);
     vfio_unregister_err_notifier(vdev);
+    vfio_unregister_non_fatal_err_notifier(vdev);
+    vfio_unregister_passive_reset_notifier(vdev);
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
     vfio_disable_interrupts(vdev);
     if (vdev->intx.mmap_timer) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 34e8b04..dcb0e0a 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -119,6 +119,8 @@  typedef struct VFIOPCIDevice {
     void *igd_opregion;
     PCIHostDeviceAddress host;
     EventNotifier err_notifier;
+    EventNotifier non_fatal_err_notifier;
+    EventNotifier passive_reset_notifier;
     EventNotifier req_notifier;
     int (*resetfn)(struct VFIOPCIDevice *);
     uint32_t vendor_id;
@@ -137,6 +139,8 @@  typedef struct VFIOPCIDevice {
     uint32_t igd_gms;
     uint8_t pm_cap;
     bool pci_aer;
+    bool pci_aer_non_fatal;
+    bool passive_reset;
     bool req_enabled;
     bool has_flr;
     bool has_pm_reset;
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 759b850..726ddbe 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -433,6 +433,8 @@  enum {
 	VFIO_PCI_MSIX_IRQ_INDEX,
 	VFIO_PCI_ERR_IRQ_INDEX,
 	VFIO_PCI_REQ_IRQ_INDEX,
+	VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX,
+	VFIO_PCI_PASSIVE_RESET_IRQ_INDEX,
 	VFIO_PCI_NUM_IRQS
 };