diff mbox

[v16,13/14] vfio-pci: pass the aer error to guest

Message ID 19b185dd9e150c23d452a4af1db37288e919c9ba.1452564770.git.chen.fan.fnst@cn.fujitsu.com
State New
Headers show

Commit Message

Cao jin Jan. 12, 2016, 2:43 a.m. UTC
From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

when the vfio device encounters an uncorrectable error in host,
the vfio_pci driver will signal the eventfd registered by this
vfio device, the results in the qemu eventfd handler getting
invoked.

this patch is to pass the error to guest and have the guest driver
recover from the error.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 6 deletions(-)

Comments

Marcel Apfelbaum Jan. 18, 2016, 10:45 a.m. UTC | #1
On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> when the vfio device encounters an uncorrectable error in host,
> the vfio_pci driver will signal the eventfd registered by this
> vfio device, the results in the qemu eventfd handler getting

Maybe "the results in" -> resulting in

> invoked.
>
> this patch is to pass the error to guest and have the guest driver
> recover from the error.

Maybe "Pass the error to... and let the ... "

>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++------
>   1 file changed, 47 insertions(+), 6 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index da4815e..efa5e01 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2553,18 +2553,59 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>   static void vfio_err_notifier_handler(void *opaque)
>   {
>       VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIEAERMsg msg = {
> +        .severity = 0,
> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
> +    };
>
>       if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>           return;
>       }
>
>       /*
> -     * TBD. Retrieve the error details and decide what action
> -     * needs to be taken. One of the actions could be to pass
> -     * the error to the guest and have the guest driver recover
> -     * from the error. This requires that PCIe capabilities be
> -     * exposed to the guest. For now, we just terminate the
> -     * guest to contain the error.
> +     * in case the real hardware configration has been changed,

configration -> configuration


> +     * here we should recheck the bus reset capability.
> +     */
> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
> +        vfio_check_host_bus_reset(vdev)) {
> +        goto stop;
> +    }
> +    /*
> +     * we should read the error details from the real hardware
> +     * configuration spaces, here we only need to do is signaling
> +     * to guest an uncorrectable error has occurred.
> +     */
> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
> +        dev->exp.aer_cap) {

Why do we need dev->exp.aer_cap check here? In patch 7/14 we fail the device init
process if this happens, right?

> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +        uint32_t uncor_status;
> +        bool isfatal;
> +
> +        uncor_status = vfio_pci_read_config(dev,
> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> +
> +        /*
> +         * if we receive the error signal but not this device, we can

maybe "if the error is not emitted by this device..."


Thanks,
Marcel

> +         * just ignore it.
> +         */
> +        if (!(uncor_status & ~0UL)) {
> +            return;
> +        }
> +
> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +
> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
> +
> +        pcie_aer_msg(dev, &msg);
> +        return;
> +    }
> +
> +stop:
> +    /*
> +     * If the aer capability is not exposed to the guest. we just
> +     * terminate the guest to contain the error.
>        */
>
>       error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
>
chenfan Jan. 19, 2016, 9:27 a.m. UTC | #2
On 01/18/2016 06:45 PM, Marcel Apfelbaum wrote:
> On 01/12/2016 04:43 AM, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> when the vfio device encounters an uncorrectable error in host,
>> the vfio_pci driver will signal the eventfd registered by this
>> vfio device, the results in the qemu eventfd handler getting
>
> Maybe "the results in" -> resulting in
>
>> invoked.
>>
>> this patch is to pass the error to guest and have the guest driver
>> recover from the error.
>
> Maybe "Pass the error to... and let the ... "
>
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 53 
>> +++++++++++++++++++++++++++++++++++++++++++++++------
>>   1 file changed, 47 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index da4815e..efa5e01 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2553,18 +2553,59 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>>   static void vfio_err_notifier_handler(void *opaque)
>>   {
>>       VFIOPCIDevice *vdev = opaque;
>> +    PCIDevice *dev = &vdev->pdev;
>> +    PCIEAERMsg msg = {
>> +        .severity = 0,
>> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>> +    };
>>
>>       if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>           return;
>>       }
>>
>>       /*
>> -     * TBD. Retrieve the error details and decide what action
>> -     * needs to be taken. One of the actions could be to pass
>> -     * the error to the guest and have the guest driver recover
>> -     * from the error. This requires that PCIe capabilities be
>> -     * exposed to the guest. For now, we just terminate the
>> -     * guest to contain the error.
>> +     * in case the real hardware configration has been changed,
>
> configration -> configuration
>
>
>> +     * here we should recheck the bus reset capability.
>> +     */
>> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
>> +        vfio_check_host_bus_reset(vdev)) {
>> +        goto stop;
>> +    }
>> +    /*
>> +     * we should read the error details from the real hardware
>> +     * configuration spaces, here we only need to do is signaling
>> +     * to guest an uncorrectable error has occurred.
>> +     */
>> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
>> +        dev->exp.aer_cap) {
>
> Why do we need dev->exp.aer_cap check here? In patch 7/14 we fail the 
> device init
> process if this happens, right?

the property FEATURE_ENABLE_AER can't represent the vfio device actually 
has the aer
capability. so here we should check it.

>
>> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>> +        uint32_t uncor_status;
>> +        bool isfatal;
>> +
>> +        uncor_status = vfio_pci_read_config(dev,
>> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
>> +
>> +        /*
>> +         * if we receive the error signal but not this device, we can
>
> maybe "if the error is not emitted by this device..."

thank you for your careful review for my bad english description in the 
patchset,
I will update them in the next version.

Thanks,
Chen

>
>
> Thanks,
> Marcel
>
>> +         * just ignore it.
>> +         */
>> +        if (!(uncor_status & ~0UL)) {
>> +            return;
>> +        }
>> +
>> +        isfatal = uncor_status & pci_get_long(aer_cap + 
>> PCI_ERR_UNCOR_SEVER);
>> +
>> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
>> +
>> +        pcie_aer_msg(dev, &msg);
>> +        return;
>> +    }
>> +
>> +stop:
>> +    /*
>> +     * If the aer capability is not exposed to the guest. we just
>> +     * terminate the guest to contain the error.
>>        */
>>
>>       error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error 
>> detected.  "
>>
>
>
>
> .
>
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index da4815e..efa5e01 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2553,18 +2553,59 @@  static void vfio_put_device(VFIOPCIDevice *vdev)
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
+    PCIDevice *dev = &vdev->pdev;
+    PCIEAERMsg msg = {
+        .severity = 0,
+        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+    };
 
     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
         return;
     }
 
     /*
-     * TBD. Retrieve the error details and decide what action
-     * needs to be taken. One of the actions could be to pass
-     * the error to the guest and have the guest driver recover
-     * from the error. This requires that PCIe capabilities be
-     * exposed to the guest. For now, we just terminate the
-     * guest to contain the error.
+     * in case the real hardware configration has been changed,
+     * here we should recheck the bus reset capability.
+     */
+    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+        vfio_check_host_bus_reset(vdev)) {
+        goto stop;
+    }
+    /*
+     * we should read the error details from the real hardware
+     * configuration spaces, here we only need to do is signaling
+     * to guest an uncorrectable error has occurred.
+     */
+    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+        dev->exp.aer_cap) {
+        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+        uint32_t uncor_status;
+        bool isfatal;
+
+        uncor_status = vfio_pci_read_config(dev,
+                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+
+        /*
+         * if we receive the error signal but not this device, we can
+         * just ignore it.
+         */
+        if (!(uncor_status & ~0UL)) {
+            return;
+        }
+
+        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+
+        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
+                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
+
+        pcie_aer_msg(dev, &msg);
+        return;
+    }
+
+stop:
+    /*
+     * If the aer capability is not exposed to the guest. we just
+     * terminate the guest to contain the error.
      */
 
     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "