@@ -115,7 +115,7 @@ The actual steps taken by a platform to recover from a PCI error
event will be platform-dependent, but will follow the general
sequence described below.
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
-------------------
A PCI bus error is detected by the PCI hardware. On powerpc, the slot
is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -160,10 +160,10 @@ particular, if the platform doesn't isolate slots), and recovery
proceeds to STEP 2 (MMIO Enable).
If any driver requested a slot reset (by returning PCI_ERS_RESULT_NEED_RESET),
-then recovery proceeds to STEP 4 (Slot Reset).
+then recovery proceeds to STEP 3 (Slot Reset).
If the platform is unable to recover the slot, the next step
-is STEP 6 (Permanent Failure).
+is STEP 5 (Permanent Failure).
.. note::
@@ -198,7 +198,7 @@ reset or some such, but not restart operations. This callback is made if
all drivers on a segment agree that they can try to recover and if no automatic
link reset was performed by the HW. If the platform can't just re-enable IOs
without a slot reset or a link reset, it will not call this callback, and
-instead will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset)
+instead will have gone directly to STEP 3 (Slot Reset)
.. note::
@@ -233,18 +233,12 @@ The driver should return one of the following result codes:
The next step taken depends on the results returned by the drivers.
If all drivers returned PCI_ERS_RESULT_RECOVERED, then the platform
-proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
+proceeds to STEP 4 (Resume Operations).
If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
-proceeds to STEP 4 (Slot Reset)
+proceeds to STEP 3 (Slot Reset)
-STEP 3: Link Reset
-------------------
-The platform resets the link. This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
------------------
In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -322,7 +316,7 @@ PCI card types::
+ pdev->needs_freset = 1;
+
-Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent
+Platform proceeds either to STEP 4 (Resume Operations) or STEP 5 (Permanent
Failure).
.. note::
@@ -332,7 +326,7 @@ Failure).
However, it probably should.
-STEP 5: Resume Operations
+STEP 4: Resume Operations
-------------------------
The platform will call the resume() callback on all affected device
drivers if all drivers on the segment have returned
@@ -344,7 +338,7 @@ a result code.
At this point, if a new error happens, the platform will restart
a new error recovery sequence.
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
-------------------------
A "permanent failure" has occurred, and the platform cannot recover
the device. The platform will call error_detected() with a
@@ -367,6 +361,27 @@ errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
for additional detail on real-life experience of the causes of
software errors.
+STEP 0: Error Event: ERR_FATAL
+--------------------
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+---------------------
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+---------------------
+The platform resets the link. This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+---------------------
+Initiates the re-enumeration.
Conclusion; General Remarks
---------------------------
@@ -206,7 +206,7 @@ error_detected(dev, pci_channel_io_frozen) to all drivers within
a hierarchy in question. Then, performing link reset at upstream is
necessary. As different kinds of devices might use different approaches
to reset link, AER port service driver is required to provide the
-function to reset link via callback parameter of pcie_do_recovery()
+function to reset link via callback parameter of pcie_do_fatal_recovery()
function. If reset_link is not NULL, recovery function will use it
to reset the link. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER
and reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
@@ -555,8 +555,9 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev)
#endif
/* PCI error reporting and recovery */
-pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
- pci_channel_state_t state,
+pci_ers_result_t pcie_do_nonfatal_recovery(struct pci_dev *dev);
+
+pci_ers_result_t pcie_do_fatal_recovery(struct pci_dev *dev,
pci_ers_result_t (*reset_link)(struct pci_dev *pdev));
bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
@@ -947,9 +947,9 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
if (pcie_aer_is_native(dev))
pcie_clear_device_status(dev);
} else if (info->severity == AER_NONFATAL)
- pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
+ pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
- pcie_do_recovery(dev, pci_channel_io_frozen, aer_root_reset);
+ pcie_do_fatal_recovery(dev, aer_root_reset);
pci_dev_put(dev);
}
@@ -985,11 +985,9 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity == AER_NONFATAL)
- pcie_do_recovery(pdev, pci_channel_io_normal,
- aer_root_reset);
+ pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
- pcie_do_recovery(pdev, pci_channel_io_frozen,
- aer_root_reset);
+ pcie_do_fatal_recovery(pdev, aer_root_reset);
pci_dev_put(pdev);
}
}
@@ -233,7 +233,7 @@ static irqreturn_t dpc_handler(int irq, void *context)
dpc_process_error(pdev);
/* We configure DPC so it only triggers on ERR_FATAL */
- pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link);
+ pcie_do_fatal_recovery(pdev, dpc_reset_link);
return IRQ_HANDLED;
}
@@ -183,7 +183,7 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
* or ERR_NONFATAL, since the link is already down, use the FATAL
* error recovery path for both cases.
*/
- estate = pcie_do_recovery(edev, pci_channel_io_frozen, dpc_reset_link);
+ estate = pcie_do_fatal_recovery(edev, dpc_reset_link);
send_ost:
@@ -79,11 +79,6 @@ static int report_error_detected(struct pci_dev *dev,
return 0;
}
-static int report_frozen_detected(struct pci_dev *dev, void *data)
-{
- return report_error_detected(dev, pci_channel_io_frozen, data);
-}
-
static int report_normal_detected(struct pci_dev *dev, void *data)
{
return report_error_detected(dev, pci_channel_io_normal, data);
@@ -146,9 +141,59 @@ static int report_resume(struct pci_dev *dev, void *data)
return 0;
}
-pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
- pci_channel_state_t state,
+static pci_ers_result_t pcie_do_fatal_recovery(struct pci_dev *dev,
pci_ers_result_t (*reset_link)(struct pci_dev *pdev))
+{
+ struct pci_dev *udev;
+ struct pci_bus *parent;
+ struct pci_dev *pdev, *temp;
+ pci_ers_result_t result;
+
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+ udev = dev;
+ else
+ udev = dev->bus->self;
+
+ parent = udev->subordinate;
+ pci_walk_bus(parent, pci_dev_set_disconnected, NULL);
+
+ pci_lock_rescan_remove();
+ pci_dev_get(dev);
+ list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
+ bus_list) {
+ pci_stop_and_remove_bus_device(pdev);
+ }
+
+ result = reset_link(udev);
+
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ /*
+ * If the error is reported by a bridge, we think this error
+ * is related to the downstream link of the bridge, so we
+ * do error recovery on all subordinates of the bridge instead
+ * of the bridge and clear the error status of the bridge.
+ */
+ pci_aer_clear_fatal_status(dev);
+ if (pcie_aer_is_native(dev))
+ pcie_clear_device_status(dev);
+ }
+
+ if (result == PCI_ERS_RESULT_RECOVERED) {
+ if (pcie_wait_for_link(udev, true))
+ pci_rescan_bus(udev->bus);
+ pci_info(dev, "Device recovery from fatal error successful\n");
+ } else {
+ pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
+ pci_info(dev, "Device recovery from fatal error failed\n");
+ }
+
+ pci_dev_put(dev);
+ pci_unlock_rescan_remove();
+
+ return result;
+}
+
+pci_ers_result_t pcie_do_nonfatal_recovery(struct pci_dev *dev)
{
pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
struct pci_bus *bus;
@@ -164,16 +209,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
bus = dev->subordinate;
pci_dbg(dev, "broadcast error_detected message\n");
- if (state == pci_channel_io_frozen) {
- pci_walk_bus(bus, report_frozen_detected, &status);
- status = reset_link(dev);
- if (status != PCI_ERS_RESULT_RECOVERED) {
- pci_warn(dev, "link reset failed\n");
- goto failed;
- }
- } else {
- pci_walk_bus(bus, report_normal_detected, &status);
- }
+ pci_walk_bus(bus, report_normal_detected, &status);
if (status == PCI_ERS_RESULT_CAN_RECOVER) {
status = PCI_ERS_RESULT_RECOVERED;