@@ -235,21 +235,52 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
.diagnose = mlx5_fw_reporter_diagnose,
};
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+/*
+ * Recover callback of the "FW_fatal" devlink health reporter.
+ *
+ * @reporter: reporter whose private data is the owning mlx5_core_dev.
+ * @priv_ctx: not used by this callback.
+ *
+ * Returns -ECANCELED without attempting recovery when
+ * mlx5_sensor_pci_not_working() still reports a failure; otherwise calls
+ * mlx5_recover_device() and returns 0. Any result of mlx5_recover_device()
+ * is not propagated to the caller.
+ */
+static int
+mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
+ void *priv_ctx)
{
- struct devlink *devlink = priv_to_devlink(dev);
+ struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
- dev->fw_reporter = devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
- 0, false, dev);
- return PTR_ERR_OR_ZERO(dev->fw_reporter);
+ /* Bail out early while the PCI sensor still reports a failure. */
+ if (mlx5_sensor_pci_not_working(dev)) {
+ dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
+ return -ECANCELED;
+ }
+ dev_err(&dev->pdev->dev, "starting health recovery flow\n");
+
+ mlx5_recover_device(dev);
+
+ return 0;
}
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+/* Ops table of the "FW_fatal" reporter; only a recover callback is set. */
+static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
+ .name = "FW_fatal",
+ .recover = mlx5_fw_fatal_reporter_recover,
+};
+
+/* NOTE(review): presumably milliseconds (i.e. 20 minutes) - confirm against the devlink health API. */
+#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000
+/*
+ * Create the two firmware health reporters of @dev: dev->fw_reporter (using
+ * mlx5_fw_reporter_ops) and dev->fw_fatal_reporter (using
+ * mlx5_fw_fatal_reporter_ops).
+ *
+ * Returns 0 when both reporters were created, or the negative error code of
+ * the first creation that failed. On failure the corresponding field is left
+ * holding the error pointer returned by devlink_health_reporter_create(); if
+ * the first creation fails, dev->fw_fatal_reporter is not touched at all.
+ */
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
{
- if (!dev->fw_reporter)
- return;
+ struct devlink *devlink = priv_to_devlink(dev);
- devlink_health_reporter_destroy(dev->fw_reporter);
+ dev->fw_reporter =
+ devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
+ 0, false, dev);
+ if (IS_ERR(dev->fw_reporter))
+ return PTR_ERR(dev->fw_reporter);
+
+ dev->fw_fatal_reporter =
+ devlink_health_reporter_create(devlink, &mlx5_fw_fatal_reporter_ops,
+ MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+ true, dev);
+ return PTR_ERR_OR_ZERO(dev->fw_fatal_reporter);
+}
+
+/*
+ * Destroy the firmware health reporters of @dev, if they exist.
+ *
+ * Each reporter field may be NULL (never created) or an error pointer (a
+ * failed devlink_health_reporter_create() stores its result directly in the
+ * field, and the load path only warns on that failure). Neither value may be
+ * handed to devlink_health_reporter_destroy(), hence the IS_ERR_OR_NULL()
+ * guards. Both fields are reset to NULL afterwards so that a later call,
+ * e.g. after a failed re-create, never sees a stale pointer.
+ */
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
+{
+	if (!IS_ERR_OR_NULL(dev->fw_reporter))
+		devlink_health_reporter_destroy(dev->fw_reporter);
+	dev->fw_reporter = NULL;
+
+	if (!IS_ERR_OR_NULL(dev->fw_fatal_reporter))
+		devlink_health_reporter_destroy(dev->fw_fatal_reporter);
+	dev->fw_fatal_reporter = NULL;
}
static int mlx5_devlink_get_crdump_snapshot(struct devlink *devlink, u32 id,
@@ -14,8 +14,8 @@ struct mlx5_fw_reporter_ctx {
int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
void mlx5_devlink_unregister(struct devlink *devlink);
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev);
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev);
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev);
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev);
void mlx5_fw_reporter_err_work(struct work_struct *work);
#endif /* __MLX5_DEVLINK_H__ */
@@ -91,7 +91,7 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
&dev->iseg->cmdq_addr_l_sz);
}
-static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
+bool mlx5_sensor_pci_not_working(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
struct health_buffer __iomem *h = health->health;
@@ -114,7 +114,7 @@ static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
{
- if (sensor_pci_not_working(dev))
+ if (mlx5_sensor_pci_not_working(dev))
return MLX5_SENSOR_PCI_COMM_ERR;
if (pci_channel_offline(dev->pdev))
return MLX5_SENSOR_PCI_ERR;
@@ -315,7 +315,7 @@ static void health_recover(struct work_struct *work)
priv = container_of(health, struct mlx5_priv, health);
dev = container_of(priv, struct mlx5_core_dev, priv);
- if (sensor_pci_not_working(dev)) {
+ if (mlx5_sensor_pci_not_working(dev)) {
dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
return;
}
@@ -980,9 +980,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
goto err_fw_tracer;
}
- err = mlx5_fw_reporter_create(dev);
+ err = mlx5_fw_reporters_create(dev);
if (err)
- dev_warn(&pdev->dev, "Failed to create FW reporter\n");
+ dev_warn(&pdev->dev, "Failed to create FW reporters\n");
err = mlx5_fpga_device_start(dev);
if (err) {
@@ -1116,7 +1116,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
mlx5_accel_ipsec_cleanup(dev);
mlx5_accel_tls_cleanup(dev);
mlx5_fpga_device_stop(dev);
- mlx5_fw_reporter_destroy(dev);
+ mlx5_fw_reporters_destroy(dev);
mlx5_fw_tracer_cleanup(dev->tracer);
mlx5_eq_table_destroy(dev);
mlx5_pagealloc_stop(dev);
@@ -204,6 +204,7 @@ enum {
u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state);
+bool mlx5_sensor_pci_not_working(struct mlx5_core_dev *dev);
#define HEALTH_INFO_MAX_LINE 64
#define HEALTH_INFO_LINES (MLX5_FLD_SZ_DW(health_buffer, assert_var) + 8)
@@ -686,6 +686,7 @@ struct mlx5_core_dev {
struct page *clock_info_page;
struct mlx5_fw_tracer *tracer;
struct devlink_health_reporter *fw_reporter;
+ struct devlink_health_reporter *fw_fatal_reporter;
u32 vsc_addr;
};