@@ -234,21 +234,54 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
.diagnose = mlx5_fw_reporter_diagnose,
};
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+static int
+mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
+ void *priv_ctx) /* priv_ctx is not used by this callback */
{
- struct devlink *devlink = priv_to_devlink(dev);
+ struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+ u8 nic_state;
- dev->fw_reporter = devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
- 0, false, dev);
- return PTR_ERR_OR_ZERO(dev->fw_reporter);
+ nic_state = mlx5_get_nic_state(dev);
+ if (nic_state == MLX5_NIC_IFC_INVALID) { /* refuse to recover in this state */
+ dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
+ return -ECANCELED;
+ }
+ dev_err(&dev->pdev->dev, "starting health recovery flow\n");
+
+ mlx5_recover_device(dev); /* result, if any, is not checked */
+
+ return 0; /* success is reported once recovery has been started */
}
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
+ .name = "FW_fatal",
+ .recover = mlx5_fw_fatal_reporter_recover, /* only callback set */
+};
+
+#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 120000 /* presumably ms - confirm */
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
{
- if (!dev->fw_reporter)
- return;
+ struct devlink *devlink = priv_to_devlink(dev);
- devlink_health_reporter_destroy(dev->fw_reporter);
+ dev->fw_reporter =
+ devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
+ 0, false, dev);
+ if (IS_ERR(dev->fw_reporter)) /* the ERR_PTR stays stored in dev */
+ return PTR_ERR(dev->fw_reporter); /* FW_fatal is not created */
+
+ dev->fw_fatal_reporter =
+ devlink_health_reporter_create(devlink, &mlx5_fw_fatal_reporter_ops,
+ MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+ true, dev);
+ return PTR_ERR_OR_ZERO(dev->fw_fatal_reporter); /* 0 or -errno */
+}
+
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
+{
+	if (!IS_ERR_OR_NULL(dev->fw_reporter)) /* ERR_PTR if create failed */
+		devlink_health_reporter_destroy(dev->fw_reporter);
+	if (!IS_ERR_OR_NULL(dev->fw_fatal_reporter)) /* may hold ERR_PTR too */
+		devlink_health_reporter_destroy(dev->fw_fatal_reporter);
}
int mlx5_devlink_register(struct devlink *devlink, struct device *dev)
@@ -14,8 +14,8 @@ struct mlx5_fw_reporter_ctx {
int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
void mlx5_devlink_unregister(struct devlink *devlink);
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev);
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev);
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev);
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev);
void mlx5_fw_reporter_err_work(struct work_struct *work);
#endif /* __MLX5_DEVLINK_H__ */
@@ -981,9 +981,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
goto err_fw_tracer;
}
- err = mlx5_fw_reporter_create(dev);
+ err = mlx5_fw_reporters_create(dev);
if (err)
- dev_warn(&pdev->dev, "Failed to create FW reporter\n");
+ dev_warn(&pdev->dev, "Failed to create FW reporters\n");
err = mlx5_fpga_device_start(dev);
if (err) {
@@ -1117,7 +1117,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
mlx5_accel_ipsec_cleanup(dev);
mlx5_accel_tls_cleanup(dev);
mlx5_fpga_device_stop(dev);
- mlx5_fw_reporter_destroy(dev);
+ mlx5_fw_reporters_destroy(dev);
mlx5_fw_tracer_cleanup(dev->tracer);
mlx5_eq_table_destroy(dev);
mlx5_pagealloc_stop(dev);
@@ -681,6 +681,7 @@ struct mlx5_core_dev {
struct page *clock_info_page;
struct mlx5_fw_tracer *tracer;
struct devlink_health_reporter *fw_reporter;
+ struct devlink_health_reporter *fw_fatal_reporter;
};
struct mlx5_db {