From patchwork Thu Jan 10 10:29:02 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Saeed Mahameed X-Patchwork-Id: 1022845 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming-netdev@ozlabs.org Delivered-To: patchwork-incoming-netdev@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=netdev-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=mellanox.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 43b2LR56wtz9sMQ for ; Thu, 10 Jan 2019 21:29:51 +1100 (AEDT) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1728193AbfAJK3u (ORCPT ); Thu, 10 Jan 2019 05:29:50 -0500 Received: from mail-il-dmz.mellanox.com ([193.47.165.129]:41005 "EHLO mellanox.co.il" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1726255AbfAJK3u (ORCPT ); Thu, 10 Jan 2019 05:29:50 -0500 Received: from Internal Mail-Server by MTLPINE1 (envelope-from saeedm@mellanox.com) with ESMTPS (AES256-SHA encrypted); 10 Jan 2019 12:29:43 +0200 Received: from sx1.mtl.com ([172.16.5.7]) by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id x0AATDGK031830; Thu, 10 Jan 2019 12:29:39 +0200 From: Saeed Mahameed To: "David S. Miller" Cc: netdev@vger.kernel.org, Feras Daoud , Alex Vesker , Daniel Jurgens , Saeed Mahameed Subject: [net-next 6/9] net/mlx5: Handle SW reset of FW in error flow Date: Thu, 10 Jan 2019 12:29:02 +0200 Message-Id: <20190110102906.3751-7-saeedm@mellanox.com> X-Mailer: git-send-email 2.20.1 In-Reply-To: <20190110102906.3751-1-saeedm@mellanox.com> References: <20190110102906.3751-1-saeedm@mellanox.com> MIME-Version: 1.0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Feras Daoud New mlx5 adapters allow the driver to reset the FW in the event of an error, this action called "SW Reset". When an SW reset is issued on any PF all PFs enter reset state which is a recoverable condition. The existing recovery flow was designed to allow the recovery of a VF after a PF driver reload. This patch adds the sw reset to the NIC states as a preparation for sw reset handling. When a software reset is issued the following occurs: 1. The NIC interface mode is set to 7 while the reset is in progress. 2. Once the reset completes the NIC interface mode is set to 1. Signed-off-by: Alex Vesker Signed-off-by: Daniel Jurgens Reviewed-by: Feras Daoud Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_selftest.c | 2 +- .../net/ethernet/mellanox/mlx5/core/health.c | 57 +++++++++++++------ .../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +- include/linux/mlx5/driver.h | 2 +- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 4382ef85488c..840ec945ccba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -64,7 +64,7 @@ static int mlx5e_test_health_info(struct mlx5e_priv *priv) { struct mlx5_core_health *health = &priv->mdev->priv.health; - return health->sick ? 1 : 0; + return health->fatal_error ? 1 : 0; } static int mlx5e_test_link_state(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 883c1e8ffdc2..afa3fe6eef8f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -65,9 +65,16 @@ enum { MLX5_DROP_NEW_RECOVERY_WORK, }; +enum { + MLX5_SENSOR_NO_ERR = 0, + MLX5_SENSOR_PCI_COMM_ERR = 1, + MLX5_SENSOR_NIC_DISABLED = 2, + MLX5_SENSOR_NIC_SW_RESET = 3, +}; + u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) { - return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; + return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; } void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) @@ -80,18 +87,25 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) &dev->iseg->cmdq_addr_l_sz); } -static int in_fatal(struct mlx5_core_dev *dev) +static bool sensor_pci_not_working(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; struct health_buffer __iomem *h = health->health; - if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) - return 1; + /* Offline PCI reads return 0xffffffff */ + return (ioread32be(&h->fw_ver) == 0xffffffff); +} - if (ioread32be(&h->fw_ver) == 0xffffffff) - return 1; +static u32 check_fatal_sensors(struct mlx5_core_dev *dev) +{ + if (sensor_pci_not_working(dev)) + return MLX5_SENSOR_PCI_COMM_ERR; + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) + return MLX5_SENSOR_NIC_DISABLED; + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET) + return MLX5_SENSOR_NIC_SW_RESET; - return 0; + return MLX5_SENSOR_NO_ERR; } void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) @@ -101,7 +115,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) goto unlock; mlx5_core_err(dev, "start\n"); - if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) { + if (pci_channel_offline(dev->pdev) || + dev->priv.health.fatal_error != MLX5_SENSOR_NO_ERR || force) { dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mlx5_cmd_trigger_completions(dev); } @@ -143,16 +158,14 @@ static void health_recover(struct work_struct *work) struct delayed_work *dwork; struct mlx5_core_dev *dev; struct mlx5_priv *priv; - u8 nic_state; dwork = container_of(work, struct delayed_work, work); health = container_of(dwork, struct mlx5_core_health, recover_work); priv = container_of(health, struct mlx5_priv, health); dev = container_of(priv, struct mlx5_core_dev, priv); - nic_state = mlx5_get_nic_state(dev); - if (nic_state == MLX5_NIC_IFC_INVALID) { - dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n"); + if (sensor_pci_not_working(dev)) { + dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n"); return; } @@ -162,10 +175,17 @@ static void health_recover(struct work_struct *work) /* How much time to wait until health resetting the driver (in msecs) */ #define MLX5_RECOVERY_DELAY_MSECS 60000 +#define MLX5_RECOVERY_NO_DELAY 0 +static unsigned long get_recovery_delay(struct mlx5_core_dev *dev) +{ + return dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR ? + MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY; +} + static void health_care(struct work_struct *work) { - unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS); struct mlx5_core_health *health; + unsigned long recover_delay; struct mlx5_core_dev *dev; struct mlx5_priv *priv; unsigned long flags; @@ -175,6 +195,7 @@ static void health_care(struct work_struct *work) dev = container_of(priv, struct mlx5_core_dev, priv); mlx5_core_warn(dev, "handling bad device here\n"); mlx5_handle_bad_state(dev); + recover_delay = msecs_to_jiffies(get_recovery_delay(dev)); spin_lock_irqsave(&health->wq_lock, flags); if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) @@ -271,6 +292,7 @@ static void poll_health(struct timer_list *t) { struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); struct mlx5_core_health *health = &dev->priv.health; + u32 fatal_error; u32 count; if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) @@ -288,8 +310,11 @@ static void poll_health(struct timer_list *t) print_health_info(dev); } - if (in_fatal(dev) && !health->sick) { - health->sick = true; + fatal_error = check_fatal_sensors(dev); + + if (fatal_error && !health->fatal_error) { + mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); + dev->priv.health.fatal_error = fatal_error; print_health_info(dev); mlx5_trigger_health_work(dev); } @@ -303,7 +328,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) struct mlx5_core_health *health = &dev->priv.health; timer_setup(&health->timer, poll_health, 0); - health->sick = 0; + health->fatal_error = MLX5_SENSOR_NO_ERR; clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); health->health = &dev->iseg->health; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index c68dcea5985b..29a2a4546fb6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -194,7 +194,7 @@ enum { MLX5_NIC_IFC_FULL = 0, MLX5_NIC_IFC_DISABLED = 1, MLX5_NIC_IFC_NO_DRAM_NIC = 2, - MLX5_NIC_IFC_INVALID = 3 + MLX5_NIC_IFC_SW_RESET = 7 }; u8 mlx5_get_nic_state(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2f41603dc728..71cb52bae1ae 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -434,7 +434,7 @@ struct mlx5_core_health { struct timer_list timer; u32 prev; int miss_counter; - bool sick; + u32 fatal_error; /* wq spinlock to synchronize draining */ spinlock_t wq_lock; struct workqueue_struct *wq;