From patchwork Thu Jan 17 15:05:10 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eran Ben Elisha X-Patchwork-Id: 1026713 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming-netdev@ozlabs.org Delivered-To: patchwork-incoming-netdev@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=netdev-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=mellanox.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 43gS7y2HWrz9sCh for ; Fri, 18 Jan 2019 02:06:06 +1100 (AEDT) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1728201AbfAQPFk (ORCPT ); Thu, 17 Jan 2019 10:05:40 -0500 Received: from mail-il-dmz.mellanox.com ([193.47.165.129]:55167 "EHLO mellanox.co.il" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1728071AbfAQPFj (ORCPT ); Thu, 17 Jan 2019 10:05:39 -0500 Received: from Internal Mail-Server by MTLPINE1 (envelope-from eranbe@mellanox.com) with ESMTPS (AES256-SHA encrypted); 17 Jan 2019 17:05:33 +0200 Received: from dev-l-vrt-198.mtl.labs.mlnx (dev-l-vrt-198.mtl.labs.mlnx [10.134.198.1]) by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id x0HF5WoP029214; Thu, 17 Jan 2019 17:05:33 +0200 From: Eran Ben Elisha To: netdev@vger.kernel.org, Jiri Pirko , "David S. Miller" , Ariel Almog , Aya Levin , Eran Ben Elisha , Moshe Shemesh Cc: Feras Daoud , Daniel Jurgens , Saeed Mahameed Subject: [PATCH net-next 16/27] net/mlx5: Handle SW reset of FW in error flow Date: Thu, 17 Jan 2019 17:05:10 +0200 Message-Id: <1547737521-29888-17-git-send-email-eranbe@mellanox.com> X-Mailer: git-send-email 1.8.4.3 In-Reply-To: <1547737521-29888-1-git-send-email-eranbe@mellanox.com> References: <1547737521-29888-1-git-send-email-eranbe@mellanox.com> Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Feras Daoud New mlx5 adapters allow the driver to reset the FW in the event of an error, this action called "SW Reset". When an SW reset is issued on any PF all PFs enter reset state which is a recoverable condition. The existing recovery flow was designed to allow the recovery of a VF after a PF driver reload. This patch adds the sw reset to the NIC states as a preparation for sw reset handling. When a software reset is issued the following occurs: 1. The NIC interface mode is set to 7 while the reset is in progress. 2. Once the reset completes the NIC interface mode is set to 1. Signed-off-by: Feras Daoud Signed-off-by: Daniel Jurgens Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_selftest.c | 2 +- .../net/ethernet/mellanox/mlx5/core/health.c | 57 +++++++++++++------ .../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +- include/linux/mlx5/driver.h | 2 +- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 4382ef85488c..840ec945ccba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -64,7 +64,7 @@ static int mlx5e_test_health_info(struct mlx5e_priv *priv) { struct mlx5_core_health *health = &priv->mdev->priv.health; - return health->sick ? 1 : 0; + return health->fatal_error ? 1 : 0; } static int mlx5e_test_link_state(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 883c1e8ffdc2..afa3fe6eef8f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -65,9 +65,16 @@ enum { MLX5_DROP_NEW_RECOVERY_WORK, }; +enum { + MLX5_SENSOR_NO_ERR = 0, + MLX5_SENSOR_PCI_COMM_ERR = 1, + MLX5_SENSOR_NIC_DISABLED = 2, + MLX5_SENSOR_NIC_SW_RESET = 3, +}; + u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) { - return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; + return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; } void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) @@ -80,18 +87,25 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) &dev->iseg->cmdq_addr_l_sz); } -static int in_fatal(struct mlx5_core_dev *dev) +static bool sensor_pci_not_working(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; struct health_buffer __iomem *h = health->health; - if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) - return 1; + /* Offline PCI reads return 0xffffffff */ + return (ioread32be(&h->fw_ver) == 0xffffffff); +} - if (ioread32be(&h->fw_ver) == 0xffffffff) - return 1; +static u32 check_fatal_sensors(struct mlx5_core_dev *dev) +{ + if (sensor_pci_not_working(dev)) + return MLX5_SENSOR_PCI_COMM_ERR; + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) + return MLX5_SENSOR_NIC_DISABLED; + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET) + return MLX5_SENSOR_NIC_SW_RESET; - return 0; + return MLX5_SENSOR_NO_ERR; } void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) @@ -101,7 +115,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) goto unlock; mlx5_core_err(dev, "start\n"); - if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) { + if (pci_channel_offline(dev->pdev) || + dev->priv.health.fatal_error != MLX5_SENSOR_NO_ERR || force) { dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mlx5_cmd_trigger_completions(dev); } @@ -143,16 +158,14 @@ static void health_recover(struct work_struct *work) struct delayed_work *dwork; struct mlx5_core_dev *dev; struct mlx5_priv *priv; - u8 nic_state; dwork = container_of(work, struct delayed_work, work); health = container_of(dwork, struct mlx5_core_health, recover_work); priv = container_of(health, struct mlx5_priv, health); dev = container_of(priv, struct mlx5_core_dev, priv); - nic_state = mlx5_get_nic_state(dev); - if (nic_state == MLX5_NIC_IFC_INVALID) { - dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n"); + if (sensor_pci_not_working(dev)) { + dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n"); return; } @@ -162,10 +175,17 @@ static void health_recover(struct work_struct *work) /* How much time to wait until health resetting the driver (in msecs) */ #define MLX5_RECOVERY_DELAY_MSECS 60000 +#define MLX5_RECOVERY_NO_DELAY 0 +static unsigned long get_recovery_delay(struct mlx5_core_dev *dev) +{ + return dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR ? + MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY; +} + static void health_care(struct work_struct *work) { - unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS); struct mlx5_core_health *health; + unsigned long recover_delay; struct mlx5_core_dev *dev; struct mlx5_priv *priv; unsigned long flags; @@ -175,6 +195,7 @@ static void health_care(struct work_struct *work) dev = container_of(priv, struct mlx5_core_dev, priv); mlx5_core_warn(dev, "handling bad device here\n"); mlx5_handle_bad_state(dev); + recover_delay = msecs_to_jiffies(get_recovery_delay(dev)); spin_lock_irqsave(&health->wq_lock, flags); if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) @@ -271,6 +292,7 @@ static void poll_health(struct timer_list *t) { struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); struct mlx5_core_health *health = &dev->priv.health; + u32 fatal_error; u32 count; if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) @@ -288,8 +310,11 @@ static void poll_health(struct timer_list *t) print_health_info(dev); } - if (in_fatal(dev) && !health->sick) { - health->sick = true; + fatal_error = check_fatal_sensors(dev); + + if (fatal_error && !health->fatal_error) { + mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); + dev->priv.health.fatal_error = fatal_error; print_health_info(dev); mlx5_trigger_health_work(dev); } @@ -303,7 +328,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) struct mlx5_core_health *health = &dev->priv.health; timer_setup(&health->timer, poll_health, 0); - health->sick = 0; + health->fatal_error = MLX5_SENSOR_NO_ERR; clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); health->health = &dev->iseg->health; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index c68dcea5985b..29a2a4546fb6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -194,7 +194,7 @@ enum { MLX5_NIC_IFC_FULL = 0, MLX5_NIC_IFC_DISABLED = 1, MLX5_NIC_IFC_NO_DRAM_NIC = 2, - MLX5_NIC_IFC_INVALID = 3 + MLX5_NIC_IFC_SW_RESET = 7 }; u8 mlx5_get_nic_state(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 77c04154ecf3..ac138180f08b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -434,7 +434,7 @@ struct mlx5_core_health { struct timer_list timer; u32 prev; int miss_counter; - bool sick; + u32 fatal_error; /* wq spinlock to synchronize draining */ spinlock_t wq_lock; struct workqueue_struct *wq;