From patchwork Thu Jan 17 15:05:07 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eran Ben Elisha X-Patchwork-Id: 1026707 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming-netdev@ozlabs.org Delivered-To: patchwork-incoming-netdev@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=netdev-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=mellanox.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 43gS7l24W2z9sD9 for ; Fri, 18 Jan 2019 02:05:55 +1100 (AEDT) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1728223AbfAQPFl (ORCPT ); Thu, 17 Jan 2019 10:05:41 -0500 Received: from mail-il-dmz.mellanox.com ([193.47.165.129]:55151 "EHLO mellanox.co.il" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1728069AbfAQPFj (ORCPT ); Thu, 17 Jan 2019 10:05:39 -0500 Received: from Internal Mail-Server by MTLPINE1 (envelope-from eranbe@mellanox.com) with ESMTPS (AES256-SHA encrypted); 17 Jan 2019 17:05:33 +0200 Received: from dev-l-vrt-198.mtl.labs.mlnx (dev-l-vrt-198.mtl.labs.mlnx [10.134.198.1]) by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id x0HF5WoM029214; Thu, 17 Jan 2019 17:05:33 +0200 From: Eran Ben Elisha To: netdev@vger.kernel.org, Jiri Pirko , "David S. 
Miller" , Ariel Almog , Aya Levin , Eran Ben Elisha , Moshe Shemesh Cc: Alex Vesker , Saeed Mahameed Subject: [PATCH net-next 13/27] net/mlx5: Add Crdump FW snapshot support Date: Thu, 17 Jan 2019 17:05:07 +0200 Message-Id: <1547737521-29888-14-git-send-email-eranbe@mellanox.com> X-Mailer: git-send-email 1.8.4.3 In-Reply-To: <1547737521-29888-1-git-send-email-eranbe@mellanox.com> References: <1547737521-29888-1-git-send-email-eranbe@mellanox.com> Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Alex Vesker Crdump allows the driver to create a snapshot of the FW PCI crspace. This is useful in case of catastrophic issues which require FW reset. The snapshot can be used for later debugging. The snapshot is exposed using devlink: cr-space address regions are registered on init, and each snapshot is attached to its region once the driver has collected it. Signed-off-by: Alex Vesker Signed-off-by: Moshe Shemesh Reviewed-by: Feras Daoud Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../ethernet/mellanox/mlx5/core/diag/crdump.c | 178 ++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/health.c | 1 + .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 4 + .../net/ethernet/mellanox/mlx5/core/main.c | 5 + include/linux/mlx5/driver.h | 4 + 6 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 9a2144c0901b..9f0b96a6c915 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -16,7 +16,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \ fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \ - diag/fw_tracer.o devlink.o + 
diag/fw_tracer.o diag/crdump.o devlink.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c new file mode 100644 index 000000000000..18a413913b6e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2018 Mellanox Technologies */ + +#include +#include +#include +#include "mlx5_core.h" +#include "lib/pci_vsc.h" +/* BAD_ACCESS is written to every 32-bit word of the dump buffer before the gateway read (see mlx5_crdump_fill), presumably so that words the read does not overwrite stay recognizable. */ +#define BAD_ACCESS 0xBADACCE5 +#define MLX5_PROTECTED_CR_SCAN_CRSPACE 0x7 /* NOTE(review): not referenced anywhere in this file as posted */ +#define MAX_NUM_OF_DUMPS_TO_STORE (8) /* third argument to devlink_region_create(); presumably the per-region snapshot limit, confirm against the devlink API */ + +static const char *region_cr_space_str = "cr-space"; +/* Per-device crdump context, installed in priv->health.crdump by mlx5_crdump_init() and released by mlx5_crdump_cleanup(). */ +struct mlx5_fw_crdump { + u32 size; /* space size obtained from mlx5_vsc_gw_set_space() at init; used as a byte count for the dump buffer */ + struct devlink_region *region_crspace; /* region returned by devlink_region_create() */ +}; +/* Report whether a crdump context is currently installed in priv->health.crdump. NOTE(review): the name is misspelled (enbaled, presumably meant enabled) and the function has external linkage, yet the lib/mlx5.h hunk of this patch adds no prototype for it; consider renaming it and either declaring it or making it static. */ +bool mlx5_crdump_enbaled(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + return (!!priv->health.crdump); +} +/* Read the crspace through the VSC gateway and attach it to the cr-space devlink region as a new snapshot. Its only caller in this file, mlx5_crdump_collect(), takes the gateway lock and selects the scan space first. Steps: allocate crdump->size bytes, pre-fill every 32-bit word with BAD_ACCESS, read via mlx5_vsc_gw_read_block_fast(), then pass the buffer to devlink_region_snapshot_create() with kvfree as the destructor argument. On success stores the snapshot id in *snapshot_id, copies the region name into crdump_region and returns 0 without freeing the buffer here. On failure returns a negative errno (-ENOMEM, -EIO for a read that returned 0, -EINVAL for a short read, or the error from the read or from devlink) and frees the buffer locally. */ +static int mlx5_crdump_fill(struct mlx5_core_dev *dev, + char *crdump_region, u32 *snapshot_id) +{ + struct devlink *devlink = priv_to_devlink(dev); + struct mlx5_priv *priv = &dev->priv; + struct mlx5_fw_crdump *crdump = priv->health.crdump; + int i, ret = 0; + u32 *cr_data; + u32 id; + + cr_data = kvmalloc(crdump->size, GFP_KERNEL); + if (!cr_data) + return -ENOMEM; + + for (i = 0; i < (crdump->size / 4); i++) + cr_data[i] = BAD_ACCESS; + + ret = mlx5_vsc_gw_read_block_fast(dev, cr_data, crdump->size); + if (ret <= 0) { + if (ret == 0) + ret = -EIO; + goto free_data; + } + + if (crdump->size != ret) { + mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n", + ret, crdump->size); + ret = -EINVAL; + goto free_data; + } + + /* Get the available snapshot ID for the dumps */ + id = devlink_region_shapshot_id_get(devlink); + ret = devlink_region_snapshot_create(crdump->region_crspace, + crdump->size, (u8 *)cr_data, + id, &kvfree); + if (ret) { + mlx5_core_warn(dev, "crdump: devlink create %s snapshot id %d err %d\n", + region_cr_space_str, id, ret); + goto free_data; 
+ } else { + *snapshot_id = id; + strcpy(crdump_region, region_cr_space_str); /* NOTE(review): unbounded copy; the capacity of crdump_region is not visible in this file, so the caller must guarantee room for the region name plus the terminator, or this should become a bounded copy */ + } + return 0; + +free_data: + kvfree(cr_data); + return ret; +} +/* Capture one crspace snapshot. Returns -ENODEV when no crdump context is installed; otherwise takes the VSC gateway lock, selects MLX5_VSC_SPACE_SCAN_CRSPACE, runs mlx5_crdump_fill() and unlocks. The result is the error of the first failing step or the value returned by mlx5_crdump_fill(). NOTE(review): the return value of mlx5_vsc_gw_unlock() is not checked on this path. crdump_region and snapshot_id are outputs, written only when the fill succeeds. */ +int mlx5_crdump_collect(struct mlx5_core_dev *dev, + char *crdump_region, u32 *snapshot_id) +{ + int ret = 0; + + if (!mlx5_crdump_enbaled(dev)) + return -ENODEV; + + ret = mlx5_vsc_gw_lock(dev); + if (ret) { + mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n", + ret); + return ret; + } + + ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL); + if (ret) + goto unlock; + + ret = mlx5_crdump_fill(dev, crdump_region, snapshot_id); + +unlock: + mlx5_vsc_gw_unlock(dev); + return ret; +} +/* Probe crspace support and register the cr-space devlink region. Returns 0 without installing a context when the device is not a PF, the VSC is not accessible, a context is already installed, or selecting the scan space fails (that error is deliberately masked). Returns -EINVAL for a zero space size, the lock or unlock error, -ENOMEM, or the PTR_ERR value of devlink_region_create(). On success priv->health.crdump points at the new context. */ +int mlx5_crdump_init(struct mlx5_core_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(dev); + struct mlx5_priv *priv = &dev->priv; + struct mlx5_fw_crdump *crdump; + u32 space_size; + int ret; + + if (!mlx5_core_is_pf(dev) || !mlx5_vsc_accessible(dev) || + mlx5_crdump_enbaled(dev)) + return 0; + + ret = mlx5_vsc_gw_lock(dev); + if (ret) + return ret; + + /* Check if space is supported and get space size */ + ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, + &space_size); + if (ret) { + /* Unlock and mask error since space is not supported */ + mlx5_vsc_gw_unlock(dev); + return 0; + } + + if (!space_size) { + mlx5_core_warn(dev, "Invalid Crspace size, zero\n"); + mlx5_vsc_gw_unlock(dev); + return -EINVAL; + } + + ret = mlx5_vsc_gw_unlock(dev); + if (ret) + return ret; + + crdump = kzalloc(sizeof(*crdump), GFP_KERNEL); + if (!crdump) + return -ENOMEM; + + /* Create cr-space region */ + crdump->size = space_size; + crdump->region_crspace = + devlink_region_create(devlink, + region_cr_space_str, + MAX_NUM_OF_DUMPS_TO_STORE, + space_size); + if (IS_ERR(crdump->region_crspace)) { + mlx5_core_warn(dev, + "crdump: create devlink region %s err %ld\n", + region_cr_space_str, + PTR_ERR(crdump->region_crspace)); + ret = PTR_ERR(crdump->region_crspace); + goto free_crdump; + } + priv->health.crdump = 
crdump; + return 0; + +free_crdump: + kfree(crdump); + return ret; +} +/* Undo mlx5_crdump_init(): destroy the devlink region, free the context and reset priv->health.crdump to NULL. Does nothing when no context is installed. */ +void mlx5_crdump_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_fw_crdump *crdump = priv->health.crdump; + + if (!crdump) + return; + + devlink_region_destroy(crdump->region_crspace); + kfree(crdump); + priv->health.crdump = NULL; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 196c07383082..883c1e8ffdc2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -378,6 +378,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); INIT_DELAYED_WORK(&health->recover_work, health_recover); + health->crdump = NULL; return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index 397a2847867a..3c9a6dedccaa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -41,6 +41,10 @@ int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count); void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count); int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index); void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index); +int mlx5_crdump_init(struct mlx5_core_dev *dev); +void mlx5_crdump_cleanup(struct mlx5_core_dev *dev); +int mlx5_crdump_collect(struct mlx5_core_dev *dev, + char *crdump_region, u32 *snapshot_id); /* TODO move to lib/events.h */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 124d6ccaf2f8..c6c07ca0e423 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1210,6 +1210,10 @@ static int init_one(struct pci_dev *pdev, if 
(err) goto clean_load; + err = mlx5_crdump_init(dev); + if (err) + dev_err(&pdev->dev, "mlx5_crdump_init failed with error code %d\n", err); + pci_save_state(pdev); return 0; @@ -1233,6 +1237,7 @@ static void remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(dev); struct mlx5_priv *priv = &dev->priv; + mlx5_crdump_cleanup(dev); mlx5_devlink_unregister(devlink); mlx5_unregister_device(dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e086f1fb5eea..77c04154ecf3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -53,6 +53,7 @@ #include #include #include +#include enum { MLX5_BOARD_ID_LEN = 64, @@ -425,6 +426,8 @@ struct mlx5_sq_bfreg { unsigned int offset; }; +struct mlx5_fw_crdump; + struct mlx5_core_health { struct health_buffer __iomem *health; __be32 __iomem *health_counter; @@ -438,6 +441,7 @@ struct mlx5_core_health { unsigned long flags; struct work_struct work; struct delayed_work recover_work; + struct mlx5_fw_crdump *crdump; }; struct mlx5_qp_table {