diff mbox

[4/5] nvlink: Add freeze and fence error injection

Message ID 1452137792-24062-4-git-send-email-ruscur@russell.cc
State Superseded
Headers show

Commit Message

Russell Currey Jan. 7, 2016, 3:36 a.m. UTC
Enable NPU freeze and fence injection through debugfs.

For example, if an NPU is PCI bus 8, a freeze on PE 0 can be injected with:

echo 0:0:0:0:0 >> /sys/kernel/debug/powerpc/PCI0008/err_injct

or a fence on PE 2 on PCI bus 9 with:

echo 2:1:0:0:0 >> /sys/kernel/debug/powerpc/PCI0009/err_injct

These will cause the appropriate EEH event to occur upon a DMA to the
NVLink.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 hw/npu.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

Comments

Alistair Popple Jan. 14, 2016, 3:14 a.m. UTC | #1
On Thu, 7 Jan 2016 14:36:31 Russell Currey wrote:
> Enable NPU freeze and fence injection through debugfs.
> 
> For example, if a NPU is PCI bus 8, a freeze on PE 0 can be injected with:
> 
> echo 0:0:0:0:0 >> /sys/kernel/debug/powerpc/PCI0008/err_injct
> 
> or a fence on PE 2 on PCI bus 9 with:
> 
> echo 2:1:0:0:0 >> /sys/kernel/debug/powerpc/PCI0009/err_injct
> 
> These will cause the appropriate EEH event to occur upon a DMA to the
> NVLink.
> 
> Signed-off-by: Russell Currey <ruscur@russell.cc>
> ---
>  hw/npu.c | 31 ++++++++++++++++++++++++++++++-
>  1 file changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/npu.c b/hw/npu.c
> index 9440f17..ba61d2d 100644
> --- a/hw/npu.c
> +++ b/hw/npu.c
> @@ -1016,6 +1016,35 @@ static int64_t npu_freeze_status(struct phb *phb __unused,
>  	return OPAL_SUCCESS;
>  }
>  
> +/* Sets the NPU to trigger an error when a DMA occurs */
> +static int64_t npu_err_inject(struct phb *phb, uint32_t pe_no,
> +			      uint32_t type, uint32_t func __unused,
> +			      uint64_t addr __unused, uint64_t mask __unused)
> +{
> +	struct npu *p = phb_to_npu(phb);
> +	struct npu_dev *dev;
> +
> +	if (pe_no > NPU_NUM_OF_PES) {
> +		prlog(PR_ERR, "NPU: error injection failed, bad PE given\n");
> +		return OPAL_SUCCESS;

Shouldn't we return some kind of failure code rather than OPAL_SUCCESS?

> +	}
> +
> +	dev = &p->devices[pe_no];

The assumption pe_no == device number doesn't always hold, particularly with
the kernel patches to assign PE#s per GPU rather than per-link. We need to
find the struct npu_dev by PE# rather than device number (we could add a
pe_num field to struct npu_dev for example).

> +
> +	/* TODO: extend this to conform to OPAL injection standards */
> +	if (type > 1) {
> +		prlog(PR_ERR, "NPU: invalid error injection type\n");
> +	} else if (type == 1) {
> +		/* Emulate fence mode. */
> +		p->fenced = true;
> +	} else {
> +		/* Cause a freeze with an invalid MMIO write. */
> +		in_be64((void *)dev->bar.base);
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
>  static const struct phb_ops npu_ops = {
>  	.lock			= npu_lock,
>  	.unlock			= npu_unlock,
> @@ -1055,7 +1084,7 @@ static const struct phb_ops npu_ops = {
>  	.eeh_freeze_clear	= NULL,
>  	.eeh_freeze_set		= NULL,
>  	.next_error		= NULL,
> -	.err_inject		= NULL,
> +	.err_inject		= npu_err_inject,
>  	.get_diag_data		= NULL,
>  	.get_diag_data2		= NULL,
>  	.set_capi_mode		= NULL,
>
diff mbox

Patch

diff --git a/hw/npu.c b/hw/npu.c
index 9440f17..ba61d2d 100644
--- a/hw/npu.c
+++ b/hw/npu.c
@@ -1016,6 +1016,35 @@  static int64_t npu_freeze_status(struct phb *phb __unused,
 	return OPAL_SUCCESS;
 }
 
+/* Inject an EEH error: type 0 freezes PE pe_no, type 1 fences the NPU. */
+static int64_t npu_err_inject(struct phb *phb, uint32_t pe_no,
+			      uint32_t type, uint32_t func __unused,
+			      uint64_t addr __unused, uint64_t mask __unused)
+{
+	struct npu *p = phb_to_npu(phb);
+
+	/* Assumes NPU_NUM_OF_PES is a count, i.e. valid PEs are 0..N-1. */
+	if (pe_no >= NPU_NUM_OF_PES) {
+		prlog(PR_ERR, "NPU: error injection failed, bad PE given\n");
+		return OPAL_PARAMETER;
+	}
+
+	/* TODO: extend this to conform to OPAL injection standards */
+	if (type > 1) {
+		prlog(PR_ERR, "NPU: invalid error injection type\n");
+		return OPAL_PARAMETER;
+	} else if (type == 1) {
+		/* Emulate fence mode. */
+		p->fenced = true;
+	} else {
+		/* Cause a freeze with an invalid MMIO read. */
+		/* TODO: find the device by PE#; PE# == index is assumed. */
+		in_be64((void *)p->devices[pe_no].bar.base);
+	}
+
+	return OPAL_SUCCESS;
+}
+
 static const struct phb_ops npu_ops = {
 	.lock			= npu_lock,
 	.unlock			= npu_unlock,
@@ -1055,7 +1084,7 @@  static const struct phb_ops npu_ops = {
 	.eeh_freeze_clear	= NULL,
 	.eeh_freeze_set		= NULL,
 	.next_error		= NULL,
-	.err_inject		= NULL,
+	.err_inject		= npu_err_inject,
 	.get_diag_data		= NULL,
 	.get_diag_data2		= NULL,
 	.set_capi_mode		= NULL,