Message ID | 1452137792-24062-4-git-send-email-ruscur@russell.cc |
---|---|
State | Superseded |
Headers | show |
On Thu, 7 Jan 2016 14:36:31 Russell Currey wrote:
> Enable NPU freeze and fence injection through debugfs.
>
> For example, if a NPU is PCI bus 8, a freeze on PE 0 can be injected with:
>
> echo 0:0:0:0:0 >> /sys/kernel/debug/powerpc/PCI0008/err_injct
>
> or a fence on PE 2 on PCI bus 9 with:
>
> echo 2:1:0:0:0 >> /sys/kernel/debug/powerpc/PCI0009/err_injct
>
> These will cause the appropriate EEH event to occur upon a DMA to the
> NVLink.
>
> Signed-off-by: Russell Currey <ruscur@russell.cc>
> ---
>  hw/npu.c | 31 ++++++++++++++++++++++++++++++-
>  1 file changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/hw/npu.c b/hw/npu.c
> index 9440f17..ba61d2d 100644
> --- a/hw/npu.c
> +++ b/hw/npu.c
> @@ -1016,6 +1016,35 @@ static int64_t npu_freeze_status(struct phb *phb __unused,
>  	return OPAL_SUCCESS;
>  }
>
> +/* Sets the NPU to trigger an error when a DMA occurs */
> +static int64_t npu_err_inject(struct phb *phb, uint32_t pe_no,
> +			      uint32_t type, uint32_t func __unused,
> +			      uint64_t addr __unused, uint64_t mask __unused)
> +{
> +	struct npu *p = phb_to_npu(phb);
> +	struct npu_dev *dev;
> +
> +	if (pe_no > NPU_NUM_OF_PES) {
> +		prlog(PR_ERR, "NPU: error injection failed, bad PE given\n");
> +		return OPAL_SUCCESS;

Shouldn't we return some kind of failure code rather than OPAL_SUCCESS?

> +	}
> +
> +	dev = &p->devices[pe_no];

The assumption pe_no == device number doesn't always hold, particularly
with the kernel patches to assign PE#s per GPU rather than per-link. We
need to find the struct npu_dev by PE# rather than device number (we could
add a pe_num field to struct npu_dev for example).

> +
> +	/* TODO: extend this to conform to OPAL injection standards */
> +	if (type > 1) {
> +		prlog(PR_ERR, "NPU: invalid error injection type\n");
> +	} else if (type == 1) {
> +		/* Emulate fence mode. */
> +		p->fenced = true;
> +	} else {
> +		/* Cause a freeze with an invalid MMIO write. */
> +		in_be64((void *)dev->bar.base);
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
>  static const struct phb_ops npu_ops = {
>  	.lock = npu_lock,
>  	.unlock = npu_unlock,
> @@ -1055,7 +1084,7 @@ static const struct phb_ops npu_ops = {
>  	.eeh_freeze_clear = NULL,
>  	.eeh_freeze_set = NULL,
>  	.next_error = NULL,
> -	.err_inject = NULL,
> +	.err_inject = npu_err_inject,
>  	.get_diag_data = NULL,
>  	.get_diag_data2 = NULL,
>  	.set_capi_mode = NULL,
>
diff --git a/hw/npu.c b/hw/npu.c
index 9440f17..ba61d2d 100644
--- a/hw/npu.c
+++ b/hw/npu.c
@@ -1016,6 +1016,35 @@ static int64_t npu_freeze_status(struct phb *phb __unused,
 	return OPAL_SUCCESS;
 }
 
+/* Sets the NPU to trigger an error when a DMA occurs */
+static int64_t npu_err_inject(struct phb *phb, uint32_t pe_no,
+			      uint32_t type, uint32_t func __unused,
+			      uint64_t addr __unused, uint64_t mask __unused)
+{
+	struct npu *p = phb_to_npu(phb);
+	struct npu_dev *dev;
+
+	if (pe_no > NPU_NUM_OF_PES) {
+		prlog(PR_ERR, "NPU: error injection failed, bad PE given\n");
+		return OPAL_SUCCESS;
+	}
+
+	dev = &p->devices[pe_no];
+
+	/* TODO: extend this to conform to OPAL injection standards */
+	if (type > 1) {
+		prlog(PR_ERR, "NPU: invalid error injection type\n");
+	} else if (type == 1) {
+		/* Emulate fence mode. */
+		p->fenced = true;
+	} else {
+		/* Cause a freeze with an invalid MMIO write. */
+		in_be64((void *)dev->bar.base);
+	}
+
+	return OPAL_SUCCESS;
+}
+
 static const struct phb_ops npu_ops = {
 	.lock = npu_lock,
 	.unlock = npu_unlock,
@@ -1055,7 +1084,7 @@ static const struct phb_ops npu_ops = {
 	.eeh_freeze_clear = NULL,
 	.eeh_freeze_set = NULL,
 	.next_error = NULL,
-	.err_inject = NULL,
+	.err_inject = npu_err_inject,
 	.get_diag_data = NULL,
 	.get_diag_data2 = NULL,
 	.set_capi_mode = NULL,
Enable NPU freeze and fence injection through debugfs.

For example, if a NPU is PCI bus 8, a freeze on PE 0 can be injected with:

echo 0:0:0:0:0 >> /sys/kernel/debug/powerpc/PCI0008/err_injct

or a fence on PE 2 on PCI bus 9 with:

echo 2:1:0:0:0 >> /sys/kernel/debug/powerpc/PCI0009/err_injct

These will cause the appropriate EEH event to occur upon a DMA to the
NVLink.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 hw/npu.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)