diff mbox series

PCI: vmd: Delay interrupt handling on MTL VMD controller

Message ID 20240903025544.286223-1-kai.heng.feng@canonical.com
State New
Headers show
Series PCI: vmd: Delay interrupt handling on MTL VMD controller | expand

Commit Message

Kai-Heng Feng Sept. 3, 2024, 2:55 a.m. UTC
Meteor Lake VMD has a bug that the IRQ raises before the DMA region is
ready, so the requested IO is considered never completed:
[   97.343423] nvme nvme0: I/O 259 QID 2 timeout, completion polled
[   97.343446] nvme nvme0: I/O 384 QID 3 timeout, completion polled
[   97.343459] nvme nvme0: I/O 320 QID 4 timeout, completion polled
[   97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion polled

The is documented as erratum MTL016 [0]. The suggested workaround is to
"The VMD MSI interrupt-handler should initially perform a dummy register
read to the MSI initiator device prior to any writes to ensure proper
PCIe ordering." which essentially is adding a delay before the interrupt
handling.

Hence add a delay before handle interrupt to workaround the erratum.

[0] https://edc.intel.com/content/www/us/en/design/products/platforms/details/meteor-lake-u-p/core-ultra-processor-specification-update/errata-details/#MTL016

Link: https://bugzilla.kernel.org/show_bug.cgi?id=217871
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
 drivers/pci/controller/vmd.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

Comments

Manivannan Sadhasivam Sept. 3, 2024, 4:28 a.m. UTC | #1
On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng wrote:
> Meteor Lake VMD has a bug that the IRQ raises before the DMA region is
> ready, so the requested IO is considered never completed:
> [   97.343423] nvme nvme0: I/O 259 QID 2 timeout, completion polled
> [   97.343446] nvme nvme0: I/O 384 QID 3 timeout, completion polled
> [   97.343459] nvme nvme0: I/O 320 QID 4 timeout, completion polled
> [   97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion polled
> 
> The is documented as erratum MTL016 [0]. The suggested workaround is to
> "The VMD MSI interrupt-handler should initially perform a dummy register
> read to the MSI initiator device prior to any writes to ensure proper
> PCIe ordering." which essentially is adding a delay before the interrupt
> handling.
> 

Why can't you add a dummy register read instead? Adding a delay for PCIe
ordering is not going to work always.

> Hence add a delay before handle interrupt to workaround the erratum.
> 
> [0] https://edc.intel.com/content/www/us/en/design/products/platforms/details/meteor-lake-u-p/core-ultra-processor-specification-update/errata-details/#MTL016
> 
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=217871
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
>  drivers/pci/controller/vmd.c | 18 ++++++++++++++++--
>  1 file changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
> index a726de0af011..3433b3730f9c 100644
> --- a/drivers/pci/controller/vmd.c
> +++ b/drivers/pci/controller/vmd.c
> @@ -16,6 +16,7 @@
>  #include <linux/srcu.h>
>  #include <linux/rculist.h>
>  #include <linux/rcupdate.h>
> +#include <linux/delay.h>
>  
>  #include <asm/irqdomain.h>
>  
> @@ -74,6 +75,9 @@ enum vmd_features {
>  	 * proper power management of the SoC.
>  	 */
>  	VMD_FEAT_BIOS_PM_QUIRK		= (1 << 5),
> +
> +	/* Erratum MTL016 */
> +	VMD_FEAT_INTERRUPT_QUIRK	= (1 << 6),
>  };
>  
>  #define VMD_BIOS_PM_QUIRK_LTR	0x1003	/* 3145728 ns */
> @@ -90,6 +94,8 @@ static DEFINE_IDA(vmd_instance_ida);
>   */
>  static DEFINE_RAW_SPINLOCK(list_lock);
>  
> +static bool interrupt_delay;
> +
>  /**
>   * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
>   * @node:	list item for parent traversal.
> @@ -105,6 +111,7 @@ struct vmd_irq {
>  	struct vmd_irq_list	*irq;
>  	bool			enabled;
>  	unsigned int		virq;
> +	bool			delay_irq;

This is unused. Perhaps you wanted to use this instead of interrupt_delay?

- Mani

>  };
>  
>  /**
> @@ -680,8 +687,11 @@ static irqreturn_t vmd_irq(int irq, void *data)
>  	int idx;
>  
>  	idx = srcu_read_lock(&irqs->srcu);
> -	list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
> +	list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) {
> +		if (interrupt_delay)
> +			udelay(4);
>  		generic_handle_irq(vmdirq->virq);
> +	}
>  	srcu_read_unlock(&irqs->srcu, idx);
>  
>  	return IRQ_HANDLED;
> @@ -1015,6 +1025,9 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>  	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
>  		vmd->first_vec = 1;
>  
> +	if (features & VMD_FEAT_INTERRUPT_QUIRK)
> +		interrupt_delay = true;
> +
>  	spin_lock_init(&vmd->cfg_lock);
>  	pci_set_drvdata(dev, vmd);
>  	err = vmd_enable_domain(vmd, features);
> @@ -1106,7 +1119,8 @@ static const struct pci_device_id vmd_ids[] = {
>  	{PCI_VDEVICE(INTEL, 0xa77f),
>  		.driver_data = VMD_FEATS_CLIENT,},
>  	{PCI_VDEVICE(INTEL, 0x7d0b),
> -		.driver_data = VMD_FEATS_CLIENT,},
> +		.driver_data = VMD_FEATS_CLIENT |
> +			       VMD_FEAT_INTERRUPT_QUIRK,},
>  	{PCI_VDEVICE(INTEL, 0xad0b),
>  		.driver_data = VMD_FEATS_CLIENT,},
>  	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B),
> -- 
> 2.43.0
>
Kai-Heng Feng Sept. 3, 2024, 7:07 a.m. UTC | #2
On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
<manivannan.sadhasivam@linaro.org> wrote:
>
> On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng wrote:
> > Meteor Lake VMD has a bug that the IRQ raises before the DMA region is
> > ready, so the requested IO is considered never completed:
> > [   97.343423] nvme nvme0: I/O 259 QID 2 timeout, completion polled
> > [   97.343446] nvme nvme0: I/O 384 QID 3 timeout, completion polled
> > [   97.343459] nvme nvme0: I/O 320 QID 4 timeout, completion polled
> > [   97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion polled
> >
> > The is documented as erratum MTL016 [0]. The suggested workaround is to
> > "The VMD MSI interrupt-handler should initially perform a dummy register
> > read to the MSI initiator device prior to any writes to ensure proper
> > PCIe ordering." which essentially is adding a delay before the interrupt
> > handling.
> >
>
> Why can't you add a dummy register read instead? Adding a delay for PCIe
> ordering is not going to work always.

This can be done too. But it can take longer than 4us delay, so I'd
like to keep it a bit faster here.

>
> > Hence add a delay before handle interrupt to workaround the erratum.
> >
> > [0] https://edc.intel.com/content/www/us/en/design/products/platforms/details/meteor-lake-u-p/core-ultra-processor-specification-update/errata-details/#MTL016
> >
> > Link: https://bugzilla.kernel.org/show_bug.cgi?id=217871
> > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> > ---
> >  drivers/pci/controller/vmd.c | 18 ++++++++++++++++--
> >  1 file changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
> > index a726de0af011..3433b3730f9c 100644
> > --- a/drivers/pci/controller/vmd.c
> > +++ b/drivers/pci/controller/vmd.c
> > @@ -16,6 +16,7 @@
> >  #include <linux/srcu.h>
> >  #include <linux/rculist.h>
> >  #include <linux/rcupdate.h>
> > +#include <linux/delay.h>
> >
> >  #include <asm/irqdomain.h>
> >
> > @@ -74,6 +75,9 @@ enum vmd_features {
> >        * proper power management of the SoC.
> >        */
> >       VMD_FEAT_BIOS_PM_QUIRK          = (1 << 5),
> > +
> > +     /* Erratum MTL016 */
> > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> >  };
> >
> >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728 ns */
> > @@ -90,6 +94,8 @@ static DEFINE_IDA(vmd_instance_ida);
> >   */
> >  static DEFINE_RAW_SPINLOCK(list_lock);
> >
> > +static bool interrupt_delay;
> > +
> >  /**
> >   * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
> >   * @node:    list item for parent traversal.
> > @@ -105,6 +111,7 @@ struct vmd_irq {
> >       struct vmd_irq_list     *irq;
> >       bool                    enabled;
> >       unsigned int            virq;
> > +     bool                    delay_irq;
>
> This is unused. Perhaps you wanted to use this instead of interrupt_delay?

This is leftover, will scratch this.

Kai-Heng

>
> - Mani
>
> >  };
> >
> >  /**
> > @@ -680,8 +687,11 @@ static irqreturn_t vmd_irq(int irq, void *data)
> >       int idx;
> >
> >       idx = srcu_read_lock(&irqs->srcu);
> > -     list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
> > +     list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) {
> > +             if (interrupt_delay)
> > +                     udelay(4);
> >               generic_handle_irq(vmdirq->virq);
> > +     }
> >       srcu_read_unlock(&irqs->srcu, idx);
> >
> >       return IRQ_HANDLED;
> > @@ -1015,6 +1025,9 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
> >       if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
> >               vmd->first_vec = 1;
> >
> > +     if (features & VMD_FEAT_INTERRUPT_QUIRK)
> > +             interrupt_delay = true;
> > +
> >       spin_lock_init(&vmd->cfg_lock);
> >       pci_set_drvdata(dev, vmd);
> >       err = vmd_enable_domain(vmd, features);
> > @@ -1106,7 +1119,8 @@ static const struct pci_device_id vmd_ids[] = {
> >       {PCI_VDEVICE(INTEL, 0xa77f),
> >               .driver_data = VMD_FEATS_CLIENT,},
> >       {PCI_VDEVICE(INTEL, 0x7d0b),
> > -             .driver_data = VMD_FEATS_CLIENT,},
> > +             .driver_data = VMD_FEATS_CLIENT |
> > +                            VMD_FEAT_INTERRUPT_QUIRK,},
> >       {PCI_VDEVICE(INTEL, 0xad0b),
> >               .driver_data = VMD_FEATS_CLIENT,},
> >       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B),
> > --
> > 2.43.0
> >
>
> --
> மணிவண்ணன் சதாசிவம்
Manivannan Sadhasivam Sept. 3, 2024, 8 a.m. UTC | #3
On Tue, Sep 03, 2024 at 03:07:45PM +0800, Kai-Heng Feng wrote:
> On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> <manivannan.sadhasivam@linaro.org> wrote:
> >
> > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng wrote:
> > > Meteor Lake VMD has a bug that the IRQ raises before the DMA region is
> > > ready, so the requested IO is considered never completed:
> > > [   97.343423] nvme nvme0: I/O 259 QID 2 timeout, completion polled
> > > [   97.343446] nvme nvme0: I/O 384 QID 3 timeout, completion polled
> > > [   97.343459] nvme nvme0: I/O 320 QID 4 timeout, completion polled
> > > [   97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion polled
> > >
> > > The is documented as erratum MTL016 [0]. The suggested workaround is to
> > > "The VMD MSI interrupt-handler should initially perform a dummy register
> > > read to the MSI initiator device prior to any writes to ensure proper
> > > PCIe ordering." which essentially is adding a delay before the interrupt
> > > handling.
> > >
> >
> > Why can't you add a dummy register read instead? Adding a delay for PCIe
> > ordering is not going to work always.
> 
> This can be done too. But it can take longer than 4us delay, so I'd
> like to keep it a bit faster here.
> 

No. As I said, this delay is not going to work all the time and highly
unreliable. Please use the register read as suggested.

- Mani

> >
> > > Hence add a delay before handle interrupt to workaround the erratum.
> > >
> > > [0] https://edc.intel.com/content/www/us/en/design/products/platforms/details/meteor-lake-u-p/core-ultra-processor-specification-update/errata-details/#MTL016
> > >
> > > Link: https://bugzilla.kernel.org/show_bug.cgi?id=217871
> > > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> > > ---
> > >  drivers/pci/controller/vmd.c | 18 ++++++++++++++++--
> > >  1 file changed, 16 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
> > > index a726de0af011..3433b3730f9c 100644
> > > --- a/drivers/pci/controller/vmd.c
> > > +++ b/drivers/pci/controller/vmd.c
> > > @@ -16,6 +16,7 @@
> > >  #include <linux/srcu.h>
> > >  #include <linux/rculist.h>
> > >  #include <linux/rcupdate.h>
> > > +#include <linux/delay.h>
> > >
> > >  #include <asm/irqdomain.h>
> > >
> > > @@ -74,6 +75,9 @@ enum vmd_features {
> > >        * proper power management of the SoC.
> > >        */
> > >       VMD_FEAT_BIOS_PM_QUIRK          = (1 << 5),
> > > +
> > > +     /* Erratum MTL016 */
> > > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> > >  };
> > >
> > >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728 ns */
> > > @@ -90,6 +94,8 @@ static DEFINE_IDA(vmd_instance_ida);
> > >   */
> > >  static DEFINE_RAW_SPINLOCK(list_lock);
> > >
> > > +static bool interrupt_delay;
> > > +
> > >  /**
> > >   * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
> > >   * @node:    list item for parent traversal.
> > > @@ -105,6 +111,7 @@ struct vmd_irq {
> > >       struct vmd_irq_list     *irq;
> > >       bool                    enabled;
> > >       unsigned int            virq;
> > > +     bool                    delay_irq;
> >
> > This is unused. Perhaps you wanted to use this instead of interrupt_delay?
> 
> This is leftover, will scratch this.
> 
> Kai-Heng
> 
> >
> > - Mani
> >
> > >  };
> > >
> > >  /**
> > > @@ -680,8 +687,11 @@ static irqreturn_t vmd_irq(int irq, void *data)
> > >       int idx;
> > >
> > >       idx = srcu_read_lock(&irqs->srcu);
> > > -     list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
> > > +     list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) {
> > > +             if (interrupt_delay)
> > > +                     udelay(4);
> > >               generic_handle_irq(vmdirq->virq);
> > > +     }
> > >       srcu_read_unlock(&irqs->srcu, idx);
> > >
> > >       return IRQ_HANDLED;
> > > @@ -1015,6 +1025,9 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
> > >       if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
> > >               vmd->first_vec = 1;
> > >
> > > +     if (features & VMD_FEAT_INTERRUPT_QUIRK)
> > > +             interrupt_delay = true;
> > > +
> > >       spin_lock_init(&vmd->cfg_lock);
> > >       pci_set_drvdata(dev, vmd);
> > >       err = vmd_enable_domain(vmd, features);
> > > @@ -1106,7 +1119,8 @@ static const struct pci_device_id vmd_ids[] = {
> > >       {PCI_VDEVICE(INTEL, 0xa77f),
> > >               .driver_data = VMD_FEATS_CLIENT,},
> > >       {PCI_VDEVICE(INTEL, 0x7d0b),
> > > -             .driver_data = VMD_FEATS_CLIENT,},
> > > +             .driver_data = VMD_FEATS_CLIENT |
> > > +                            VMD_FEAT_INTERRUPT_QUIRK,},
> > >       {PCI_VDEVICE(INTEL, 0xad0b),
> > >               .driver_data = VMD_FEATS_CLIENT,},
> > >       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B),
> > > --
> > > 2.43.0
> > >
> >
> > --
> > மணிவண்ணன் சதாசிவம்
Keith Busch Sept. 3, 2024, 2:51 p.m. UTC | #4
On Tue, Sep 03, 2024 at 03:07:45PM +0800, Kai-Heng Feng wrote:
> On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> <manivannan.sadhasivam@linaro.org> wrote:
> >
> > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng wrote:
> > > Meteor Lake VMD has a bug that the IRQ raises before the DMA region is
> > > ready, so the requested IO is considered never completed:
> > > [   97.343423] nvme nvme0: I/O 259 QID 2 timeout, completion polled
> > > [   97.343446] nvme nvme0: I/O 384 QID 3 timeout, completion polled
> > > [   97.343459] nvme nvme0: I/O 320 QID 4 timeout, completion polled
> > > [   97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion polled
> > >
> > > The is documented as erratum MTL016 [0]. The suggested workaround is to
> > > "The VMD MSI interrupt-handler should initially perform a dummy register
> > > read to the MSI initiator device prior to any writes to ensure proper
> > > PCIe ordering." which essentially is adding a delay before the interrupt
> > > handling.
> > >
> >
> > Why can't you add a dummy register read instead? Adding a delay for PCIe
> > ordering is not going to work always.
> 
> This can be done too. But it can take longer than 4us delay, so I'd
> like to keep it a bit faster here.

An added delay is just a side effect of the read. The read flushes
pending device-to-host writes, which is most likely what the errata
really requires. I think Mani is right, you need to pay that register
read penalty to truly fix this.

> > > +     /* Erratum MTL016 */
> > > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> > >  };
> > >
> > >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728 ns */
> > > @@ -90,6 +94,8 @@ static DEFINE_IDA(vmd_instance_ida);
> > >   */
> > >  static DEFINE_RAW_SPINLOCK(list_lock);
> > >
> > > +static bool interrupt_delay;
> > > +
> > >  /**
> > >   * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
> > >   * @node:    list item for parent traversal.
> > > @@ -105,6 +111,7 @@ struct vmd_irq {
> > >       struct vmd_irq_list     *irq;
> > >       bool                    enabled;
> > >       unsigned int            virq;
> > > +     bool                    delay_irq;
> >
> > This is unused. Perhaps you wanted to use this instead of interrupt_delay?
> 
> This is leftover, will scratch this.

Maybe you should actually use it instead of making a global? The quirk
says it is device specific, so no need to punish every device if it
doesn't need it (unlikely as it is to see such a thing).
Nirmal Patel Sept. 3, 2024, 3:29 p.m. UTC | #5
On Tue, 3 Sep 2024 15:07:45 +0800
Kai-Heng Feng <kai.heng.feng@canonical.com> wrote:

> On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> <manivannan.sadhasivam@linaro.org> wrote:
> >
> > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng wrote:  
> > > Meteor Lake VMD has a bug that the IRQ raises before the DMA
> > > region is ready, so the requested IO is considered never
> > > completed: [   97.343423] nvme nvme0: I/O 259 QID 2 timeout,
> > > completion polled [   97.343446] nvme nvme0: I/O 384 QID 3
> > > timeout, completion polled [   97.343459] nvme nvme0: I/O 320 QID
> > > 4 timeout, completion polled [   97.343470] nvme nvme0: I/O 707
> > > QID 5 timeout, completion polled
> > >
> > > The is documented as erratum MTL016 [0]. The suggested workaround
> > > is to "The VMD MSI interrupt-handler should initially perform a
> > > dummy register read to the MSI initiator device prior to any
> > > writes to ensure proper PCIe ordering." which essentially is
> > > adding a delay before the interrupt handling.
> > >  
> >
> > Why can't you add a dummy register read instead? Adding a delay for
> > PCIe ordering is not going to work always.  
> 
> This can be done too. But it can take longer than 4us delay, so I'd
> like to keep it a bit faster here.

Since the issue is due to the bug in silicon and we have limited
options. If community is okay to take some performance hit, then please
add more details about the patch above VMD_FEAT_INTERRUPT_QUIRK.

-nirmal
> 
> >  
> > > Hence add a delay before handle interrupt to workaround the
> > > erratum.
> > >
> > > [0]
> > > https://edc.intel.com/content/www/us/en/design/products/platforms/details/meteor-lake-u-p/core-ultra-processor-specification-update/errata-details/#MTL016
> > >
> > > Link: https://bugzilla.kernel.org/show_bug.cgi?id=217871
> > > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> > > ---
> > >  drivers/pci/controller/vmd.c | 18 ++++++++++++++++--
> > >  1 file changed, 16 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/pci/controller/vmd.c
> > > b/drivers/pci/controller/vmd.c index a726de0af011..3433b3730f9c
> > > 100644 --- a/drivers/pci/controller/vmd.c
> > > +++ b/drivers/pci/controller/vmd.c
> > > @@ -16,6 +16,7 @@
> > >  #include <linux/srcu.h>
> > >  #include <linux/rculist.h>
> > >  #include <linux/rcupdate.h>
> > > +#include <linux/delay.h>
> > >
> > >  #include <asm/irqdomain.h>
> > >
> > > @@ -74,6 +75,9 @@ enum vmd_features {
> > >        * proper power management of the SoC.
> > >        */
> > >       VMD_FEAT_BIOS_PM_QUIRK          = (1 << 5),
> > > +
> > > +     /* Erratum MTL016 */
> > > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> > >  };
> > >
> > >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728 ns */
> > > @@ -90,6 +94,8 @@ static DEFINE_IDA(vmd_instance_ida);
> > >   */
> > >  static DEFINE_RAW_SPINLOCK(list_lock);
> > >
> > > +static bool interrupt_delay;
> > > +
> > >  /**
> > >   * struct vmd_irq - private data to map driver IRQ to the VMD
> > > shared vector
> > >   * @node:    list item for parent traversal.
> > > @@ -105,6 +111,7 @@ struct vmd_irq {
> > >       struct vmd_irq_list     *irq;
> > >       bool                    enabled;
> > >       unsigned int            virq;
> > > +     bool                    delay_irq;  
> >
> > This is unused. Perhaps you wanted to use this instead of
> > interrupt_delay?  
> 
> This is leftover, will scratch this.
> 
> Kai-Heng
> 
> >
> > - Mani
> >  
> > >  };
> > >
> > >  /**
> > > @@ -680,8 +687,11 @@ static irqreturn_t vmd_irq(int irq, void
> > > *data) int idx;
> > >
> > >       idx = srcu_read_lock(&irqs->srcu);
> > > -     list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
> > > +     list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) {
> > > +             if (interrupt_delay)
> > > +                     udelay(4);
> > >               generic_handle_irq(vmdirq->virq);
> > > +     }
> > >       srcu_read_unlock(&irqs->srcu, idx);
> > >
> > >       return IRQ_HANDLED;
> > > @@ -1015,6 +1025,9 @@ static int vmd_probe(struct pci_dev *dev,
> > > const struct pci_device_id *id) if (features &
> > > VMD_FEAT_OFFSET_FIRST_VECTOR) vmd->first_vec = 1;
> > >
> > > +     if (features & VMD_FEAT_INTERRUPT_QUIRK)
> > > +             interrupt_delay = true;
> > > +
> > >       spin_lock_init(&vmd->cfg_lock);
> > >       pci_set_drvdata(dev, vmd);
> > >       err = vmd_enable_domain(vmd, features);
> > > @@ -1106,7 +1119,8 @@ static const struct pci_device_id vmd_ids[]
> > > = { {PCI_VDEVICE(INTEL, 0xa77f),
> > >               .driver_data = VMD_FEATS_CLIENT,},
> > >       {PCI_VDEVICE(INTEL, 0x7d0b),
> > > -             .driver_data = VMD_FEATS_CLIENT,},
> > > +             .driver_data = VMD_FEATS_CLIENT |
> > > +                            VMD_FEAT_INTERRUPT_QUIRK,},
> > >       {PCI_VDEVICE(INTEL, 0xad0b),
> > >               .driver_data = VMD_FEATS_CLIENT,},
> > >       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B),
> > > --
> > > 2.43.0
> > >  
> >
> > --
> > மணிவண்ணன் சதாசிவம்
kernel test robot Sept. 3, 2024, 4:17 p.m. UTC | #6
Hi Kai-Heng,

kernel test robot noticed the following build warnings:

[auto build test WARNING on pci/next]
[also build test WARNING on pci/for-linus linus/master v6.11-rc6 next-20240903]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Kai-Heng-Feng/PCI-vmd-Delay-interrupt-handling-on-MTL-VMD-controller/20240903-110553
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20240903025544.286223-1-kai.heng.feng%40canonical.com
patch subject: [PATCH] PCI: vmd: Delay interrupt handling on MTL VMD controller
config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20240904/202409040016.XGnUy9HW-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240904/202409040016.XGnUy9HW-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409040016.XGnUy9HW-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/pci/controller/vmd.c:115: warning: Function parameter or struct member 'delay_irq' not described in 'vmd_irq'


vim +115 drivers/pci/controller/vmd.c

7cdbc4e9cd808b5 drivers/pci/controller/vmd.c Kai-Heng Feng 2024-09-03   98  
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12   99  /**
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  100   * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  101   * @node:	list item for parent traversal.
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  102   * @irq:	back pointer to parent.
21c80c9fefc3db1 arch/x86/pci/vmd.c           Keith Busch   2016-08-23  103   * @enabled:	true if driver enabled IRQ
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  104   * @virq:	the virtual IRQ value provided to the requesting driver.
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  105   *
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  106   * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  107   * a VMD IRQ using this structure.
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  108   */
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  109  struct vmd_irq {
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  110  	struct list_head	node;
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  111  	struct vmd_irq_list	*irq;
21c80c9fefc3db1 arch/x86/pci/vmd.c           Keith Busch   2016-08-23  112  	bool			enabled;
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  113  	unsigned int		virq;
7cdbc4e9cd808b5 drivers/pci/controller/vmd.c Kai-Heng Feng 2024-09-03  114  	bool			delay_irq;
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12 @115  };
185a383ada2e779 arch/x86/pci/vmd.c           Keith Busch   2016-01-12  116
Kai-Heng Feng Sept. 4, 2024, 1:57 a.m. UTC | #7
On Tue, Sep 3, 2024 at 10:51 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Tue, Sep 03, 2024 at 03:07:45PM +0800, Kai-Heng Feng wrote:
> > On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> > <manivannan.sadhasivam@linaro.org> wrote:
> > >
> > > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng wrote:
> > > > Meteor Lake VMD has a bug that the IRQ raises before the DMA region is
> > > > ready, so the requested IO is considered never completed:
> > > > [   97.343423] nvme nvme0: I/O 259 QID 2 timeout, completion polled
> > > > [   97.343446] nvme nvme0: I/O 384 QID 3 timeout, completion polled
> > > > [   97.343459] nvme nvme0: I/O 320 QID 4 timeout, completion polled
> > > > [   97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion polled
> > > >
> > > > The is documented as erratum MTL016 [0]. The suggested workaround is to
> > > > "The VMD MSI interrupt-handler should initially perform a dummy register
> > > > read to the MSI initiator device prior to any writes to ensure proper
> > > > PCIe ordering." which essentially is adding a delay before the interrupt
> > > > handling.
> > > >
> > >
> > > Why can't you add a dummy register read instead? Adding a delay for PCIe
> > > ordering is not going to work always.
> >
> > This can be done too. But it can take longer than 4us delay, so I'd
> > like to keep it a bit faster here.
>
> An added delay is just a side effect of the read. The read flushes
> pending device-to-host writes, which is most likely what the errata
> really requires. I think Mani is right, you need to pay that register
> read penalty to truly fix this.

OK, will change the quirk to perform dummy register read.

But I am not sure which is the "MSI initiator device", is it VMD
controller or NVMe devices?

Kai-Heng

>
> > > > +     /* Erratum MTL016 */
> > > > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> > > >  };
> > > >
> > > >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728 ns */
> > > > @@ -90,6 +94,8 @@ static DEFINE_IDA(vmd_instance_ida);
> > > >   */
> > > >  static DEFINE_RAW_SPINLOCK(list_lock);
> > > >
> > > > +static bool interrupt_delay;
> > > > +
> > > >  /**
> > > >   * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
> > > >   * @node:    list item for parent traversal.
> > > > @@ -105,6 +111,7 @@ struct vmd_irq {
> > > >       struct vmd_irq_list     *irq;
> > > >       bool                    enabled;
> > > >       unsigned int            virq;
> > > > +     bool                    delay_irq;
> > >
> > > This is unused. Perhaps you wanted to use this instead of interrupt_delay?
> >
> > This is leftover, will scratch this.
>
> Maybe you should actually use it instead of making a global? The quirk
> says it is device specific, so no need to punish every device if it
> doesn't need it (unlikely as it is to see such a thing).
Manivannan Sadhasivam Sept. 4, 2024, 6:22 a.m. UTC | #8
On Wed, Sep 04, 2024 at 09:57:08AM +0800, Kai-Heng Feng wrote:
> On Tue, Sep 3, 2024 at 10:51 PM Keith Busch <kbusch@kernel.org> wrote:
> >
> > On Tue, Sep 03, 2024 at 03:07:45PM +0800, Kai-Heng Feng wrote:
> > > On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> > > <manivannan.sadhasivam@linaro.org> wrote:
> > > >
> > > > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng wrote:
> > > > > Meteor Lake VMD has a bug that the IRQ raises before the DMA region is
> > > > > ready, so the requested IO is considered never completed:
> > > > > [   97.343423] nvme nvme0: I/O 259 QID 2 timeout, completion polled
> > > > > [   97.343446] nvme nvme0: I/O 384 QID 3 timeout, completion polled
> > > > > [   97.343459] nvme nvme0: I/O 320 QID 4 timeout, completion polled
> > > > > [   97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion polled
> > > > >
> > > > > The is documented as erratum MTL016 [0]. The suggested workaround is to
> > > > > "The VMD MSI interrupt-handler should initially perform a dummy register
> > > > > read to the MSI initiator device prior to any writes to ensure proper
> > > > > PCIe ordering." which essentially is adding a delay before the interrupt
> > > > > handling.
> > > > >
> > > >
> > > > Why can't you add a dummy register read instead? Adding a delay for PCIe
> > > > ordering is not going to work always.
> > >
> > > This can be done too. But it can take longer than 4us delay, so I'd
> > > like to keep it a bit faster here.
> >
> > An added delay is just a side effect of the read. The read flushes
> > pending device-to-host writes, which is most likely what the errata
> > really requires. I think Mani is right, you need to pay that register
> > read penalty to truly fix this.
> 
> OK, will change the quirk to perform dummy register read.
> 
> But I am not sure which is the "MSI initiator device", is it VMD
> controller or NVMe devices?
> 

'MSI initiator' should be the NVMe device. My understanding is that the
workaround suggests reading the NVMe register from the MSI handler before doing
any write to the device to ensures that the previous writes from the device are
flushed.

And this sounds like the workaround should be done in the NVMe driver as it has
the knowledge of the NVMe registers. But isn't the NVMe driver already reading
CQE status first up in the ISR?

- Mani

> Kai-Heng
> 
> >
> > > > > +     /* Erratum MTL016 */
> > > > > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> > > > >  };
> > > > >
> > > > >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728 ns */
> > > > > @@ -90,6 +94,8 @@ static DEFINE_IDA(vmd_instance_ida);
> > > > >   */
> > > > >  static DEFINE_RAW_SPINLOCK(list_lock);
> > > > >
> > > > > +static bool interrupt_delay;
> > > > > +
> > > > >  /**
> > > > >   * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
> > > > >   * @node:    list item for parent traversal.
> > > > > @@ -105,6 +111,7 @@ struct vmd_irq {
> > > > >       struct vmd_irq_list     *irq;
> > > > >       bool                    enabled;
> > > > >       unsigned int            virq;
> > > > > +     bool                    delay_irq;
> > > >
> > > > This is unused. Perhaps you wanted to use this instead of interrupt_delay?
> > >
> > > This is leftover, will scratch this.
> >
> > Maybe you should actually use it instead of making a global? The quirk
> > says it is device specific, so no need to punish every device if it
> > doesn't need it (unlikely as it is to see such a thing).
Kai-Heng Feng Sept. 6, 2024, 1:56 a.m. UTC | #9
On Wed, Sep 4, 2024 at 2:22 PM Manivannan Sadhasivam
<manivannan.sadhasivam@linaro.org> wrote:
>
> On Wed, Sep 04, 2024 at 09:57:08AM +0800, Kai-Heng Feng wrote:
> > On Tue, Sep 3, 2024 at 10:51 PM Keith Busch <kbusch@kernel.org> wrote:
> > >
> > > On Tue, Sep 03, 2024 at 03:07:45PM +0800, Kai-Heng Feng wrote:
> > > > On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> > > > <manivannan.sadhasivam@linaro.org> wrote:
> > > > >
> > > > > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng wrote:
> > > > > > Meteor Lake VMD has a bug that the IRQ raises before the DMA region is
> > > > > > ready, so the requested IO is considered never completed:
> > > > > > [   97.343423] nvme nvme0: I/O 259 QID 2 timeout, completion polled
> > > > > > [   97.343446] nvme nvme0: I/O 384 QID 3 timeout, completion polled
> > > > > > [   97.343459] nvme nvme0: I/O 320 QID 4 timeout, completion polled
> > > > > > [   97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion polled
> > > > > >
> > > > > > The is documented as erratum MTL016 [0]. The suggested workaround is to
> > > > > > "The VMD MSI interrupt-handler should initially perform a dummy register
> > > > > > read to the MSI initiator device prior to any writes to ensure proper
> > > > > > PCIe ordering." which essentially is adding a delay before the interrupt
> > > > > > handling.
> > > > > >
> > > > >
> > > > > Why can't you add a dummy register read instead? Adding a delay for PCIe
> > > > > ordering is not going to work always.
> > > >
> > > > This can be done too. But it can take longer than 4us delay, so I'd
> > > > like to keep it a bit faster here.
> > >
> > > An added delay is just a side effect of the read. The read flushes
> > > pending device-to-host writes, which is most likely what the errata
> > > really requires. I think Mani is right, you need to pay that register
> > > read penalty to truly fix this.
> >
> > OK, will change the quirk to perform dummy register read.
> >
> > But I am not sure which is the "MSI initiator device", is it VMD
> > controller or NVMe devices?
> >
>
> 'MSI initiator' should be the NVMe device. My understanding is that the
> workaround suggests reading the NVMe register from the MSI handler before doing
> any write to the device to ensures that the previous writes from the device are
> flushed.

Hmm, it would be really great to contain the quirk in VMD controller.
Is there anyway to do that right before generic_handle_irq()?

>
> And this sounds like the workaround should be done in the NVMe driver as it has
> the knowledge of the NVMe registers. But isn't the NVMe driver already reading
> CQE status first up in the ISR?

The VMD interrupt is fired before the CQE status update, hence the bug.

Kai-Heng

>
> - Mani
>
> > Kai-Heng
> >
> > >
> > > > > > +     /* Erratum MTL016 */
> > > > > > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> > > > > >  };
> > > > > >
> > > > > >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728 ns */
> > > > > > @@ -90,6 +94,8 @@ static DEFINE_IDA(vmd_instance_ida);
> > > > > >   */
> > > > > >  static DEFINE_RAW_SPINLOCK(list_lock);
> > > > > >
> > > > > > +static bool interrupt_delay;
> > > > > > +
> > > > > >  /**
> > > > > >   * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
> > > > > >   * @node:    list item for parent traversal.
> > > > > > @@ -105,6 +111,7 @@ struct vmd_irq {
> > > > > >       struct vmd_irq_list     *irq;
> > > > > >       bool                    enabled;
> > > > > >       unsigned int            virq;
> > > > > > +     bool                    delay_irq;
> > > > >
> > > > > This is unused. Perhaps you wanted to use this instead of interrupt_delay?
> > > >
> > > > This is leftover, will scratch this.
> > >
> > > Maybe you should actually use it instead of making a global? The quirk
> > > says it is device specific, so no need to punish every device if it
> > > doesn't need it (unlikely as it is to see such a thing).
>
> --
> மணிவண்ணன் சதாசிவம்
Nirmal Patel Sept. 12, 2024, 5:45 p.m. UTC | #10
On Fri, 6 Sep 2024 09:56:59 +0800
Kai-Heng Feng <kai.heng.feng@canonical.com> wrote:

> On Wed, Sep 4, 2024 at 2:22 PM Manivannan Sadhasivam
> <manivannan.sadhasivam@linaro.org> wrote:
> >
> > On Wed, Sep 04, 2024 at 09:57:08AM +0800, Kai-Heng Feng wrote:  
> > > On Tue, Sep 3, 2024 at 10:51 PM Keith Busch <kbusch@kernel.org>
> > > wrote:  
> > > >
> > > > On Tue, Sep 03, 2024 at 03:07:45PM +0800, Kai-Heng Feng wrote:  
> > > > > On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> > > > > <manivannan.sadhasivam@linaro.org> wrote:  
> > > > > >
> > > > > > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng
> > > > > > wrote:  
> > > > > > > Meteor Lake VMD has a bug that the IRQ raises before the
> > > > > > > DMA region is ready, so the requested IO is considered
> > > > > > > never completed: [   97.343423] nvme nvme0: I/O 259 QID 2
> > > > > > > timeout, completion polled [   97.343446] nvme nvme0: I/O
> > > > > > > 384 QID 3 timeout, completion polled [   97.343459] nvme
> > > > > > > nvme0: I/O 320 QID 4 timeout, completion polled [
> > > > > > > 97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion
> > > > > > > polled
> > > > > > >
> > > > > > > The is documented as erratum MTL016 [0]. The suggested
> > > > > > > workaround is to "The VMD MSI interrupt-handler should
> > > > > > > initially perform a dummy register read to the MSI
> > > > > > > initiator device prior to any writes to ensure proper
> > > > > > > PCIe ordering." which essentially is adding a delay
> > > > > > > before the interrupt handling. 
> > > > > >
> > > > > > Why can't you add a dummy register read instead? Adding a
> > > > > > delay for PCIe ordering is not going to work always.  
> > > > >
> > > > > This can be done too. But it can take longer than 4us delay,
> > > > > so I'd like to keep it a bit faster here.  
> > > >
> > > > An added delay is just a side effect of the read. The read
> > > > flushes pending device-to-host writes, which is most likely
> > > > what the errata really requires. I think Mani is right, you
> > > > need to pay that register read penalty to truly fix this.  
> > >
> > > OK, will change the quirk to perform dummy register read.
> > >
> > > But I am not sure which is the "MSI initiator device", is it VMD
> > > controller or NVMe devices?
> > >  
> >
> > 'MSI initiator' should be the NVMe device. My understanding is that
> > the workaround suggests reading the NVMe register from the MSI
> > handler before doing any write to the device to ensures that the
> > previous writes from the device are flushed.  
> 
> Hmm, it would be really great to contain the quirk in VMD controller.
> Is there anyway to do that right before generic_handle_irq()?
> 
The bug is in hardware, I agree with Kai-Heng to contain it to VMD
controller.
 
> >
> > And this sounds like the workaround should be done in the NVMe
> > driver as it has the knowledge of the NVMe registers. But isn't the
> > NVMe driver already reading CQE status first up in the ISR?  
> 
> The VMD interrupt is fired before the CQE status update, hence the
> bug.
> 
> Kai-Heng
> 
> >
> > - Mani
> >  
> > > Kai-Heng
> > >  
> > > >  
> > > > > > > +     /* Erratum MTL016 */
> > > > > > > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> > > > > > >  };
> > > > > > >
> > > > > > >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728
> > > > > > > ns */ @@ -90,6 +94,8 @@ static
> > > > > > > DEFINE_IDA(vmd_instance_ida); */
> > > > > > >  static DEFINE_RAW_SPINLOCK(list_lock);
> > > > > > >
> > > > > > > +static bool interrupt_delay;
> > > > > > > +
> > > > > > >  /**
> > > > > > >   * struct vmd_irq - private data to map driver IRQ to
> > > > > > > the VMD shared vector
> > > > > > >   * @node:    list item for parent traversal.
> > > > > > > @@ -105,6 +111,7 @@ struct vmd_irq {
> > > > > > >       struct vmd_irq_list     *irq;
> > > > > > >       bool                    enabled;
> > > > > > >       unsigned int            virq;
> > > > > > > +     bool                    delay_irq;  
> > > > > >
> > > > > > This is unused. Perhaps you wanted to use this instead of
> > > > > > interrupt_delay?  
> > > > >
> > > > > This is leftover, will scratch this.  
> > > >
> > > > Maybe you should actually use it instead of making a global?
> > > > The quirk says it is device specific, so no need to punish
> > > > every device if it doesn't need it (unlikely as it is to see
> > > > such a thing).  
> >
> > --
> > மணிவண்ணன் சதாசிவம்
Kai-Heng Feng Sept. 13, 2024, 5:55 a.m. UTC | #11
Hi Nirmal,

On Fri, Sep 13, 2024 at 1:45 AM Nirmal Patel
<nirmal.patel@linux.intel.com> wrote:
>
> On Fri, 6 Sep 2024 09:56:59 +0800
> Kai-Heng Feng <kai.heng.feng@canonical.com> wrote:
>
> > On Wed, Sep 4, 2024 at 2:22 PM Manivannan Sadhasivam
> > <manivannan.sadhasivam@linaro.org> wrote:
> > >
> > > On Wed, Sep 04, 2024 at 09:57:08AM +0800, Kai-Heng Feng wrote:
> > > > On Tue, Sep 3, 2024 at 10:51 PM Keith Busch <kbusch@kernel.org>
> > > > wrote:
> > > > >
> > > > > On Tue, Sep 03, 2024 at 03:07:45PM +0800, Kai-Heng Feng wrote:
> > > > > > On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> > > > > > <manivannan.sadhasivam@linaro.org> wrote:
> > > > > > >
> > > > > > > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng
> > > > > > > wrote:
> > > > > > > > Meteor Lake VMD has a bug that the IRQ raises before the
> > > > > > > > DMA region is ready, so the requested IO is considered
> > > > > > > > never completed: [   97.343423] nvme nvme0: I/O 259 QID 2
> > > > > > > > timeout, completion polled [   97.343446] nvme nvme0: I/O
> > > > > > > > 384 QID 3 timeout, completion polled [   97.343459] nvme
> > > > > > > > nvme0: I/O 320 QID 4 timeout, completion polled [
> > > > > > > > 97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion
> > > > > > > > polled
> > > > > > > >
> > > > > > > > The is documented as erratum MTL016 [0]. The suggested
> > > > > > > > workaround is to "The VMD MSI interrupt-handler should
> > > > > > > > initially perform a dummy register read to the MSI
> > > > > > > > initiator device prior to any writes to ensure proper
> > > > > > > > PCIe ordering." which essentially is adding a delay
> > > > > > > > before the interrupt handling.
> > > > > > >
> > > > > > > Why can't you add a dummy register read instead? Adding a
> > > > > > > delay for PCIe ordering is not going to work always.
> > > > > >
> > > > > > This can be done too. But it can take longer than 4us delay,
> > > > > > so I'd like to keep it a bit faster here.
> > > > >
> > > > > An added delay is just a side effect of the read. The read
> > > > > flushes pending device-to-host writes, which is most likely
> > > > > what the errata really requires. I think Mani is right, you
> > > > > need to pay that register read penalty to truly fix this.
> > > >
> > > > OK, will change the quirk to perform dummy register read.
> > > >
> > > > But I am not sure which is the "MSI initiator device", is it VMD
> > > > controller or NVMe devices?
> > > >
> > >
> > > 'MSI initiator' should be the NVMe device. My understanding is that
> > > the workaround suggests reading the NVMe register from the MSI
> > > handler before doing any write to the device to ensures that the
> > > previous writes from the device are flushed.
> >
> > Hmm, it would be really great to contain the quirk in VMD controller.
> > Is there anyway to do that right before generic_handle_irq()?
> >
> The bug is in hardware, I agree with Kai-Heng to contain it to VMD
> controller.

The problem I am facing right now is that I can't connect the virq to
NVMe's struct device to implement the quirk.

Do you have any idea how to achieve that?

Kai-Heng

>
> > >
> > > And this sounds like the workaround should be done in the NVMe
> > > driver as it has the knowledge of the NVMe registers. But isn't the
> > > NVMe driver already reading CQE status first up in the ISR?
> >
> > The VMD interrupt is fired before the CQE status update, hence the
> > bug.
> >
> > Kai-Heng
> >
> > >
> > > - Mani
> > >
> > > > Kai-Heng
> > > >
> > > > >
> > > > > > > > +     /* Erratum MTL016 */
> > > > > > > > +     VMD_FEAT_INTERRUPT_QUIRK        = (1 << 6),
> > > > > > > >  };
> > > > > > > >
> > > > > > > >  #define VMD_BIOS_PM_QUIRK_LTR        0x1003  /* 3145728
> > > > > > > > ns */ @@ -90,6 +94,8 @@ static
> > > > > > > > DEFINE_IDA(vmd_instance_ida); */
> > > > > > > >  static DEFINE_RAW_SPINLOCK(list_lock);
> > > > > > > >
> > > > > > > > +static bool interrupt_delay;
> > > > > > > > +
> > > > > > > >  /**
> > > > > > > >   * struct vmd_irq - private data to map driver IRQ to
> > > > > > > > the VMD shared vector
> > > > > > > >   * @node:    list item for parent traversal.
> > > > > > > > @@ -105,6 +111,7 @@ struct vmd_irq {
> > > > > > > >       struct vmd_irq_list     *irq;
> > > > > > > >       bool                    enabled;
> > > > > > > >       unsigned int            virq;
> > > > > > > > +     bool                    delay_irq;
> > > > > > >
> > > > > > > This is unused. Perhaps you wanted to use this instead of
> > > > > > > interrupt_delay?
> > > > > >
> > > > > > This is leftover, will scratch this.
> > > > >
> > > > > Maybe you should actually use it instead of making a global?
> > > > > The quirk says it is device specific, so no need to punish
> > > > > every device if it doesn't need it (unlikely as it is to see
> > > > > such a thing).
> > >
> > > --
> > > மணிவண்ணன் சதாசிவம்
>
Manivannan Sadhasivam Sept. 13, 2024, 11:11 a.m. UTC | #12
On Fri, Sep 13, 2024 at 01:55:49PM +0800, Kai-Heng Feng wrote:
> Hi Nirmal,
> 
> On Fri, Sep 13, 2024 at 1:45 AM Nirmal Patel
> <nirmal.patel@linux.intel.com> wrote:
> >
> > On Fri, 6 Sep 2024 09:56:59 +0800
> > Kai-Heng Feng <kai.heng.feng@canonical.com> wrote:
> >
> > > On Wed, Sep 4, 2024 at 2:22 PM Manivannan Sadhasivam
> > > <manivannan.sadhasivam@linaro.org> wrote:
> > > >
> > > > On Wed, Sep 04, 2024 at 09:57:08AM +0800, Kai-Heng Feng wrote:
> > > > > On Tue, Sep 3, 2024 at 10:51 PM Keith Busch <kbusch@kernel.org>
> > > > > wrote:
> > > > > >
> > > > > > On Tue, Sep 03, 2024 at 03:07:45PM +0800, Kai-Heng Feng wrote:
> > > > > > > On Tue, Sep 3, 2024 at 12:29 PM Manivannan Sadhasivam
> > > > > > > <manivannan.sadhasivam@linaro.org> wrote:
> > > > > > > >
> > > > > > > > On Tue, Sep 03, 2024 at 10:55:44AM +0800, Kai-Heng Feng
> > > > > > > > wrote:
> > > > > > > > > Meteor Lake VMD has a bug that the IRQ raises before the
> > > > > > > > > DMA region is ready, so the requested IO is considered
> > > > > > > > > never completed: [   97.343423] nvme nvme0: I/O 259 QID 2
> > > > > > > > > timeout, completion polled [   97.343446] nvme nvme0: I/O
> > > > > > > > > 384 QID 3 timeout, completion polled [   97.343459] nvme
> > > > > > > > > nvme0: I/O 320 QID 4 timeout, completion polled [
> > > > > > > > > 97.343470] nvme nvme0: I/O 707 QID 5 timeout, completion
> > > > > > > > > polled
> > > > > > > > >
> > > > > > > > > The is documented as erratum MTL016 [0]. The suggested
> > > > > > > > > workaround is to "The VMD MSI interrupt-handler should
> > > > > > > > > initially perform a dummy register read to the MSI
> > > > > > > > > initiator device prior to any writes to ensure proper
> > > > > > > > > PCIe ordering." which essentially is adding a delay
> > > > > > > > > before the interrupt handling.
> > > > > > > >
> > > > > > > > Why can't you add a dummy register read instead? Adding a
> > > > > > > > delay for PCIe ordering is not going to work always.
> > > > > > >
> > > > > > > This can be done too. But it can take longer than 4us delay,
> > > > > > > so I'd like to keep it a bit faster here.
> > > > > >
> > > > > > An added delay is just a side effect of the read. The read
> > > > > > flushes pending device-to-host writes, which is most likely
> > > > > > what the errata really requires. I think Mani is right, you
> > > > > > need to pay that register read penalty to truly fix this.
> > > > >
> > > > > OK, will change the quirk to perform dummy register read.
> > > > >
> > > > > But I am not sure which is the "MSI initiator device", is it VMD
> > > > > controller or NVMe devices?
> > > > >
> > > >
> > > > 'MSI initiator' should be the NVMe device. My understanding is that
> > > > the workaround suggests reading the NVMe register from the MSI
> > > > handler before doing any write to the device to ensures that the
> > > > previous writes from the device are flushed.
> > >
> > > Hmm, it would be really great to contain the quirk in VMD controller.
> > > Is there anyway to do that right before generic_handle_irq()?
> > >
> > The bug is in hardware, I agree with Kai-Heng to contain it to VMD
> > controller.
> 

I'd love to, but if I read the workaround correctly, it suggests reading the
register of the MSI initiator device, which is NVMe. IDK, how you can read the
NVMe register from the VMD driver.

> The problem I am facing right now is that I can't connect the virq to
> NVMe's struct device to implement the quirk.
> 
> Do you have any idea how to achieve that?
> 
> Kai-Heng
> 
> >
> > > >
> > > > And this sounds like the workaround should be done in the NVMe
> > > > driver as it has the knowledge of the NVMe registers. But isn't the
> > > > NVMe driver already reading CQE status first up in the ISR?
> > >
> > > The VMD interrupt is fired before the CQE status update, hence the
> > > bug.

I'm not able to understand the bug properly. The erratum indicates that the MSI
from device reaches the VMD before other writes to the registers. So this is an
ordering issue as MSI takes precedence over other writes from the device.

So the workaround is to read the device register in the MSI handler to make sure
the previous writes from the device are flushed. IIUC, once the MSI reaches the
VMD, it will trigger the IRQ handler in the NVMe driver and in the handler, CQE
status register is read first up. This flow matches with the workaround
suggested.

Is any write being performed to the NVMe device before reading any register in
the MSI handler? Or the current CQE read is not able to satisfy the workaround?
Please clarify.

- Mani
Keith Busch Sept. 13, 2024, 3:24 p.m. UTC | #13
On Fri, Sep 13, 2024 at 04:41:42PM +0530, Manivannan Sadhasivam wrote:
> I'm not able to understand the bug properly. The erratum indicates that the MSI
> from device reaches the VMD before other writes to the registers. So this is an
> ordering issue as MSI takes precedence over other writes from the device.
> 
> So the workaround is to read the device register in the MSI handler to make sure
> the previous writes from the device are flushed. IIUC, once the MSI reaches the
> VMD, it will trigger the IRQ handler in the NVMe driver and in the handler, CQE
> status register is read first up. This flow matches with the workaround
> suggested.
> 
> Is any write being performed to the NVMe device before reading any register in
> the MSI handler? Or the current CQE read is not able to satisfy the workaround?
> Please clarify.

The CQE is not a device register. It exists in host memory, so reading
that from the driver isn't going to flush writes from IO devices.
Manivannan Sadhasivam Sept. 13, 2024, 4:14 p.m. UTC | #14
On Fri, Sep 13, 2024 at 09:24:29AM -0600, Keith Busch wrote:
> On Fri, Sep 13, 2024 at 04:41:42PM +0530, Manivannan Sadhasivam wrote:
> > I'm not able to understand the bug properly. The erratum indicates that the MSI
> > from device reaches the VMD before other writes to the registers. So this is an
> > ordering issue as MSI takes precedence over other writes from the device.
> > 
> > So the workaround is to read the device register in the MSI handler to make sure
> > the previous writes from the device are flushed. IIUC, once the MSI reaches the
> > VMD, it will trigger the IRQ handler in the NVMe driver and in the handler, CQE
> > status register is read first up. This flow matches with the workaround
> > suggested.
> > 
> > Is any write being performed to the NVMe device before reading any register in
> > the MSI handler? Or the current CQE read is not able to satisfy the workaround?
> > Please clarify.
> 
> The CQE is not a device register. It exists in host memory, so reading
> that from the driver isn't going to flush writes from IO devices.

Ah okay, it makes sense now. Thanks for clarifying.

For the workaround, does it make sense to have a platform specific quirk in the
NVMe driver? Because, reading the NVMe device register from VMD driver doesn't
look plausible to me.

- Mani
Keith Busch Sept. 13, 2024, 4:34 p.m. UTC | #15
On Fri, Sep 13, 2024 at 09:44:47PM +0530, Manivannan Sadhasivam wrote:
> 
> For the workaround, does it make sense to have a platform specific quirk in the
> NVMe driver? Because, reading the NVMe device register from VMD driver doesn't
> look plausible to me.

I don't know.

Does the breakage really need a register read on the device that
dispatched the original MSI? The VMD driver has no idea which device
sent it. If you really need to query every single device in the domain,
your performance is going to tank. It's bad enough issuing just a single
read in the IRQ handler, but to all of them?
diff mbox series

Patch

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index a726de0af011..3433b3730f9c 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -16,6 +16,7 @@ 
 #include <linux/srcu.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
+#include <linux/delay.h>
 
 #include <asm/irqdomain.h>
 
@@ -74,6 +75,9 @@  enum vmd_features {
 	 * proper power management of the SoC.
 	 */
 	VMD_FEAT_BIOS_PM_QUIRK		= (1 << 5),
+
+	/* Erratum MTL016 */
+	VMD_FEAT_INTERRUPT_QUIRK	= (1 << 6),
 };
 
 #define VMD_BIOS_PM_QUIRK_LTR	0x1003	/* 3145728 ns */
@@ -90,6 +94,8 @@  static DEFINE_IDA(vmd_instance_ida);
  */
 static DEFINE_RAW_SPINLOCK(list_lock);
 
+static bool interrupt_delay;
+
 /**
  * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
  * @node:	list item for parent traversal.
@@ -105,6 +111,7 @@  struct vmd_irq {
 	struct vmd_irq_list	*irq;
 	bool			enabled;
 	unsigned int		virq;
+	bool			delay_irq;
 };
 
 /**
@@ -680,8 +687,11 @@  static irqreturn_t vmd_irq(int irq, void *data)
 	int idx;
 
 	idx = srcu_read_lock(&irqs->srcu);
-	list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
+	list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) {
+		if (interrupt_delay)
+			udelay(4);
 		generic_handle_irq(vmdirq->virq);
+	}
 	srcu_read_unlock(&irqs->srcu, idx);
 
 	return IRQ_HANDLED;
@@ -1015,6 +1025,9 @@  static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
 		vmd->first_vec = 1;
 
+	if (features & VMD_FEAT_INTERRUPT_QUIRK)
+		interrupt_delay = true;
+
 	spin_lock_init(&vmd->cfg_lock);
 	pci_set_drvdata(dev, vmd);
 	err = vmd_enable_domain(vmd, features);
@@ -1106,7 +1119,8 @@  static const struct pci_device_id vmd_ids[] = {
 	{PCI_VDEVICE(INTEL, 0xa77f),
 		.driver_data = VMD_FEATS_CLIENT,},
 	{PCI_VDEVICE(INTEL, 0x7d0b),
-		.driver_data = VMD_FEATS_CLIENT,},
+		.driver_data = VMD_FEATS_CLIENT |
+			       VMD_FEAT_INTERRUPT_QUIRK,},
 	{PCI_VDEVICE(INTEL, 0xad0b),
 		.driver_data = VMD_FEATS_CLIENT,},
 	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B),