diff mbox series

[V1,8/9] bnxt_en: Add TPH support in BNXT driver

Message ID 20240509162741.1937586-9-wei.huang2@amd.com
State New
Headers show
Series PCIe TPH and cache direct injection support | expand

Commit Message

Wei Huang May 9, 2024, 4:27 p.m. UTC
From: Manoj Panicker <manoj.panicker2@amd.com>

As a usage example, this patch implements TPH support in Broadcom BNXT
device driver by invoking pcie_tph_set_st() function when interrupt
affinity is changed.

Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Reviewed-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
 2 files changed, 55 insertions(+)

Comments

Vadim Fedorenko May 9, 2024, 9:50 p.m. UTC | #1
On 09/05/2024 17:27, Wei Huang wrote:
> From: Manoj Panicker <manoj.panicker2@amd.com>
> 
> As a usage example, this patch implements TPH support in Broadcom BNXT
> device driver by invoking pcie_tph_set_st() function when interrupt
> affinity is changed.
> 
> Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
> Reviewed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
> ---
>   drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
>   drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
>   2 files changed, 55 insertions(+)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 2c2ee79c4d77..be9c17566fb4 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -55,6 +55,7 @@
>   #include <net/page_pool/helpers.h>
>   #include <linux/align.h>
>   #include <net/netdev_queues.h>
> +#include <linux/pci-tph.h>
>   
>   #include "bnxt_hsi.h"
>   #include "bnxt.h"
> @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
>   				free_cpumask_var(irq->cpu_mask);
>   				irq->have_cpumask = 0;
>   			}
> +			irq_set_affinity_notifier(irq->vector, NULL);
>   			free_irq(irq->vector, bp->bnapi[i]);
>   		}
>   
> @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
>   	}
>   }
>   
> +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
> +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
> +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
> +				     const cpumask_t *mask)
> +{
> +	struct bnxt_irq *irq;
> +
> +	irq = container_of(notify, struct bnxt_irq, affinity_notify);
> +	cpumask_copy(irq->cpu_mask, mask);
> +
> +	if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
> +			     cpumask_first(irq->cpu_mask),
> +			     TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
> +		pr_err("error in configuring steering tag\n");
> +
> +	if (netif_running(irq->bp->dev)) {
> +		rtnl_lock();
> +		bnxt_close_nic(irq->bp, false, false);
> +		bnxt_open_nic(irq->bp, false, false);
> +		rtnl_unlock();
> +	}

Is it really needed? It will cause link flap and pause in the traffic
service for the device. Why the device needs full restart in this case?

> +}
> +
> +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
> +{
> +}
> +
> +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)

No inlines in .c files, please. Let compiler decide what to inline.

> +{
> +	struct irq_affinity_notify *notify;
> +
> +	notify = &irq->affinity_notify;
> +	notify->irq = irq->vector;
> +	notify->notify = bnxt_irq_affinity_notify;
> +	notify->release = bnxt_irq_affinity_release;
> +
> +	irq_set_affinity_notifier(irq->vector, notify);
> +}
> +
>   static int bnxt_request_irq(struct bnxt *bp)
>   {
>   	int i, j, rc = 0;
> @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
>   			int numa_node = dev_to_node(&bp->pdev->dev);
>   
>   			irq->have_cpumask = 1;
> +			irq->msix_nr = map_idx;
>   			cpumask_set_cpu(cpumask_local_spread(i, numa_node),
>   					irq->cpu_mask);
>   			rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
> @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
>   					    irq->vector);
>   				break;
>   			}
> +
> +			if (!pcie_tph_set_st(bp->pdev, i,
> +					     cpumask_first(irq->cpu_mask),
> +					     TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
> +				netdev_err(bp->dev, "error in setting steering tag\n");
> +			} else {
> +				irq->bp = bp;
> +				__bnxt_register_notify_irqchanges(irq);
> +			}
>   		}
>   	}
>   	return rc;
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> index dd849e715c9b..0d3442590bb4 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> @@ -1195,6 +1195,10 @@ struct bnxt_irq {
>   	u8		have_cpumask:1;
>   	char		name[IFNAMSIZ + 2];
>   	cpumask_var_t	cpu_mask;
> +
> +	int		msix_nr;
> +	struct bnxt	*bp;
> +	struct irq_affinity_notify affinity_notify;
>   };
>   
>   #define HWRM_RING_ALLOC_TX	0x1
Somnath Kotur May 10, 2024, 3:10 a.m. UTC | #2
On Thu, May 9, 2024 at 10:02 PM Wei Huang <wei.huang2@amd.com> wrote:
>
> From: Manoj Panicker <manoj.panicker2@amd.com>
>
> As a usage example, this patch implements TPH support in Broadcom BNXT
> device driver by invoking pcie_tph_set_st() function when interrupt
> affinity is changed.
>
> Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
> Reviewed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
> ---
>  drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
>  drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
>  2 files changed, 55 insertions(+)
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 2c2ee79c4d77..be9c17566fb4 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -55,6 +55,7 @@
>  #include <net/page_pool/helpers.h>
>  #include <linux/align.h>
>  #include <net/netdev_queues.h>
> +#include <linux/pci-tph.h>
>
>  #include "bnxt_hsi.h"
>  #include "bnxt.h"
> @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
>                                 free_cpumask_var(irq->cpu_mask);
>                                 irq->have_cpumask = 0;
>                         }
> +                       irq_set_affinity_notifier(irq->vector, NULL);
>                         free_irq(irq->vector, bp->bnapi[i]);
>                 }
>
> @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
>         }
>  }
>
> +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
> +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
> +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
> +                                    const cpumask_t *mask)
> +{
> +       struct bnxt_irq *irq;
> +
> +       irq = container_of(notify, struct bnxt_irq, affinity_notify);
> +       cpumask_copy(irq->cpu_mask, mask);
> +
> +       if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
> +                            cpumask_first(irq->cpu_mask),
> +                            TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
> +               pr_err("error in configuring steering tag\n");
> +
> +       if (netif_running(irq->bp->dev)) {
> +               rtnl_lock();
> +               bnxt_close_nic(irq->bp, false, false);
> +               bnxt_open_nic(irq->bp, false, false);
> +               rtnl_unlock();
> +       }
> +}
> +
> +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
> +{
> +}
> +
> +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
> +{
> +       struct irq_affinity_notify *notify;
> +
> +       notify = &irq->affinity_notify;
> +       notify->irq = irq->vector;
> +       notify->notify = bnxt_irq_affinity_notify;
> +       notify->release = bnxt_irq_affinity_release;
> +
> +       irq_set_affinity_notifier(irq->vector, notify);
> +}
> +
>  static int bnxt_request_irq(struct bnxt *bp)
>  {
>         int i, j, rc = 0;
> @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
>                         int numa_node = dev_to_node(&bp->pdev->dev);
>
>                         irq->have_cpumask = 1;
> +                       irq->msix_nr = map_idx;
>                         cpumask_set_cpu(cpumask_local_spread(i, numa_node),
>                                         irq->cpu_mask);
>                         rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
> @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
>                                             irq->vector);
>                                 break;
>                         }
> +
> +                       if (!pcie_tph_set_st(bp->pdev, i,
> +                                            cpumask_first(irq->cpu_mask),
> +                                            TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
> +                               netdev_err(bp->dev, "error in setting steering tag\n");
> +                       } else {
> +                               irq->bp = bp;
> +                               __bnxt_register_notify_irqchanges(irq);
> +                       }
>                 }
>         }
>         return rc;
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> index dd849e715c9b..0d3442590bb4 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> @@ -1195,6 +1195,10 @@ struct bnxt_irq {
>         u8              have_cpumask:1;
>         char            name[IFNAMSIZ + 2];
>         cpumask_var_t   cpu_mask;
> +
> +       int             msix_nr;
> +       struct bnxt     *bp;
> +       struct irq_affinity_notify affinity_notify;
>  };
>
>  #define HWRM_RING_ALLOC_TX     0x1
> --
> 2.44.0
>
>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Ajit Khaparde May 10, 2024, 3:55 a.m. UTC | #3
On Thu, May 9, 2024 at 2:50 PM Vadim Fedorenko
<vadim.fedorenko@linux.dev> wrote:
>
> On 09/05/2024 17:27, Wei Huang wrote:
> > From: Manoj Panicker <manoj.panicker2@amd.com>
> >
> > As a usage example, this patch implements TPH support in Broadcom BNXT
> > device driver by invoking pcie_tph_set_st() function when interrupt
> > affinity is changed.
> >
> > Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
> > Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
> > Reviewed-by: Wei Huang <wei.huang2@amd.com>
> > Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
> > ---
> >   drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
> >   drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
> >   2 files changed, 55 insertions(+)
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > index 2c2ee79c4d77..be9c17566fb4 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > @@ -55,6 +55,7 @@
> >   #include <net/page_pool/helpers.h>
> >   #include <linux/align.h>
> >   #include <net/netdev_queues.h>
> > +#include <linux/pci-tph.h>
> >
> >   #include "bnxt_hsi.h"
> >   #include "bnxt.h"
> > @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
> >                               free_cpumask_var(irq->cpu_mask);
> >                               irq->have_cpumask = 0;
> >                       }
> > +                     irq_set_affinity_notifier(irq->vector, NULL);
> >                       free_irq(irq->vector, bp->bnapi[i]);
> >               }
> >
> > @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
> >       }
> >   }
> >
> > +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
> > +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
> > +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
> > +                                  const cpumask_t *mask)
> > +{
> > +     struct bnxt_irq *irq;
> > +
> > +     irq = container_of(notify, struct bnxt_irq, affinity_notify);
> > +     cpumask_copy(irq->cpu_mask, mask);
> > +
> > +     if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
> > +                          cpumask_first(irq->cpu_mask),
> > +                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
> > +             pr_err("error in configuring steering tag\n");
> > +
> > +     if (netif_running(irq->bp->dev)) {
> > +             rtnl_lock();
> > +             bnxt_close_nic(irq->bp, false, false);
> > +             bnxt_open_nic(irq->bp, false, false);
> > +             rtnl_unlock();
> > +     }
>
> Is it really needed? It will cause link flap and pause in the traffic
> service for the device. Why the device needs full restart in this case?

In that sequence only the rings are recreated for the hardware to sync
up the tags.

Actually its not a full restart. There is no link reinit or other
heavy lifting in this sequence.
The pause in traffic may be momentary. Do IRQ/CPU affinities change frequently?
Probably not?

>
>
> > +}
> > +
> > +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
> > +{
> > +}
> > +
> > +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
>
> No inlines in .c files, please. Let compiler decide what to inline.
>
> > +{
> > +     struct irq_affinity_notify *notify;
> > +
> > +     notify = &irq->affinity_notify;
> > +     notify->irq = irq->vector;
> > +     notify->notify = bnxt_irq_affinity_notify;
> > +     notify->release = bnxt_irq_affinity_release;
> > +
> > +     irq_set_affinity_notifier(irq->vector, notify);
> > +}
> > +
> >   static int bnxt_request_irq(struct bnxt *bp)
> >   {
> >       int i, j, rc = 0;
> > @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
> >                       int numa_node = dev_to_node(&bp->pdev->dev);
> >
> >                       irq->have_cpumask = 1;
> > +                     irq->msix_nr = map_idx;
> >                       cpumask_set_cpu(cpumask_local_spread(i, numa_node),
> >                                       irq->cpu_mask);
> >                       rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
> > @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
> >                                           irq->vector);
> >                               break;
> >                       }
> > +
> > +                     if (!pcie_tph_set_st(bp->pdev, i,
> > +                                          cpumask_first(irq->cpu_mask),
> > +                                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
> > +                             netdev_err(bp->dev, "error in setting steering tag\n");
> > +                     } else {
> > +                             irq->bp = bp;
> > +                             __bnxt_register_notify_irqchanges(irq);
> > +                     }
> >               }
> >       }
> >       return rc;
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > index dd849e715c9b..0d3442590bb4 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > @@ -1195,6 +1195,10 @@ struct bnxt_irq {
> >       u8              have_cpumask:1;
> >       char            name[IFNAMSIZ + 2];
> >       cpumask_var_t   cpu_mask;
> > +
> > +     int             msix_nr;
> > +     struct bnxt     *bp;
> > +     struct irq_affinity_notify affinity_notify;
> >   };
> >
> >   #define HWRM_RING_ALLOC_TX  0x1
>
Vadim Fedorenko May 10, 2024, 10:35 a.m. UTC | #4
On 10.05.2024 04:55, Ajit Khaparde wrote:
> On Thu, May 9, 2024 at 2:50 PM Vadim Fedorenko
> <vadim.fedorenko@linux.dev> wrote:
>>
>> On 09/05/2024 17:27, Wei Huang wrote:
>>> From: Manoj Panicker <manoj.panicker2@amd.com>
>>>
>>> As a usage example, this patch implements TPH support in Broadcom BNXT
>>> device driver by invoking pcie_tph_set_st() function when interrupt
>>> affinity is changed.
>>>
>>> Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
>>> Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
>>> Reviewed-by: Wei Huang <wei.huang2@amd.com>
>>> Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
>>> ---
>>>    drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
>>>    drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
>>>    2 files changed, 55 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>> index 2c2ee79c4d77..be9c17566fb4 100644
>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>> @@ -55,6 +55,7 @@
>>>    #include <net/page_pool/helpers.h>
>>>    #include <linux/align.h>
>>>    #include <net/netdev_queues.h>
>>> +#include <linux/pci-tph.h>
>>>
>>>    #include "bnxt_hsi.h"
>>>    #include "bnxt.h"
>>> @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
>>>                                free_cpumask_var(irq->cpu_mask);
>>>                                irq->have_cpumask = 0;
>>>                        }
>>> +                     irq_set_affinity_notifier(irq->vector, NULL);
>>>                        free_irq(irq->vector, bp->bnapi[i]);
>>>                }
>>>
>>> @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
>>>        }
>>>    }
>>>
>>> +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
>>> +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
>>> +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
>>> +                                  const cpumask_t *mask)
>>> +{
>>> +     struct bnxt_irq *irq;
>>> +
>>> +     irq = container_of(notify, struct bnxt_irq, affinity_notify);
>>> +     cpumask_copy(irq->cpu_mask, mask);
>>> +
>>> +     if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
>>> +                          cpumask_first(irq->cpu_mask),
>>> +                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
>>> +             pr_err("error in configuring steering tag\n");
>>> +
>>> +     if (netif_running(irq->bp->dev)) {
>>> +             rtnl_lock();
>>> +             bnxt_close_nic(irq->bp, false, false);
>>> +             bnxt_open_nic(irq->bp, false, false);
>>> +             rtnl_unlock();
>>> +     }
>>
>> Is it really needed? It will cause link flap and pause in the traffic
>> service for the device. Why the device needs full restart in this case?
> 
> In that sequence only the rings are recreated for the hardware to sync
> up the tags.
> 
> Actually its not a full restart. There is no link reinit or other
> heavy lifting in this sequence.
> The pause in traffic may be momentary. Do IRQ/CPU affinities change frequently?
> Probably not?

 From what I can see in bnxt_en, proper validation of link_re_init parameter is
not (yet?) implemented, __bnxt_open_nic will unconditionally call 
netif_carrier_off() which will be treated as loss of carrier with counters
increment and proper events posted. Changes to CPU affinities were 
non-distruptive before the patch, but now it may break user-space assumptions.

Does FW need full rings re-init to update target value, which is one u32 write?
It looks like overkill TBH.

And yes, affinities can be change on fly according to the changes of the
workload on the host.

>>
>>
>>> +}
>>> +
>>> +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
>>> +{
>>> +}
>>> +
>>> +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
>>
>> No inlines in .c files, please. Let compiler decide what to inline.
>>
>>> +{
>>> +     struct irq_affinity_notify *notify;
>>> +
>>> +     notify = &irq->affinity_notify;
>>> +     notify->irq = irq->vector;
>>> +     notify->notify = bnxt_irq_affinity_notify;
>>> +     notify->release = bnxt_irq_affinity_release;
>>> +
>>> +     irq_set_affinity_notifier(irq->vector, notify);
>>> +}
>>> +
>>>    static int bnxt_request_irq(struct bnxt *bp)
>>>    {
>>>        int i, j, rc = 0;
>>> @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
>>>                        int numa_node = dev_to_node(&bp->pdev->dev);
>>>
>>>                        irq->have_cpumask = 1;
>>> +                     irq->msix_nr = map_idx;
>>>                        cpumask_set_cpu(cpumask_local_spread(i, numa_node),
>>>                                        irq->cpu_mask);
>>>                        rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
>>> @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
>>>                                            irq->vector);
>>>                                break;
>>>                        }
>>> +
>>> +                     if (!pcie_tph_set_st(bp->pdev, i,
>>> +                                          cpumask_first(irq->cpu_mask),
>>> +                                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
>>> +                             netdev_err(bp->dev, "error in setting steering tag\n");
>>> +                     } else {
>>> +                             irq->bp = bp;
>>> +                             __bnxt_register_notify_irqchanges(irq);
>>> +                     }
>>>                }
>>>        }
>>>        return rc;
>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>> index dd849e715c9b..0d3442590bb4 100644
>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>> @@ -1195,6 +1195,10 @@ struct bnxt_irq {
>>>        u8              have_cpumask:1;
>>>        char            name[IFNAMSIZ + 2];
>>>        cpumask_var_t   cpu_mask;
>>> +
>>> +     int             msix_nr;
>>> +     struct bnxt     *bp;
>>> +     struct irq_affinity_notify affinity_notify;
>>>    };
>>>
>>>    #define HWRM_RING_ALLOC_TX  0x1
>>
Andy Gospodarek May 10, 2024, 3:23 p.m. UTC | #5
On Fri, May 10, 2024 at 11:35:35AM +0100, Vadim Fedorenko wrote:
> On 10.05.2024 04:55, Ajit Khaparde wrote:
> > On Thu, May 9, 2024 at 2:50 PM Vadim Fedorenko
> > <vadim.fedorenko@linux.dev> wrote:
> > > 
> > > On 09/05/2024 17:27, Wei Huang wrote:
> > > > From: Manoj Panicker <manoj.panicker2@amd.com>
> > > > 
> > > > As a usage example, this patch implements TPH support in Broadcom BNXT
> > > > device driver by invoking pcie_tph_set_st() function when interrupt
> > > > affinity is changed.
> > > > 
> > > > Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
> > > > Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
> > > > Reviewed-by: Wei Huang <wei.huang2@amd.com>
> > > > Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
> > > > ---
> > > >    drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
> > > >    drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
> > > >    2 files changed, 55 insertions(+)
> > > > 
> > > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > > index 2c2ee79c4d77..be9c17566fb4 100644
> > > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > > @@ -55,6 +55,7 @@
> > > >    #include <net/page_pool/helpers.h>
> > > >    #include <linux/align.h>
> > > >    #include <net/netdev_queues.h>
> > > > +#include <linux/pci-tph.h>
> > > > 
> > > >    #include "bnxt_hsi.h"
> > > >    #include "bnxt.h"
> > > > @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
> > > >                                free_cpumask_var(irq->cpu_mask);
> > > >                                irq->have_cpumask = 0;
> > > >                        }
> > > > +                     irq_set_affinity_notifier(irq->vector, NULL);
> > > >                        free_irq(irq->vector, bp->bnapi[i]);
> > > >                }
> > > > 
> > > > @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
> > > >        }
> > > >    }
> > > > 
> > > > +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
> > > > +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
> > > > +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
> > > > +                                  const cpumask_t *mask)
> > > > +{
> > > > +     struct bnxt_irq *irq;
> > > > +
> > > > +     irq = container_of(notify, struct bnxt_irq, affinity_notify);
> > > > +     cpumask_copy(irq->cpu_mask, mask);
> > > > +
> > > > +     if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
> > > > +                          cpumask_first(irq->cpu_mask),
> > > > +                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
> > > > +             pr_err("error in configuring steering tag\n");
> > > > +
> > > > +     if (netif_running(irq->bp->dev)) {
> > > > +             rtnl_lock();
> > > > +             bnxt_close_nic(irq->bp, false, false);
> > > > +             bnxt_open_nic(irq->bp, false, false);
> > > > +             rtnl_unlock();
> > > > +     }
> > > 
> > > Is it really needed? It will cause link flap and pause in the traffic
> > > service for the device. Why the device needs full restart in this case?
> > 
> > In that sequence only the rings are recreated for the hardware to sync
> > up the tags.
> > 
> > Actually its not a full restart. There is no link reinit or other
> > heavy lifting in this sequence.
> > The pause in traffic may be momentary. Do IRQ/CPU affinities change frequently?
> > Probably not?
> 
> From what I can see in bnxt_en, proper validation of link_re_init parameter is
> not (yet?) implemented, __bnxt_open_nic will unconditionally call
> netif_carrier_off() which will be treated as loss of carrier with counters
> increment and proper events posted. Changes to CPU affinities were
> non-disruptive before the patch, but now it may break user-space
> assumptions.

From my testing the link should not flap.  I just fired up a recent net-next
and confirmed the same by calling $ ethtool -G ens7f0np0 rx 1024 which does a
similar bnxt_close_nic(bp, false, false)/bnxt_open_nic(bp, false, false) as
this patch.  Link remained up -- even with a non-Broadocm link-partner.

> Does FW need full rings re-init to update target value, which is one u32 write?
> It looks like overkill TBH.

Full rings do not, but the initialization of that particular ring associated
with this irq does need to be done.  On my list of things we need to do in
bnxt_en is implement the new ndo_queue_stop/start and ndo_queue_mem_alloc/free
operations and once those are done we could make a switch as that may be less
disruptive.

> And yes, affinities can be change on fly according to the changes of the
> workload on the host.
> 
> > > 
> > > 
> > > > +}
> > > > +
> > > > +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
> > > > +{
> > > > +}
> > > > +
> > > > +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
> > > 
> > > No inlines in .c files, please. Let compiler decide what to inline.
> > > 
> > > > +{
> > > > +     struct irq_affinity_notify *notify;
> > > > +
> > > > +     notify = &irq->affinity_notify;
> > > > +     notify->irq = irq->vector;
> > > > +     notify->notify = bnxt_irq_affinity_notify;
> > > > +     notify->release = bnxt_irq_affinity_release;
> > > > +
> > > > +     irq_set_affinity_notifier(irq->vector, notify);
> > > > +}
> > > > +
> > > >    static int bnxt_request_irq(struct bnxt *bp)
> > > >    {
> > > >        int i, j, rc = 0;
> > > > @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
> > > >                        int numa_node = dev_to_node(&bp->pdev->dev);
> > > > 
> > > >                        irq->have_cpumask = 1;
> > > > +                     irq->msix_nr = map_idx;
> > > >                        cpumask_set_cpu(cpumask_local_spread(i, numa_node),
> > > >                                        irq->cpu_mask);
> > > >                        rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
> > > > @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
> > > >                                            irq->vector);
> > > >                                break;
> > > >                        }
> > > > +
> > > > +                     if (!pcie_tph_set_st(bp->pdev, i,
> > > > +                                          cpumask_first(irq->cpu_mask),
> > > > +                                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
> > > > +                             netdev_err(bp->dev, "error in setting steering tag\n");
> > > > +                     } else {
> > > > +                             irq->bp = bp;
> > > > +                             __bnxt_register_notify_irqchanges(irq);
> > > > +                     }
> > > >                }
> > > >        }
> > > >        return rc;
> > > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > > > index dd849e715c9b..0d3442590bb4 100644
> > > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > > > @@ -1195,6 +1195,10 @@ struct bnxt_irq {
> > > >        u8              have_cpumask:1;
> > > >        char            name[IFNAMSIZ + 2];
> > > >        cpumask_var_t   cpu_mask;
> > > > +
> > > > +     int             msix_nr;
> > > > +     struct bnxt     *bp;
> > > > +     struct irq_affinity_notify affinity_notify;
> > > >    };
> > > > 
> > > >    #define HWRM_RING_ALLOC_TX  0x1
> > > 
>
David Wei May 10, 2024, 8:03 p.m. UTC | #6
On 2024-05-10 08:23, Andy Gospodarek wrote:
> On Fri, May 10, 2024 at 11:35:35AM +0100, Vadim Fedorenko wrote:
>> On 10.05.2024 04:55, Ajit Khaparde wrote:
>>> On Thu, May 9, 2024 at 2:50 PM Vadim Fedorenko
>>> <vadim.fedorenko@linux.dev> wrote:
>>>>
>>>> On 09/05/2024 17:27, Wei Huang wrote:
>>>>> From: Manoj Panicker <manoj.panicker2@amd.com>
>>>>>
>>>>> As a usage example, this patch implements TPH support in Broadcom BNXT
>>>>> device driver by invoking pcie_tph_set_st() function when interrupt
>>>>> affinity is changed.
>>>>>
>>>>> Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
>>>>> Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
>>>>> Reviewed-by: Wei Huang <wei.huang2@amd.com>
>>>>> Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
>>>>> ---
>>>>>    drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
>>>>>    drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
>>>>>    2 files changed, 55 insertions(+)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>>>> index 2c2ee79c4d77..be9c17566fb4 100644
>>>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>>>> @@ -55,6 +55,7 @@
>>>>>    #include <net/page_pool/helpers.h>
>>>>>    #include <linux/align.h>
>>>>>    #include <net/netdev_queues.h>
>>>>> +#include <linux/pci-tph.h>
>>>>>
>>>>>    #include "bnxt_hsi.h"
>>>>>    #include "bnxt.h"
>>>>> @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
>>>>>                                free_cpumask_var(irq->cpu_mask);
>>>>>                                irq->have_cpumask = 0;
>>>>>                        }
>>>>> +                     irq_set_affinity_notifier(irq->vector, NULL);
>>>>>                        free_irq(irq->vector, bp->bnapi[i]);
>>>>>                }
>>>>>
>>>>> @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
>>>>>        }
>>>>>    }
>>>>>
>>>>> +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
>>>>> +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
>>>>> +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
>>>>> +                                  const cpumask_t *mask)
>>>>> +{
>>>>> +     struct bnxt_irq *irq;
>>>>> +
>>>>> +     irq = container_of(notify, struct bnxt_irq, affinity_notify);
>>>>> +     cpumask_copy(irq->cpu_mask, mask);
>>>>> +
>>>>> +     if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
>>>>> +                          cpumask_first(irq->cpu_mask),
>>>>> +                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
>>>>> +             pr_err("error in configuring steering tag\n");
>>>>> +
>>>>> +     if (netif_running(irq->bp->dev)) {
>>>>> +             rtnl_lock();
>>>>> +             bnxt_close_nic(irq->bp, false, false);
>>>>> +             bnxt_open_nic(irq->bp, false, false);
>>>>> +             rtnl_unlock();
>>>>> +     }
>>>>
>>>> Is it really needed? It will cause link flap and pause in the traffic
>>>> service for the device. Why the device needs full restart in this case?
>>>
>>> In that sequence only the rings are recreated for the hardware to sync
>>> up the tags.
>>>
>>> Actually its not a full restart. There is no link reinit or other
>>> heavy lifting in this sequence.
>>> The pause in traffic may be momentary. Do IRQ/CPU affinities change frequently?
>>> Probably not?
>>
>> From what I can see in bnxt_en, proper validation of link_re_init parameter is
>> not (yet?) implemented, __bnxt_open_nic will unconditionally call
>> netif_carrier_off() which will be treated as loss of carrier with counters
>> increment and proper events posted. Changes to CPU affinities were
>> non-disruptive before the patch, but now it may break user-space
>> assumptions.
> 
> From my testing the link should not flap.  I just fired up a recent net-next
> and confirmed the same by calling $ ethtool -G ens7f0np0 rx 1024 which does a
> similar bnxt_close_nic(bp, false, false)/bnxt_open_nic(bp, false, false) as
> this patch.  Link remained up -- even with a non-Broadocm link-partner.
> 
>> Does FW need full rings re-init to update target value, which is one u32 write?
>> It looks like overkill TBH.
> 
> Full rings do not, but the initialization of that particular ring associated
> with this irq does need to be done.  On my list of things we need to do in
> bnxt_en is implement the new ndo_queue_stop/start and ndo_queue_mem_alloc/free
> operations and once those are done we could make a switch as that may be less
> disruptive.

Hi Andy, I have an implementation of the new ndo_queue_stop/start() API
[1] and would appreciate comments. I've been trying to test it but
without avail due to FW issues.

[1]: https://lore.kernel.org/netdev/20240502045410.3524155-1-dw@davidwei.uk/

> 
>> And yes, affinities can be change on fly according to the changes of the
>> workload on the host.
>>
>>>>
>>>>
>>>>> +}
>>>>> +
>>>>> +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
>>>>
>>>> No inlines in .c files, please. Let compiler decide what to inline.
>>>>
>>>>> +{
>>>>> +     struct irq_affinity_notify *notify;
>>>>> +
>>>>> +     notify = &irq->affinity_notify;
>>>>> +     notify->irq = irq->vector;
>>>>> +     notify->notify = bnxt_irq_affinity_notify;
>>>>> +     notify->release = bnxt_irq_affinity_release;
>>>>> +
>>>>> +     irq_set_affinity_notifier(irq->vector, notify);
>>>>> +}
>>>>> +
>>>>>    static int bnxt_request_irq(struct bnxt *bp)
>>>>>    {
>>>>>        int i, j, rc = 0;
>>>>> @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
>>>>>                        int numa_node = dev_to_node(&bp->pdev->dev);
>>>>>
>>>>>                        irq->have_cpumask = 1;
>>>>> +                     irq->msix_nr = map_idx;
>>>>>                        cpumask_set_cpu(cpumask_local_spread(i, numa_node),
>>>>>                                        irq->cpu_mask);
>>>>>                        rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
>>>>> @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
>>>>>                                            irq->vector);
>>>>>                                break;
>>>>>                        }
>>>>> +
>>>>> +                     if (!pcie_tph_set_st(bp->pdev, i,
>>>>> +                                          cpumask_first(irq->cpu_mask),
>>>>> +                                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
>>>>> +                             netdev_err(bp->dev, "error in setting steering tag\n");
>>>>> +                     } else {
>>>>> +                             irq->bp = bp;
>>>>> +                             __bnxt_register_notify_irqchanges(irq);
>>>>> +                     }
>>>>>                }
>>>>>        }
>>>>>        return rc;
>>>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>>>> index dd849e715c9b..0d3442590bb4 100644
>>>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>>>> @@ -1195,6 +1195,10 @@ struct bnxt_irq {
>>>>>        u8              have_cpumask:1;
>>>>>        char            name[IFNAMSIZ + 2];
>>>>>        cpumask_var_t   cpu_mask;
>>>>> +
>>>>> +     int             msix_nr;
>>>>> +     struct bnxt     *bp;
>>>>> +     struct irq_affinity_notify affinity_notify;
>>>>>    };
>>>>>
>>>>>    #define HWRM_RING_ALLOC_TX  0x1
>>>>
>>
Andy Gospodarek May 10, 2024, 8:33 p.m. UTC | #7
On Fri, May 10, 2024 at 01:03:50PM -0700, David Wei wrote:
> On 2024-05-10 08:23, Andy Gospodarek wrote:
> > On Fri, May 10, 2024 at 11:35:35AM +0100, Vadim Fedorenko wrote:
> >> On 10.05.2024 04:55, Ajit Khaparde wrote:
> >>> On Thu, May 9, 2024 at 2:50 PM Vadim Fedorenko
> >>> <vadim.fedorenko@linux.dev> wrote:
> >>>>
> >>>> On 09/05/2024 17:27, Wei Huang wrote:
> >>>>> From: Manoj Panicker <manoj.panicker2@amd.com>
> >>>>>
> >>>>> As a usage example, this patch implements TPH support in Broadcom BNXT
> >>>>> device driver by invoking pcie_tph_set_st() function when interrupt
> >>>>> affinity is changed.
> >>>>>
> >>>>> Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
> >>>>> Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
> >>>>> Reviewed-by: Wei Huang <wei.huang2@amd.com>
> >>>>> Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
> >>>>> ---
> >>>>>    drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
> >>>>>    drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
> >>>>>    2 files changed, 55 insertions(+)
> >>>>>
> >>>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> >>>>> index 2c2ee79c4d77..be9c17566fb4 100644
> >>>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> >>>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> >>>>> @@ -55,6 +55,7 @@
> >>>>>    #include <net/page_pool/helpers.h>
> >>>>>    #include <linux/align.h>
> >>>>>    #include <net/netdev_queues.h>
> >>>>> +#include <linux/pci-tph.h>
> >>>>>
> >>>>>    #include "bnxt_hsi.h"
> >>>>>    #include "bnxt.h"
> >>>>> @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
> >>>>>                                free_cpumask_var(irq->cpu_mask);
> >>>>>                                irq->have_cpumask = 0;
> >>>>>                        }
> >>>>> +                     irq_set_affinity_notifier(irq->vector, NULL);
> >>>>>                        free_irq(irq->vector, bp->bnapi[i]);
> >>>>>                }
> >>>>>
> >>>>> @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
> >>>>>        }
> >>>>>    }
> >>>>>
> >>>>> +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
> >>>>> +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
> >>>>> +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
> >>>>> +                                  const cpumask_t *mask)
> >>>>> +{
> >>>>> +     struct bnxt_irq *irq;
> >>>>> +
> >>>>> +     irq = container_of(notify, struct bnxt_irq, affinity_notify);
> >>>>> +     cpumask_copy(irq->cpu_mask, mask);
> >>>>> +
> >>>>> +     if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
> >>>>> +                          cpumask_first(irq->cpu_mask),
> >>>>> +                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
> >>>>> +             pr_err("error in configuring steering tag\n");
> >>>>> +
> >>>>> +     if (netif_running(irq->bp->dev)) {
> >>>>> +             rtnl_lock();
> >>>>> +             bnxt_close_nic(irq->bp, false, false);
> >>>>> +             bnxt_open_nic(irq->bp, false, false);
> >>>>> +             rtnl_unlock();
> >>>>> +     }
> >>>>
> >>>> Is it really needed? It will cause link flap and pause in the traffic
> >>>> service for the device. Why the device needs full restart in this case?
> >>>
> >>> In that sequence only the rings are recreated for the hardware to sync
> >>> up the tags.
> >>>
> >>> Actually its not a full restart. There is no link reinit or other
> >>> heavy lifting in this sequence.
> >>> The pause in traffic may be momentary. Do IRQ/CPU affinities change frequently?
> >>> Probably not?
> >>
> >> From what I can see in bnxt_en, proper validation of link_re_init parameter is
> >> not (yet?) implemented, __bnxt_open_nic will unconditionally call
> >> netif_carrier_off() which will be treated as loss of carrier with counters
> >> increment and proper events posted. Changes to CPU affinities were
> >> non-disruptive before the patch, but now it may break user-space
> >> assumptions.
> > 
> > From my testing the link should not flap.  I just fired up a recent net-next
> > and confirmed the same by calling $ ethtool -G ens7f0np0 rx 1024 which does a
> > similar bnxt_close_nic(bp, false, false)/bnxt_open_nic(bp, false, false) as
> > this patch.  Link remained up -- even with a non-Broadocm link-partner.
> > 
> >> Does FW need full rings re-init to update target value, which is one u32 write?
> >> It looks like overkill TBH.
> > 
> > Full rings do not, but the initialization of that particular ring associated
> > with this irq does need to be done.  On my list of things we need to do in
> > bnxt_en is implement the new ndo_queue_stop/start and ndo_queue_mem_alloc/free
> > operations and once those are done we could make a switch as that may be less
> > disruptive.
> 
> Hi Andy, I have an implementation of the new ndo_queue_stop/start() API
> [1] and would appreciate comments. I've been trying to test it but
> without avail due to FW issues.
> 
> [1]: https://lore.kernel.org/netdev/20240502045410.3524155-1-dw@davidwei.uk/
> 

David, I will take a look at those in more detail over the weekend or on Monday
(they are sitting in my inbox).

The overall structure looks good, but I do have at least one concern that is
related to what would need to be done in the hardware pipeline to be sure it is
safe to free packet buffers.

> > 
> >> And yes, affinities can be change on fly according to the changes of the
> >> workload on the host.
> >>
> >>>>
> >>>>
> >>>>> +}
> >>>>> +
> >>>>> +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
> >>>>> +{
> >>>>> +}
> >>>>> +
> >>>>> +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
> >>>>
> >>>> No inlines in .c files, please. Let compiler decide what to inline.
> >>>>
> >>>>> +{
> >>>>> +     struct irq_affinity_notify *notify;
> >>>>> +
> >>>>> +     notify = &irq->affinity_notify;
> >>>>> +     notify->irq = irq->vector;
> >>>>> +     notify->notify = bnxt_irq_affinity_notify;
> >>>>> +     notify->release = bnxt_irq_affinity_release;
> >>>>> +
> >>>>> +     irq_set_affinity_notifier(irq->vector, notify);
> >>>>> +}
> >>>>> +
> >>>>>    static int bnxt_request_irq(struct bnxt *bp)
> >>>>>    {
> >>>>>        int i, j, rc = 0;
> >>>>> @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
> >>>>>                        int numa_node = dev_to_node(&bp->pdev->dev);
> >>>>>
> >>>>>                        irq->have_cpumask = 1;
> >>>>> +                     irq->msix_nr = map_idx;
> >>>>>                        cpumask_set_cpu(cpumask_local_spread(i, numa_node),
> >>>>>                                        irq->cpu_mask);
> >>>>>                        rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
> >>>>> @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
> >>>>>                                            irq->vector);
> >>>>>                                break;
> >>>>>                        }
> >>>>> +
> >>>>> +                     if (!pcie_tph_set_st(bp->pdev, i,
> >>>>> +                                          cpumask_first(irq->cpu_mask),
> >>>>> +                                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
> >>>>> +                             netdev_err(bp->dev, "error in setting steering tag\n");
> >>>>> +                     } else {
> >>>>> +                             irq->bp = bp;
> >>>>> +                             __bnxt_register_notify_irqchanges(irq);
> >>>>> +                     }
> >>>>>                }
> >>>>>        }
> >>>>>        return rc;
> >>>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> >>>>> index dd849e715c9b..0d3442590bb4 100644
> >>>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> >>>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> >>>>> @@ -1195,6 +1195,10 @@ struct bnxt_irq {
> >>>>>        u8              have_cpumask:1;
> >>>>>        char            name[IFNAMSIZ + 2];
> >>>>>        cpumask_var_t   cpu_mask;
> >>>>> +
> >>>>> +     int             msix_nr;
> >>>>> +     struct bnxt     *bp;
> >>>>> +     struct irq_affinity_notify affinity_notify;
> >>>>>    };
> >>>>>
> >>>>>    #define HWRM_RING_ALLOC_TX  0x1
> >>>>
> >>
Vadim Fedorenko May 10, 2024, 8:33 p.m. UTC | #8
On 10/05/2024 16:23, Andy Gospodarek wrote:
> On Fri, May 10, 2024 at 11:35:35AM +0100, Vadim Fedorenko wrote:
>> On 10.05.2024 04:55, Ajit Khaparde wrote:
>>> On Thu, May 9, 2024 at 2:50 PM Vadim Fedorenko
>>> <vadim.fedorenko@linux.dev> wrote:
>>>>
>>>> On 09/05/2024 17:27, Wei Huang wrote:
>>>>> From: Manoj Panicker <manoj.panicker2@amd.com>
>>>>>
>>>>> As a usage example, this patch implements TPH support in Broadcom BNXT
>>>>> device driver by invoking pcie_tph_set_st() function when interrupt
>>>>> affinity is changed.
>>>>>
>>>>> Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
>>>>> Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
>>>>> Reviewed-by: Wei Huang <wei.huang2@amd.com>
>>>>> Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
>>>>> ---
>>>>>     drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
>>>>>     drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
>>>>>     2 files changed, 55 insertions(+)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>>>> index 2c2ee79c4d77..be9c17566fb4 100644
>>>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>>>> @@ -55,6 +55,7 @@
>>>>>     #include <net/page_pool/helpers.h>
>>>>>     #include <linux/align.h>
>>>>>     #include <net/netdev_queues.h>
>>>>> +#include <linux/pci-tph.h>
>>>>>
>>>>>     #include "bnxt_hsi.h"
>>>>>     #include "bnxt.h"
>>>>> @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
>>>>>                                 free_cpumask_var(irq->cpu_mask);
>>>>>                                 irq->have_cpumask = 0;
>>>>>                         }
>>>>> +                     irq_set_affinity_notifier(irq->vector, NULL);
>>>>>                         free_irq(irq->vector, bp->bnapi[i]);
>>>>>                 }
>>>>>
>>>>> @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
>>>>>         }
>>>>>     }
>>>>>
>>>>> +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
>>>>> +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
>>>>> +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
>>>>> +                                  const cpumask_t *mask)
>>>>> +{
>>>>> +     struct bnxt_irq *irq;
>>>>> +
>>>>> +     irq = container_of(notify, struct bnxt_irq, affinity_notify);
>>>>> +     cpumask_copy(irq->cpu_mask, mask);
>>>>> +
>>>>> +     if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
>>>>> +                          cpumask_first(irq->cpu_mask),
>>>>> +                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
>>>>> +             pr_err("error in configuring steering tag\n");
>>>>> +
>>>>> +     if (netif_running(irq->bp->dev)) {
>>>>> +             rtnl_lock();
>>>>> +             bnxt_close_nic(irq->bp, false, false);
>>>>> +             bnxt_open_nic(irq->bp, false, false);
>>>>> +             rtnl_unlock();
>>>>> +     }
>>>>
>>>> Is it really needed? It will cause link flap and pause in the traffic
>>>> service for the device. Why the device needs full restart in this case?
>>>
>>> In that sequence only the rings are recreated for the hardware to sync
>>> up the tags.
>>>
>>> Actually its not a full restart. There is no link reinit or other
>>> heavy lifting in this sequence.
>>> The pause in traffic may be momentary. Do IRQ/CPU affinities change frequently?
>>> Probably not?
>>
>>  From what I can see in bnxt_en, proper validation of link_re_init parameter is
>> not (yet?) implemented, __bnxt_open_nic will unconditionally call
>> netif_carrier_off() which will be treated as loss of carrier with counters
>> increment and proper events posted. Changes to CPU affinities were
>> non-disruptive before the patch, but now it may break user-space
>> assumptions.
> 
>  From my testing the link should not flap.  I just fired up a recent net-next
> and confirmed the same by calling $ ethtool -G ens7f0np0 rx 1024 which does a
> similar bnxt_close_nic(bp, false, false)/bnxt_open_nic(bp, false, false) as
> this patch.  Link remained up -- even with a non-Broadocm link-partner.

Hi Andy!

Well, it might be that from phy PoV the link didn't flap, but from 
network subsystem it does flap:

[root@host ~]# ethtool -G eth0 rx 512
[root@host ~]# cat /sys/class/net/eth0/carrier_changes
6
[root@host ~]# ethtool -G eth0 rx 1024
[root@host ~]# cat /sys/class/net/eth0/carrier_changes
8

And this is what I'm referring to when talking about user-space 
experience. But I would like to see new ndo_queue_stop/start 
implementation, it may help in this situation.

>> Does FW need full rings re-init to update target value, which is one u32 write?
>> It looks like overkill TBH.
> 
> Full rings do not, but the initialization of that particular ring associated
> with this irq does need to be done.  On my list of things we need to do in
> bnxt_en is implement the new ndo_queue_stop/start and ndo_queue_mem_alloc/free
> operations and once those are done we could make a switch as that may be less
> disruptive.
> 
>> And yes, affinities can be change on fly according to the changes of the
>> workload on the host.
>>
>>>>
>>>>
>>>>> +}
>>>>> +
>>>>> +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
>>>>
>>>> No inlines in .c files, please. Let compiler decide what to inline.
>>>>
>>>>> +{
>>>>> +     struct irq_affinity_notify *notify;
>>>>> +
>>>>> +     notify = &irq->affinity_notify;
>>>>> +     notify->irq = irq->vector;
>>>>> +     notify->notify = bnxt_irq_affinity_notify;
>>>>> +     notify->release = bnxt_irq_affinity_release;
>>>>> +
>>>>> +     irq_set_affinity_notifier(irq->vector, notify);
>>>>> +}
>>>>> +
>>>>>     static int bnxt_request_irq(struct bnxt *bp)
>>>>>     {
>>>>>         int i, j, rc = 0;
>>>>> @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
>>>>>                         int numa_node = dev_to_node(&bp->pdev->dev);
>>>>>
>>>>>                         irq->have_cpumask = 1;
>>>>> +                     irq->msix_nr = map_idx;
>>>>>                         cpumask_set_cpu(cpumask_local_spread(i, numa_node),
>>>>>                                         irq->cpu_mask);
>>>>>                         rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
>>>>> @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
>>>>>                                             irq->vector);
>>>>>                                 break;
>>>>>                         }
>>>>> +
>>>>> +                     if (!pcie_tph_set_st(bp->pdev, i,
>>>>> +                                          cpumask_first(irq->cpu_mask),
>>>>> +                                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
>>>>> +                             netdev_err(bp->dev, "error in setting steering tag\n");
>>>>> +                     } else {
>>>>> +                             irq->bp = bp;
>>>>> +                             __bnxt_register_notify_irqchanges(irq);
>>>>> +                     }
>>>>>                 }
>>>>>         }
>>>>>         return rc;
>>>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>>>> index dd849e715c9b..0d3442590bb4 100644
>>>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
>>>>> @@ -1195,6 +1195,10 @@ struct bnxt_irq {
>>>>>         u8              have_cpumask:1;
>>>>>         char            name[IFNAMSIZ + 2];
>>>>>         cpumask_var_t   cpu_mask;
>>>>> +
>>>>> +     int             msix_nr;
>>>>> +     struct bnxt     *bp;
>>>>> +     struct irq_affinity_notify affinity_notify;
>>>>>     };
>>>>>
>>>>>     #define HWRM_RING_ALLOC_TX  0x1
>>>>
>>
Andy Gospodarek May 10, 2024, 8:37 p.m. UTC | #9
On Fri, May 10, 2024 at 09:33:46PM +0100, Vadim Fedorenko wrote:
> On 10/05/2024 16:23, Andy Gospodarek wrote:
> > On Fri, May 10, 2024 at 11:35:35AM +0100, Vadim Fedorenko wrote:
> > > On 10.05.2024 04:55, Ajit Khaparde wrote:
> > > > On Thu, May 9, 2024 at 2:50 PM Vadim Fedorenko
> > > > <vadim.fedorenko@linux.dev> wrote:
> > > > > 
> > > > > On 09/05/2024 17:27, Wei Huang wrote:
> > > > > > From: Manoj Panicker <manoj.panicker2@amd.com>
> > > > > > 
> > > > > > As a usage example, this patch implements TPH support in Broadcom BNXT
> > > > > > device driver by invoking pcie_tph_set_st() function when interrupt
> > > > > > affinity is changed.
> > > > > > 
> > > > > > Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
> > > > > > Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
> > > > > > Reviewed-by: Wei Huang <wei.huang2@amd.com>
> > > > > > Signed-off-by: Manoj Panicker <manoj.panicker2@amd.com>
> > > > > > ---
> > > > > >     drivers/net/ethernet/broadcom/bnxt/bnxt.c | 51 +++++++++++++++++++++++
> > > > > >     drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 ++
> > > > > >     2 files changed, 55 insertions(+)
> > > > > > 
> > > > > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > > > > index 2c2ee79c4d77..be9c17566fb4 100644
> > > > > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > > > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > > > > @@ -55,6 +55,7 @@
> > > > > >     #include <net/page_pool/helpers.h>
> > > > > >     #include <linux/align.h>
> > > > > >     #include <net/netdev_queues.h>
> > > > > > +#include <linux/pci-tph.h>
> > > > > > 
> > > > > >     #include "bnxt_hsi.h"
> > > > > >     #include "bnxt.h"
> > > > > > @@ -10491,6 +10492,7 @@ static void bnxt_free_irq(struct bnxt *bp)
> > > > > >                                 free_cpumask_var(irq->cpu_mask);
> > > > > >                                 irq->have_cpumask = 0;
> > > > > >                         }
> > > > > > +                     irq_set_affinity_notifier(irq->vector, NULL);
> > > > > >                         free_irq(irq->vector, bp->bnapi[i]);
> > > > > >                 }
> > > > > > 
> > > > > > @@ -10498,6 +10500,45 @@ static void bnxt_free_irq(struct bnxt *bp)
> > > > > >         }
> > > > > >     }
> > > > > > 
> > > > > > +static void bnxt_rtnl_lock_sp(struct bnxt *bp);
> > > > > > +static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
> > > > > > +static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
> > > > > > +                                  const cpumask_t *mask)
> > > > > > +{
> > > > > > +     struct bnxt_irq *irq;
> > > > > > +
> > > > > > +     irq = container_of(notify, struct bnxt_irq, affinity_notify);
> > > > > > +     cpumask_copy(irq->cpu_mask, mask);
> > > > > > +
> > > > > > +     if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
> > > > > > +                          cpumask_first(irq->cpu_mask),
> > > > > > +                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
> > > > > > +             pr_err("error in configuring steering tag\n");
> > > > > > +
> > > > > > +     if (netif_running(irq->bp->dev)) {
> > > > > > +             rtnl_lock();
> > > > > > +             bnxt_close_nic(irq->bp, false, false);
> > > > > > +             bnxt_open_nic(irq->bp, false, false);
> > > > > > +             rtnl_unlock();
> > > > > > +     }
> > > > > 
> > > > > Is it really needed? It will cause link flap and pause in the traffic
> > > > > service for the device. Why the device needs full restart in this case?
> > > > 
> > > > In that sequence only the rings are recreated for the hardware to sync
> > > > up the tags.
> > > > 
> > > > Actually its not a full restart. There is no link reinit or other
> > > > heavy lifting in this sequence.
> > > > The pause in traffic may be momentary. Do IRQ/CPU affinities change frequently?
> > > > Probably not?
> > > 
> > >  From what I can see in bnxt_en, proper validation of link_re_init parameter is
> > > not (yet?) implemented, __bnxt_open_nic will unconditionally call
> > > netif_carrier_off() which will be treated as loss of carrier with counters
> > > increment and proper events posted. Changes to CPU affinities were
> > > non-disruptive before the patch, but now it may break user-space
> > > assumptions.
> > 
> >  From my testing the link should not flap.  I just fired up a recent net-next
> > and confirmed the same by calling $ ethtool -G ens7f0np0 rx 1024 which does a
> > similar bnxt_close_nic(bp, false, false)/bnxt_open_nic(bp, false, false) as
> > this patch.  Link remained up -- even with a non-Broadocm link-partner.
> 
> Hi Andy!
> 
> Well, it might be that from phy PoV the link didn't flap, but from network
> subsystem it does flap:
> 
> [root@host ~]# ethtool -G eth0 rx 512
> [root@host ~]# cat /sys/class/net/eth0/carrier_changes
> 6
> [root@host ~]# ethtool -G eth0 rx 1024
> [root@host ~]# cat /sys/class/net/eth0/carrier_changes
> 8

Fair point :)

I also think that we should skip doing the bnxt_close_nic/bnxt_open_nic
except when TPH is possible on the system.  That should mitigate some of
these concerns for users who do not have it enabled.

> And this is what I'm referring to when talking about user-space experience.
> But I would like to see new ndo_queue_stop/start implementation, it may help
> in this situation.
> 
> > > Does FW need full rings re-init to update target value, which is one u32 write?
> > > It looks like overkill TBH.
> > 
> > Full rings do not, but the initialization of that particular ring associated
> > with this irq does need to be done.  On my list of things we need to do in
> > bnxt_en is implement the new ndo_queue_stop/start and ndo_queue_mem_alloc/free
> > operations and once those are done we could make a switch as that may be less
> > disruptive.
> > 
> > > And yes, affinities can be change on fly according to the changes of the
> > > workload on the host.
> > > 
> > > > > 
> > > > > 
> > > > > > +}
> > > > > > +
> > > > > > +static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
> > > > > > +{
> > > > > > +}
> > > > > > +
> > > > > > +static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
> > > > > 
> > > > > No inlines in .c files, please. Let compiler decide what to inline.
> > > > > 
> > > > > > +{
> > > > > > +     struct irq_affinity_notify *notify;
> > > > > > +
> > > > > > +     notify = &irq->affinity_notify;
> > > > > > +     notify->irq = irq->vector;
> > > > > > +     notify->notify = bnxt_irq_affinity_notify;
> > > > > > +     notify->release = bnxt_irq_affinity_release;
> > > > > > +
> > > > > > +     irq_set_affinity_notifier(irq->vector, notify);
> > > > > > +}
> > > > > > +
> > > > > >     static int bnxt_request_irq(struct bnxt *bp)
> > > > > >     {
> > > > > >         int i, j, rc = 0;
> > > > > > @@ -10543,6 +10584,7 @@ static int bnxt_request_irq(struct bnxt *bp)
> > > > > >                         int numa_node = dev_to_node(&bp->pdev->dev);
> > > > > > 
> > > > > >                         irq->have_cpumask = 1;
> > > > > > +                     irq->msix_nr = map_idx;
> > > > > >                         cpumask_set_cpu(cpumask_local_spread(i, numa_node),
> > > > > >                                         irq->cpu_mask);
> > > > > >                         rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
> > > > > > @@ -10552,6 +10594,15 @@ static int bnxt_request_irq(struct bnxt *bp)
> > > > > >                                             irq->vector);
> > > > > >                                 break;
> > > > > >                         }
> > > > > > +
> > > > > > +                     if (!pcie_tph_set_st(bp->pdev, i,
> > > > > > +                                          cpumask_first(irq->cpu_mask),
> > > > > > +                                          TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
> > > > > > +                             netdev_err(bp->dev, "error in setting steering tag\n");
> > > > > > +                     } else {
> > > > > > +                             irq->bp = bp;
> > > > > > +                             __bnxt_register_notify_irqchanges(irq);
> > > > > > +                     }
> > > > > >                 }
> > > > > >         }
> > > > > >         return rc;
> > > > > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > > > > > index dd849e715c9b..0d3442590bb4 100644
> > > > > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > > > > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > > > > > @@ -1195,6 +1195,10 @@ struct bnxt_irq {
> > > > > >         u8              have_cpumask:1;
> > > > > >         char            name[IFNAMSIZ + 2];
> > > > > >         cpumask_var_t   cpu_mask;
> > > > > > +
> > > > > > +     int             msix_nr;
> > > > > > +     struct bnxt     *bp;
> > > > > > +     struct irq_affinity_notify affinity_notify;
> > > > > >     };
> > > > > > 
> > > > > >     #define HWRM_RING_ALLOC_TX  0x1
> > > > > 
> > > 
>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 2c2ee79c4d77..be9c17566fb4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -55,6 +55,7 @@ 
 #include <net/page_pool/helpers.h>
 #include <linux/align.h>
 #include <net/netdev_queues.h>
+#include <linux/pci-tph.h>
 
 #include "bnxt_hsi.h"
 #include "bnxt.h"
@@ -10491,6 +10492,7 @@  static void bnxt_free_irq(struct bnxt *bp)
 				free_cpumask_var(irq->cpu_mask);
 				irq->have_cpumask = 0;
 			}
+			irq_set_affinity_notifier(irq->vector, NULL);
 			free_irq(irq->vector, bp->bnapi[i]);
 		}
 
@@ -10498,6 +10500,45 @@  static void bnxt_free_irq(struct bnxt *bp)
 	}
 }
 
+static void bnxt_rtnl_lock_sp(struct bnxt *bp);
+static void bnxt_rtnl_unlock_sp(struct bnxt *bp);
+static void bnxt_irq_affinity_notify(struct irq_affinity_notify *notify,
+				     const cpumask_t *mask)
+{
+	struct bnxt_irq *irq;
+
+	irq = container_of(notify, struct bnxt_irq, affinity_notify);
+	cpumask_copy(irq->cpu_mask, mask);
+
+	if (!pcie_tph_set_st(irq->bp->pdev, irq->msix_nr,
+			     cpumask_first(irq->cpu_mask),
+			     TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY))
+		pr_err("error in configuring steering tag\n");
+
+	if (netif_running(irq->bp->dev)) {
+		rtnl_lock();
+		bnxt_close_nic(irq->bp, false, false);
+		bnxt_open_nic(irq->bp, false, false);
+		rtnl_unlock();
+	}
+}
+
+static void bnxt_irq_affinity_release(struct kref __always_unused *ref)
+{
+}
+
+static inline void __bnxt_register_notify_irqchanges(struct bnxt_irq *irq)
+{
+	struct irq_affinity_notify *notify;
+
+	notify = &irq->affinity_notify;
+	notify->irq = irq->vector;
+	notify->notify = bnxt_irq_affinity_notify;
+	notify->release = bnxt_irq_affinity_release;
+
+	irq_set_affinity_notifier(irq->vector, notify);
+}
+
 static int bnxt_request_irq(struct bnxt *bp)
 {
 	int i, j, rc = 0;
@@ -10543,6 +10584,7 @@  static int bnxt_request_irq(struct bnxt *bp)
 			int numa_node = dev_to_node(&bp->pdev->dev);
 
 			irq->have_cpumask = 1;
+			irq->msix_nr = map_idx;
 			cpumask_set_cpu(cpumask_local_spread(i, numa_node),
 					irq->cpu_mask);
 			rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
@@ -10552,6 +10594,15 @@  static int bnxt_request_irq(struct bnxt *bp)
 					    irq->vector);
 				break;
 			}
+
+			if (!pcie_tph_set_st(bp->pdev, i,
+					     cpumask_first(irq->cpu_mask),
+					     TPH_MEM_TYPE_VM, PCI_TPH_REQ_TPH_ONLY)) {
+				netdev_err(bp->dev, "error in setting steering tag\n");
+			} else {
+				irq->bp = bp;
+				__bnxt_register_notify_irqchanges(irq);
+			}
 		}
 	}
 	return rc;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index dd849e715c9b..0d3442590bb4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1195,6 +1195,10 @@  struct bnxt_irq {
 	u8		have_cpumask:1;
 	char		name[IFNAMSIZ + 2];
 	cpumask_var_t	cpu_mask;
+
+	int		msix_nr;
+	struct bnxt	*bp;
+	struct irq_affinity_notify affinity_notify;
 };
 
 #define HWRM_RING_ALLOC_TX	0x1