diff mbox series

[iwl-next,v4,2/2] igc: Link queues to NAPI instances

Message ID 20241022215246.307821-3-jdamato@fastly.com
State Under Review
Delegated to: Anthony Nguyen
Headers show
Series igc: Link IRQs and queues to NAPIs | expand

Commit Message

Joe Damato Oct. 22, 2024, 9:52 p.m. UTC
Link queues to NAPI instances via netdev-genl API so that users can
query this information with netlink. Handle a few cases in the driver:
  1. Link/unlink the NAPIs when XDP is enabled/disabled
  2. Handle IGC_FLAG_QUEUE_PAIRS enabled and disabled

Example output when IGC_FLAG_QUEUE_PAIRS is enabled:

$ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
                         --dump queue-get --json='{"ifindex": 2}'

[{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
 {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
 {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'rx'},
 {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'rx'},
 {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
 {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'tx'},
 {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
 {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]

Since IGC_FLAG_QUEUE_PAIRS is enabled, you'll note that the same NAPI ID
is present for both rx and tx queues at the same index, for example
index 0:

{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},

To test IGC_FLAG_QUEUE_PAIRS disabled, a test system was booted using
the grub command line option "maxcpus=2" to force
igc_set_interrupt_capability to disable IGC_FLAG_QUEUE_PAIRS.

Example output when IGC_FLAG_QUEUE_PAIRS is disabled:

$ lscpu | grep "On-line CPU"
On-line CPU(s) list:      0,2

$ ethtool -l enp86s0  | tail -5
Current hardware settings:
RX:		n/a
TX:		n/a
Other:		1
Combined:	2

$ cat /proc/interrupts  | grep enp
 144: [...] enp86s0
 145: [...] enp86s0-rx-0
 146: [...] enp86s0-rx-1
 147: [...] enp86s0-tx-0
 148: [...] enp86s0-tx-1

1 "other" IRQ, and 2 IRQs for each of RX and Tx, so we expect netlink to
report 4 IRQs with unique NAPI IDs:

$ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
                         --dump napi-get --json='{"ifindex": 2}'
[{'id': 8196, 'ifindex': 2, 'irq': 148},
 {'id': 8195, 'ifindex': 2, 'irq': 147},
 {'id': 8194, 'ifindex': 2, 'irq': 146},
 {'id': 8193, 'ifindex': 2, 'irq': 145}]

Now we examine which queues these NAPIs are associated with, expecting
that since IGC_FLAG_QUEUE_PAIRS is disabled each RX and TX queue will
have its own NAPI instance:

$ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
                         --dump queue-get --json='{"ifindex": 2}'
[{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
 {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
 {'id': 0, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
 {'id': 1, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 v4:
   - Add rtnl_lock/rtnl_unlock in two paths: igc_resume and
     igc_io_error_detected. The code added to the latter is inspired by
     a similar implementation in ixgbe's ixgbe_io_error_detected.

 v3:
   - Replace igc_unset_queue_napi with igc_set_queue_napi(adapater, i,
     NULL), as suggested by Vinicius Costa Gomes
   - Simplify implemention of igc_set_queue_napi as suggested by Kurt
     Kanzenbach, with a tweak to use ring->queue_index

 v2:
   - Update commit message to include tests for IGC_FLAG_QUEUE_PAIRS
     disabled
   - Refactored code to move napi queue mapping and unmapping to helper
     functions igc_set_queue_napi and igc_unset_queue_napi
   - Adjust the code to handle IGC_FLAG_QUEUE_PAIRS disabled
   - Call helpers to map/unmap queues to NAPIs in igc_up, __igc_open,
     igc_xdp_enable_pool, and igc_xdp_disable_pool

 drivers/net/ethernet/intel/igc/igc.h      |  2 ++
 drivers/net/ethernet/intel/igc/igc_main.c | 41 ++++++++++++++++++++---
 drivers/net/ethernet/intel/igc/igc_xdp.c  |  2 ++
 3 files changed, 40 insertions(+), 5 deletions(-)

Comments

Lifshits, Vitaly Oct. 27, 2024, 9:49 a.m. UTC | #1
On 10/23/2024 12:52 AM, Joe Damato wrote:
> Link queues to NAPI instances via netdev-genl API so that users can
> query this information with netlink. Handle a few cases in the driver:
>    1. Link/unlink the NAPIs when XDP is enabled/disabled
>    2. Handle IGC_FLAG_QUEUE_PAIRS enabled and disabled
> 
> Example output when IGC_FLAG_QUEUE_PAIRS is enabled:
> 
> $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
>                           --dump queue-get --json='{"ifindex": 2}'
> 
> [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
>   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
>   {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'rx'},
>   {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'rx'},
>   {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
>   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'tx'},
>   {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
>   {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]
> 
> Since IGC_FLAG_QUEUE_PAIRS is enabled, you'll note that the same NAPI ID
> is present for both rx and tx queues at the same index, for example
> index 0:
> 
> {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
> 
> To test IGC_FLAG_QUEUE_PAIRS disabled, a test system was booted using
> the grub command line option "maxcpus=2" to force
> igc_set_interrupt_capability to disable IGC_FLAG_QUEUE_PAIRS.
> 
> Example output when IGC_FLAG_QUEUE_PAIRS is disabled:
> 
> $ lscpu | grep "On-line CPU"
> On-line CPU(s) list:      0,2
> 
> $ ethtool -l enp86s0  | tail -5
> Current hardware settings:
> RX:		n/a
> TX:		n/a
> Other:		1
> Combined:	2
> 
> $ cat /proc/interrupts  | grep enp
>   144: [...] enp86s0
>   145: [...] enp86s0-rx-0
>   146: [...] enp86s0-rx-1
>   147: [...] enp86s0-tx-0
>   148: [...] enp86s0-tx-1
> 
> 1 "other" IRQ, and 2 IRQs for each of RX and Tx, so we expect netlink to
> report 4 IRQs with unique NAPI IDs:
> 
> $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
>                           --dump napi-get --json='{"ifindex": 2}'
> [{'id': 8196, 'ifindex': 2, 'irq': 148},
>   {'id': 8195, 'ifindex': 2, 'irq': 147},
>   {'id': 8194, 'ifindex': 2, 'irq': 146},
>   {'id': 8193, 'ifindex': 2, 'irq': 145}]
> 
> Now we examine which queues these NAPIs are associated with, expecting
> that since IGC_FLAG_QUEUE_PAIRS is disabled each RX and TX queue will
> have its own NAPI instance:
> 
> $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
>                           --dump queue-get --json='{"ifindex": 2}'
> [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
>   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
>   {'id': 0, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
>   {'id': 1, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> ---
>   v4:
>     - Add rtnl_lock/rtnl_unlock in two paths: igc_resume and
>       igc_io_error_detected. The code added to the latter is inspired by
>       a similar implementation in ixgbe's ixgbe_io_error_detected.
> 
>   v3:
>     - Replace igc_unset_queue_napi with igc_set_queue_napi(adapater, i,
>       NULL), as suggested by Vinicius Costa Gomes
>     - Simplify implemention of igc_set_queue_napi as suggested by Kurt
>       Kanzenbach, with a tweak to use ring->queue_index
> 
>   v2:
>     - Update commit message to include tests for IGC_FLAG_QUEUE_PAIRS
>       disabled
>     - Refactored code to move napi queue mapping and unmapping to helper
>       functions igc_set_queue_napi and igc_unset_queue_napi
>     - Adjust the code to handle IGC_FLAG_QUEUE_PAIRS disabled
>     - Call helpers to map/unmap queues to NAPIs in igc_up, __igc_open,
>       igc_xdp_enable_pool, and igc_xdp_disable_pool
> 
>   drivers/net/ethernet/intel/igc/igc.h      |  2 ++
>   drivers/net/ethernet/intel/igc/igc_main.c | 41 ++++++++++++++++++++---
>   drivers/net/ethernet/intel/igc/igc_xdp.c  |  2 ++
>   3 files changed, 40 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> index eac0f966e0e4..b8111ad9a9a8 100644
> --- a/drivers/net/ethernet/intel/igc/igc.h
> +++ b/drivers/net/ethernet/intel/igc/igc.h
> @@ -337,6 +337,8 @@ struct igc_adapter {
>   	struct igc_led_classdev *leds;
>   };
>   
> +void igc_set_queue_napi(struct igc_adapter *adapter, int q_idx,
> +			struct napi_struct *napi);
>   void igc_up(struct igc_adapter *adapter);
>   void igc_down(struct igc_adapter *adapter);
>   int igc_open(struct net_device *netdev);
> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> index 7964bbedb16c..04aa216ef612 100644
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -4948,6 +4948,22 @@ static int igc_sw_init(struct igc_adapter *adapter)
>   	return 0;
>   }
>   
> +void igc_set_queue_napi(struct igc_adapter *adapter, int vector,
> +			struct napi_struct *napi)
> +{
> +	struct igc_q_vector *q_vector = adapter->q_vector[vector];
> +
> +	if (q_vector->rx.ring)
> +		netif_queue_set_napi(adapter->netdev,
> +				     q_vector->rx.ring->queue_index,
> +				     NETDEV_QUEUE_TYPE_RX, napi);
> +
> +	if (q_vector->tx.ring)
> +		netif_queue_set_napi(adapter->netdev,
> +				     q_vector->tx.ring->queue_index,
> +				     NETDEV_QUEUE_TYPE_TX, napi);
> +}
> +
>   /**
>    * igc_up - Open the interface and prepare it to handle traffic
>    * @adapter: board private structure
> @@ -4955,6 +4971,7 @@ static int igc_sw_init(struct igc_adapter *adapter)
>   void igc_up(struct igc_adapter *adapter)
>   {
>   	struct igc_hw *hw = &adapter->hw;
> +	struct napi_struct *napi;
>   	int i = 0;
>   
>   	/* hardware has been reset, we need to reload some things */
> @@ -4962,8 +4979,11 @@ void igc_up(struct igc_adapter *adapter)
>   
>   	clear_bit(__IGC_DOWN, &adapter->state);
>   
> -	for (i = 0; i < adapter->num_q_vectors; i++)
> -		napi_enable(&adapter->q_vector[i]->napi);
> +	for (i = 0; i < adapter->num_q_vectors; i++) {
> +		napi = &adapter->q_vector[i]->napi;
> +		napi_enable(napi);
> +		igc_set_queue_napi(adapter, i, napi);
> +	}
>   
>   	if (adapter->msix_entries)
>   		igc_configure_msix(adapter);
> @@ -5192,6 +5212,7 @@ void igc_down(struct igc_adapter *adapter)
>   	for (i = 0; i < adapter->num_q_vectors; i++) {
>   		if (adapter->q_vector[i]) {
>   			napi_synchronize(&adapter->q_vector[i]->napi);
> +			igc_set_queue_napi(adapter, i, NULL);
>   			napi_disable(&adapter->q_vector[i]->napi);
>   		}
>   	}
> @@ -6021,6 +6042,7 @@ static int __igc_open(struct net_device *netdev, bool resuming)
>   	struct igc_adapter *adapter = netdev_priv(netdev);
>   	struct pci_dev *pdev = adapter->pdev;
>   	struct igc_hw *hw = &adapter->hw;
> +	struct napi_struct *napi;
>   	int err = 0;
>   	int i = 0;
>   
> @@ -6056,8 +6078,11 @@ static int __igc_open(struct net_device *netdev, bool resuming)
>   
>   	clear_bit(__IGC_DOWN, &adapter->state);
>   
> -	for (i = 0; i < adapter->num_q_vectors; i++)
> -		napi_enable(&adapter->q_vector[i]->napi);
> +	for (i = 0; i < adapter->num_q_vectors; i++) {
> +		napi = &adapter->q_vector[i]->napi;
> +		napi_enable(napi);
> +		igc_set_queue_napi(adapter, i, napi);
> +	}
>   
>   	/* Clear any pending interrupts. */
>   	rd32(IGC_ICR);
> @@ -7385,7 +7410,9 @@ static int igc_resume(struct device *dev)
>   	wr32(IGC_WUS, ~0);
>   
>   	if (netif_running(netdev)) {
> +		rtnl_lock();

This change will bring back the deadlock issue that was fixed in commit:
6f31d6b: "igc: Refactor runtime power management flow".

>   		err = __igc_open(netdev, true);
> +		rtnl_unlock();
>   		if (!err)
>   			netif_device_attach(netdev);
>   	}
> @@ -7440,14 +7467,18 @@ static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev,
>   	struct net_device *netdev = pci_get_drvdata(pdev);
>   	struct igc_adapter *adapter = netdev_priv(netdev);
>   
> +	rtnl_lock();
>   	netif_device_detach(netdev);
>   
> -	if (state == pci_channel_io_perm_failure)
> +	if (state == pci_channel_io_perm_failure) {
> +		rtnl_unlock();
>   		return PCI_ERS_RESULT_DISCONNECT;
> +	}
>   
>   	if (netif_running(netdev))
>   		igc_down(adapter);
>   	pci_disable_device(pdev);
> +	rtnl_unlock();
>   
>   	/* Request a slot reset. */
>   	return PCI_ERS_RESULT_NEED_RESET;
> diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c
> index e27af72aada8..4da633430b80 100644
> --- a/drivers/net/ethernet/intel/igc/igc_xdp.c
> +++ b/drivers/net/ethernet/intel/igc/igc_xdp.c
> @@ -84,6 +84,7 @@ static int igc_xdp_enable_pool(struct igc_adapter *adapter,
>   		napi_disable(napi);
>   	}
>   
> +	igc_set_queue_napi(adapter, queue_id, NULL);
>   	set_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
>   	set_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
>   
> @@ -133,6 +134,7 @@ static int igc_xdp_disable_pool(struct igc_adapter *adapter, u16 queue_id)
>   	xsk_pool_dma_unmap(pool, IGC_RX_DMA_ATTR);
>   	clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
>   	clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
> +	igc_set_queue_napi(adapter, queue_id, napi);
>   
>   	if (needs_reset) {
>   		napi_enable(napi);
> 



Hi Joe,


The current version will cause a regression, a possible deadlock, due to 
the addition of the rtnl_lock in igc_resume that was fixed previously.

You can refer to the following link:

https://github.com/torvalds/linux/commit/6f31d6b643a32cc126cf86093fca1ea575948bf0#diff-d5b32b873e9902b496280a5f42c246043c8f0691d8b3a6bbd56df99ce8ceb394L7190
Joe Damato Oct. 28, 2024, 3:50 p.m. UTC | #2
On Sun, Oct 27, 2024 at 11:49:33AM +0200, Lifshits, Vitaly wrote:
> 
> On 10/23/2024 12:52 AM, Joe Damato wrote:
> > Link queues to NAPI instances via netdev-genl API so that users can
> > query this information with netlink. Handle a few cases in the driver:
> >    1. Link/unlink the NAPIs when XDP is enabled/disabled
> >    2. Handle IGC_FLAG_QUEUE_PAIRS enabled and disabled
> > 
> > Example output when IGC_FLAG_QUEUE_PAIRS is enabled:
> > 
> > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> >                           --dump queue-get --json='{"ifindex": 2}'
> > 
> > [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
> >   {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'rx'},
> >   {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'rx'},
> >   {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
> >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'tx'},
> >   {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
> >   {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]
> > 
> > Since IGC_FLAG_QUEUE_PAIRS is enabled, you'll note that the same NAPI ID
> > is present for both rx and tx queues at the same index, for example
> > index 0:
> > 
> > {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> > {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
> > 
> > To test IGC_FLAG_QUEUE_PAIRS disabled, a test system was booted using
> > the grub command line option "maxcpus=2" to force
> > igc_set_interrupt_capability to disable IGC_FLAG_QUEUE_PAIRS.
> > 
> > Example output when IGC_FLAG_QUEUE_PAIRS is disabled:
> > 
> > $ lscpu | grep "On-line CPU"
> > On-line CPU(s) list:      0,2
> > 
> > $ ethtool -l enp86s0  | tail -5
> > Current hardware settings:
> > RX:		n/a
> > TX:		n/a
> > Other:		1
> > Combined:	2
> > 
> > $ cat /proc/interrupts  | grep enp
> >   144: [...] enp86s0
> >   145: [...] enp86s0-rx-0
> >   146: [...] enp86s0-rx-1
> >   147: [...] enp86s0-tx-0
> >   148: [...] enp86s0-tx-1
> > 
> > 1 "other" IRQ, and 2 IRQs for each of RX and Tx, so we expect netlink to
> > report 4 IRQs with unique NAPI IDs:
> > 
> > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> >                           --dump napi-get --json='{"ifindex": 2}'
> > [{'id': 8196, 'ifindex': 2, 'irq': 148},
> >   {'id': 8195, 'ifindex': 2, 'irq': 147},
> >   {'id': 8194, 'ifindex': 2, 'irq': 146},
> >   {'id': 8193, 'ifindex': 2, 'irq': 145}]
> > 
> > Now we examine which queues these NAPIs are associated with, expecting
> > that since IGC_FLAG_QUEUE_PAIRS is disabled each RX and TX queue will
> > have its own NAPI instance:
> > 
> > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> >                           --dump queue-get --json='{"ifindex": 2}'
> > [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
> >   {'id': 0, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
> >   {'id': 1, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]
> > 
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> > ---
> >   v4:
> >     - Add rtnl_lock/rtnl_unlock in two paths: igc_resume and
> >       igc_io_error_detected. The code added to the latter is inspired by
> >       a similar implementation in ixgbe's ixgbe_io_error_detected.
> > 
> >   v3:
> >     - Replace igc_unset_queue_napi with igc_set_queue_napi(adapater, i,
> >       NULL), as suggested by Vinicius Costa Gomes
> >     - Simplify implemention of igc_set_queue_napi as suggested by Kurt
> >       Kanzenbach, with a tweak to use ring->queue_index
> > 
> >   v2:
> >     - Update commit message to include tests for IGC_FLAG_QUEUE_PAIRS
> >       disabled
> >     - Refactored code to move napi queue mapping and unmapping to helper
> >       functions igc_set_queue_napi and igc_unset_queue_napi
> >     - Adjust the code to handle IGC_FLAG_QUEUE_PAIRS disabled
> >     - Call helpers to map/unmap queues to NAPIs in igc_up, __igc_open,
> >       igc_xdp_enable_pool, and igc_xdp_disable_pool
> > 
> >   drivers/net/ethernet/intel/igc/igc.h      |  2 ++
> >   drivers/net/ethernet/intel/igc/igc_main.c | 41 ++++++++++++++++++++---
> >   drivers/net/ethernet/intel/igc/igc_xdp.c  |  2 ++
> >   3 files changed, 40 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> > index eac0f966e0e4..b8111ad9a9a8 100644
> > --- a/drivers/net/ethernet/intel/igc/igc.h
> > +++ b/drivers/net/ethernet/intel/igc/igc.h
> > @@ -337,6 +337,8 @@ struct igc_adapter {
> >   	struct igc_led_classdev *leds;
> >   };
> > +void igc_set_queue_napi(struct igc_adapter *adapter, int q_idx,
> > +			struct napi_struct *napi);
> >   void igc_up(struct igc_adapter *adapter);
> >   void igc_down(struct igc_adapter *adapter);
> >   int igc_open(struct net_device *netdev);
> > diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> > index 7964bbedb16c..04aa216ef612 100644
> > --- a/drivers/net/ethernet/intel/igc/igc_main.c
> > +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> > @@ -4948,6 +4948,22 @@ static int igc_sw_init(struct igc_adapter *adapter)
> >   	return 0;
> >   }
> > +void igc_set_queue_napi(struct igc_adapter *adapter, int vector,
> > +			struct napi_struct *napi)
> > +{
> > +	struct igc_q_vector *q_vector = adapter->q_vector[vector];
> > +
> > +	if (q_vector->rx.ring)
> > +		netif_queue_set_napi(adapter->netdev,
> > +				     q_vector->rx.ring->queue_index,
> > +				     NETDEV_QUEUE_TYPE_RX, napi);
> > +
> > +	if (q_vector->tx.ring)
> > +		netif_queue_set_napi(adapter->netdev,
> > +				     q_vector->tx.ring->queue_index,
> > +				     NETDEV_QUEUE_TYPE_TX, napi);
> > +}
> > +
> >   /**
> >    * igc_up - Open the interface and prepare it to handle traffic
> >    * @adapter: board private structure
> > @@ -4955,6 +4971,7 @@ static int igc_sw_init(struct igc_adapter *adapter)
> >   void igc_up(struct igc_adapter *adapter)
> >   {
> >   	struct igc_hw *hw = &adapter->hw;
> > +	struct napi_struct *napi;
> >   	int i = 0;
> >   	/* hardware has been reset, we need to reload some things */
> > @@ -4962,8 +4979,11 @@ void igc_up(struct igc_adapter *adapter)
> >   	clear_bit(__IGC_DOWN, &adapter->state);
> > -	for (i = 0; i < adapter->num_q_vectors; i++)
> > -		napi_enable(&adapter->q_vector[i]->napi);
> > +	for (i = 0; i < adapter->num_q_vectors; i++) {
> > +		napi = &adapter->q_vector[i]->napi;
> > +		napi_enable(napi);
> > +		igc_set_queue_napi(adapter, i, napi);
> > +	}
> >   	if (adapter->msix_entries)
> >   		igc_configure_msix(adapter);
> > @@ -5192,6 +5212,7 @@ void igc_down(struct igc_adapter *adapter)
> >   	for (i = 0; i < adapter->num_q_vectors; i++) {
> >   		if (adapter->q_vector[i]) {
> >   			napi_synchronize(&adapter->q_vector[i]->napi);
> > +			igc_set_queue_napi(adapter, i, NULL);
> >   			napi_disable(&adapter->q_vector[i]->napi);
> >   		}
> >   	}
> > @@ -6021,6 +6042,7 @@ static int __igc_open(struct net_device *netdev, bool resuming)
> >   	struct igc_adapter *adapter = netdev_priv(netdev);
> >   	struct pci_dev *pdev = adapter->pdev;
> >   	struct igc_hw *hw = &adapter->hw;
> > +	struct napi_struct *napi;
> >   	int err = 0;
> >   	int i = 0;
> > @@ -6056,8 +6078,11 @@ static int __igc_open(struct net_device *netdev, bool resuming)
> >   	clear_bit(__IGC_DOWN, &adapter->state);
> > -	for (i = 0; i < adapter->num_q_vectors; i++)
> > -		napi_enable(&adapter->q_vector[i]->napi);
> > +	for (i = 0; i < adapter->num_q_vectors; i++) {
> > +		napi = &adapter->q_vector[i]->napi;
> > +		napi_enable(napi);
> > +		igc_set_queue_napi(adapter, i, napi);
> > +	}
> >   	/* Clear any pending interrupts. */
> >   	rd32(IGC_ICR);
> > @@ -7385,7 +7410,9 @@ static int igc_resume(struct device *dev)
> >   	wr32(IGC_WUS, ~0);
> >   	if (netif_running(netdev)) {
> > +		rtnl_lock();
> 
> This change will bring back the deadlock issue that was fixed in commit:
> 6f31d6b: "igc: Refactor runtime power management flow".

OK, thanks for letting me know.

I think I better understand what the issue is. It seems that:

- igc_resume can be called with rtnl held via ethtool (which I
  didn't know), which calls __igc_open
- __igc_open re-enables NAPIs and re-links queues to NAPI IDs (which
  requires rtnl)

so, it seems like the rtnl_lock() I've added to igc_resume is
unnecessary.

I suppose I don't know all of the paths where the pm functions can
be called -- are there others where RTNL is _not_ already held?

I looked at e1000e and it seems that driver does not re-enable NAPIs
in its resume path and thus does not suffer from the same issue as
igc.

So my questions are:

  1. Are there are other contexts where igc_resume is called where
     RTNL is not held?

  2. If the answer is that RTNL is always held when igc_resume is
     called, then I can send a v5 that removes the
     rtnl_lock/rtnl_unlock. What do you think?

[...]

> 
> Hi Joe,
> 
> 
> The current version will cause a regression, a possible deadlock, due to the
> addition of the rtnl_lock in igc_resume that was fixed previously.
> 
> You can refer to the following link:
> 
> https://github.com/torvalds/linux/commit/6f31d6b643a32cc126cf86093fca1ea575948bf0#diff-d5b32b873e9902b496280a5f42c246043c8f0691d8b3a6bbd56df99ce8ceb394L7190

Thanks for the link.
Joe Damato Oct. 28, 2024, 4 p.m. UTC | #3
On Mon, Oct 28, 2024 at 08:50:38AM -0700, Joe Damato wrote:
> On Sun, Oct 27, 2024 at 11:49:33AM +0200, Lifshits, Vitaly wrote:
> > 
> > On 10/23/2024 12:52 AM, Joe Damato wrote:
> > > Link queues to NAPI instances via netdev-genl API so that users can
> > > query this information with netlink. Handle a few cases in the driver:
> > >    1. Link/unlink the NAPIs when XDP is enabled/disabled
> > >    2. Handle IGC_FLAG_QUEUE_PAIRS enabled and disabled
> > > 
> > > Example output when IGC_FLAG_QUEUE_PAIRS is enabled:
> > > 
> > > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> > >                           --dump queue-get --json='{"ifindex": 2}'
> > > 
> > > [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> > >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
> > >   {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'rx'},
> > >   {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'rx'},
> > >   {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
> > >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'tx'},
> > >   {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
> > >   {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]
> > > 
> > > Since IGC_FLAG_QUEUE_PAIRS is enabled, you'll note that the same NAPI ID
> > > is present for both rx and tx queues at the same index, for example
> > > index 0:
> > > 
> > > {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> > > {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
> > > 
> > > To test IGC_FLAG_QUEUE_PAIRS disabled, a test system was booted using
> > > the grub command line option "maxcpus=2" to force
> > > igc_set_interrupt_capability to disable IGC_FLAG_QUEUE_PAIRS.
> > > 
> > > Example output when IGC_FLAG_QUEUE_PAIRS is disabled:
> > > 
> > > $ lscpu | grep "On-line CPU"
> > > On-line CPU(s) list:      0,2
> > > 
> > > $ ethtool -l enp86s0  | tail -5
> > > Current hardware settings:
> > > RX:		n/a
> > > TX:		n/a
> > > Other:		1
> > > Combined:	2
> > > 
> > > $ cat /proc/interrupts  | grep enp
> > >   144: [...] enp86s0
> > >   145: [...] enp86s0-rx-0
> > >   146: [...] enp86s0-rx-1
> > >   147: [...] enp86s0-tx-0
> > >   148: [...] enp86s0-tx-1
> > > 
> > > 1 "other" IRQ, and 2 IRQs for each of RX and Tx, so we expect netlink to
> > > report 4 IRQs with unique NAPI IDs:
> > > 
> > > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> > >                           --dump napi-get --json='{"ifindex": 2}'
> > > [{'id': 8196, 'ifindex': 2, 'irq': 148},
> > >   {'id': 8195, 'ifindex': 2, 'irq': 147},
> > >   {'id': 8194, 'ifindex': 2, 'irq': 146},
> > >   {'id': 8193, 'ifindex': 2, 'irq': 145}]
> > > 
> > > Now we examine which queues these NAPIs are associated with, expecting
> > > that since IGC_FLAG_QUEUE_PAIRS is disabled each RX and TX queue will
> > > have its own NAPI instance:
> > > 
> > > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> > >                           --dump queue-get --json='{"ifindex": 2}'
> > > [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> > >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
> > >   {'id': 0, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
> > >   {'id': 1, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]
> > > 
> > > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > > Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> > > ---
> > >   v4:
> > >     - Add rtnl_lock/rtnl_unlock in two paths: igc_resume and
> > >       igc_io_error_detected. The code added to the latter is inspired by
> > >       a similar implementation in ixgbe's ixgbe_io_error_detected.
> > > 
> > >   v3:
> > >     - Replace igc_unset_queue_napi with igc_set_queue_napi(adapater, i,
> > >       NULL), as suggested by Vinicius Costa Gomes
> > >     - Simplify implemention of igc_set_queue_napi as suggested by Kurt
> > >       Kanzenbach, with a tweak to use ring->queue_index
> > > 
> > >   v2:
> > >     - Update commit message to include tests for IGC_FLAG_QUEUE_PAIRS
> > >       disabled
> > >     - Refactored code to move napi queue mapping and unmapping to helper
> > >       functions igc_set_queue_napi and igc_unset_queue_napi
> > >     - Adjust the code to handle IGC_FLAG_QUEUE_PAIRS disabled
> > >     - Call helpers to map/unmap queues to NAPIs in igc_up, __igc_open,
> > >       igc_xdp_enable_pool, and igc_xdp_disable_pool
> > > 
> > >   drivers/net/ethernet/intel/igc/igc.h      |  2 ++
> > >   drivers/net/ethernet/intel/igc/igc_main.c | 41 ++++++++++++++++++++---
> > >   drivers/net/ethernet/intel/igc/igc_xdp.c  |  2 ++
> > >   3 files changed, 40 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> > > index eac0f966e0e4..b8111ad9a9a8 100644
> > > --- a/drivers/net/ethernet/intel/igc/igc.h
> > > +++ b/drivers/net/ethernet/intel/igc/igc.h
> > > @@ -337,6 +337,8 @@ struct igc_adapter {
> > >   	struct igc_led_classdev *leds;
> > >   };
> > > +void igc_set_queue_napi(struct igc_adapter *adapter, int q_idx,
> > > +			struct napi_struct *napi);
> > >   void igc_up(struct igc_adapter *adapter);
> > >   void igc_down(struct igc_adapter *adapter);
> > >   int igc_open(struct net_device *netdev);
> > > diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> > > index 7964bbedb16c..04aa216ef612 100644
> > > --- a/drivers/net/ethernet/intel/igc/igc_main.c
> > > +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> > > @@ -4948,6 +4948,22 @@ static int igc_sw_init(struct igc_adapter *adapter)
> > >   	return 0;
> > >   }
> > > +void igc_set_queue_napi(struct igc_adapter *adapter, int vector,
> > > +			struct napi_struct *napi)
> > > +{
> > > +	struct igc_q_vector *q_vector = adapter->q_vector[vector];
> > > +
> > > +	if (q_vector->rx.ring)
> > > +		netif_queue_set_napi(adapter->netdev,
> > > +				     q_vector->rx.ring->queue_index,
> > > +				     NETDEV_QUEUE_TYPE_RX, napi);
> > > +
> > > +	if (q_vector->tx.ring)
> > > +		netif_queue_set_napi(adapter->netdev,
> > > +				     q_vector->tx.ring->queue_index,
> > > +				     NETDEV_QUEUE_TYPE_TX, napi);
> > > +}
> > > +
> > >   /**
> > >    * igc_up - Open the interface and prepare it to handle traffic
> > >    * @adapter: board private structure
> > > @@ -4955,6 +4971,7 @@ static int igc_sw_init(struct igc_adapter *adapter)
> > >   void igc_up(struct igc_adapter *adapter)
> > >   {
> > >   	struct igc_hw *hw = &adapter->hw;
> > > +	struct napi_struct *napi;
> > >   	int i = 0;
> > >   	/* hardware has been reset, we need to reload some things */
> > > @@ -4962,8 +4979,11 @@ void igc_up(struct igc_adapter *adapter)
> > >   	clear_bit(__IGC_DOWN, &adapter->state);
> > > -	for (i = 0; i < adapter->num_q_vectors; i++)
> > > -		napi_enable(&adapter->q_vector[i]->napi);
> > > +	for (i = 0; i < adapter->num_q_vectors; i++) {
> > > +		napi = &adapter->q_vector[i]->napi;
> > > +		napi_enable(napi);
> > > +		igc_set_queue_napi(adapter, i, napi);
> > > +	}
> > >   	if (adapter->msix_entries)
> > >   		igc_configure_msix(adapter);
> > > @@ -5192,6 +5212,7 @@ void igc_down(struct igc_adapter *adapter)
> > >   	for (i = 0; i < adapter->num_q_vectors; i++) {
> > >   		if (adapter->q_vector[i]) {
> > >   			napi_synchronize(&adapter->q_vector[i]->napi);
> > > +			igc_set_queue_napi(adapter, i, NULL);
> > >   			napi_disable(&adapter->q_vector[i]->napi);
> > >   		}
> > >   	}
> > > @@ -6021,6 +6042,7 @@ static int __igc_open(struct net_device *netdev, bool resuming)
> > >   	struct igc_adapter *adapter = netdev_priv(netdev);
> > >   	struct pci_dev *pdev = adapter->pdev;
> > >   	struct igc_hw *hw = &adapter->hw;
> > > +	struct napi_struct *napi;
> > >   	int err = 0;
> > >   	int i = 0;
> > > @@ -6056,8 +6078,11 @@ static int __igc_open(struct net_device *netdev, bool resuming)
> > >   	clear_bit(__IGC_DOWN, &adapter->state);
> > > -	for (i = 0; i < adapter->num_q_vectors; i++)
> > > -		napi_enable(&adapter->q_vector[i]->napi);
> > > +	for (i = 0; i < adapter->num_q_vectors; i++) {
> > > +		napi = &adapter->q_vector[i]->napi;
> > > +		napi_enable(napi);
> > > +		igc_set_queue_napi(adapter, i, napi);
> > > +	}
> > >   	/* Clear any pending interrupts. */
> > >   	rd32(IGC_ICR);
> > > @@ -7385,7 +7410,9 @@ static int igc_resume(struct device *dev)
> > >   	wr32(IGC_WUS, ~0);
> > >   	if (netif_running(netdev)) {
> > > +		rtnl_lock();
> > 
> > This change will bring back the deadlock issue that was fixed in commit:
> > 6f31d6b: "igc: Refactor runtime power management flow".
> 
> OK, thanks for letting me know.
> 
> I think I better understand what the issue is. It seems that:
> 
> - igc_resume can be called with rtnl held via ethtool (which I
>   didn't know), which calls __igc_open
> - __igc_open re-enables NAPIs and re-links queues to NAPI IDs (which
>   requires rtnl)
> 
> so, it seems like the rtnl_lock() I've added to igc_resume is
> unnecessary.
> 
> I suppose I don't know all of the paths where the pm functions can
> be called -- are there others where RTNL is _not_ already held?
> 
> I looked at e1000e and it seems that driver does not re-enable NAPIs
> in its resume path and thus does not suffer from the same issue as
> igc.
> 
> So my questions are:
> 
>   1. Are there are other contexts where igc_resume is called where
>      RTNL is not held?
> 
>   2. If the answer is that RTNL is always held when igc_resume is
>      called, then I can send a v5 that removes the
>      rtnl_lock/rtnl_unlock. What do you think?

I see, so it looks like there is:
   - resume
   - runtime_resume

The bug I am reintroducing is runtime_resume already holding RTNL
before my added call to rtnl_lock.

OK.

Does resume also hold rtnl before the driver's igc_resume is called?
I am asking because I don't know much about how PM works.

If resume does not hold RTNL (but runtime resume does, as the bug
you pointed out shows), it seems like a wrapper can be added to tell
the code whether rtnl should be held or not based on which resume is
happening.

Does anyone know if: resume (not runtime_resume) already holds RTNL?
I'll try to take a look and see, but I am not very familiar with PM.
Joe Damato Oct. 28, 2024, 6:51 p.m. UTC | #4
On Mon, Oct 28, 2024 at 09:00:06AM -0700, Joe Damato wrote:
> On Mon, Oct 28, 2024 at 08:50:38AM -0700, Joe Damato wrote:
> > On Sun, Oct 27, 2024 at 11:49:33AM +0200, Lifshits, Vitaly wrote:
> > > 
> > > On 10/23/2024 12:52 AM, Joe Damato wrote:
> > > > Link queues to NAPI instances via netdev-genl API so that users can
> > > > query this information with netlink. Handle a few cases in the driver:
> > > >    1. Link/unlink the NAPIs when XDP is enabled/disabled
> > > >    2. Handle IGC_FLAG_QUEUE_PAIRS enabled and disabled
> > > > 
> > > > Example output when IGC_FLAG_QUEUE_PAIRS is enabled:
> > > > 
> > > > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> > > >                           --dump queue-get --json='{"ifindex": 2}'
> > > > 
> > > > [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> > > >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
> > > >   {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'rx'},
> > > >   {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'rx'},
> > > >   {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
> > > >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'tx'},
> > > >   {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
> > > >   {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]
> > > > 
> > > > Since IGC_FLAG_QUEUE_PAIRS is enabled, you'll note that the same NAPI ID
> > > > is present for both rx and tx queues at the same index, for example
> > > > index 0:
> > > > 
> > > > {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> > > > {'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'tx'},
> > > > 
> > > > To test IGC_FLAG_QUEUE_PAIRS disabled, a test system was booted using
> > > > the grub command line option "maxcpus=2" to force
> > > > igc_set_interrupt_capability to disable IGC_FLAG_QUEUE_PAIRS.
> > > > 
> > > > Example output when IGC_FLAG_QUEUE_PAIRS is disabled:
> > > > 
> > > > $ lscpu | grep "On-line CPU"
> > > > On-line CPU(s) list:      0,2
> > > > 
> > > > $ ethtool -l enp86s0  | tail -5
> > > > Current hardware settings:
> > > > RX:		n/a
> > > > TX:		n/a
> > > > Other:		1
> > > > Combined:	2
> > > > 
> > > > $ cat /proc/interrupts  | grep enp
> > > >   144: [...] enp86s0
> > > >   145: [...] enp86s0-rx-0
> > > >   146: [...] enp86s0-rx-1
> > > >   147: [...] enp86s0-tx-0
> > > >   148: [...] enp86s0-tx-1
> > > > 
> > > > 1 "other" IRQ, and 2 IRQs for each of RX and Tx, so we expect netlink to
> > > > report 4 IRQs with unique NAPI IDs:
> > > > 
> > > > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> > > >                           --dump napi-get --json='{"ifindex": 2}'
> > > > [{'id': 8196, 'ifindex': 2, 'irq': 148},
> > > >   {'id': 8195, 'ifindex': 2, 'irq': 147},
> > > >   {'id': 8194, 'ifindex': 2, 'irq': 146},
> > > >   {'id': 8193, 'ifindex': 2, 'irq': 145}]
> > > > 
> > > > Now we examine which queues these NAPIs are associated with, expecting
> > > > that since IGC_FLAG_QUEUE_PAIRS is disabled each RX and TX queue will
> > > > have its own NAPI instance:
> > > > 
> > > > $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \
> > > >                           --dump queue-get --json='{"ifindex": 2}'
> > > > [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'},
> > > >   {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'},
> > > >   {'id': 0, 'ifindex': 2, 'napi-id': 8195, 'type': 'tx'},
> > > >   {'id': 1, 'ifindex': 2, 'napi-id': 8196, 'type': 'tx'}]
> > > > 
> > > > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > > > Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> > > > ---
> > > >   v4:
> > > >     - Add rtnl_lock/rtnl_unlock in two paths: igc_resume and
> > > >       igc_io_error_detected. The code added to the latter is inspired by
> > > >       a similar implementation in ixgbe's ixgbe_io_error_detected.
> > > > 
> > > >   v3:
> > > >     - Replace igc_unset_queue_napi with igc_set_queue_napi(adapater, i,
> > > >       NULL), as suggested by Vinicius Costa Gomes
> > > >     - Simplify implemention of igc_set_queue_napi as suggested by Kurt
> > > >       Kanzenbach, with a tweak to use ring->queue_index
> > > > 
> > > >   v2:
> > > >     - Update commit message to include tests for IGC_FLAG_QUEUE_PAIRS
> > > >       disabled
> > > >     - Refactored code to move napi queue mapping and unmapping to helper
> > > >       functions igc_set_queue_napi and igc_unset_queue_napi
> > > >     - Adjust the code to handle IGC_FLAG_QUEUE_PAIRS disabled
> > > >     - Call helpers to map/unmap queues to NAPIs in igc_up, __igc_open,
> > > >       igc_xdp_enable_pool, and igc_xdp_disable_pool
> > > > 
> > > >   drivers/net/ethernet/intel/igc/igc.h      |  2 ++
> > > >   drivers/net/ethernet/intel/igc/igc_main.c | 41 ++++++++++++++++++++---
> > > >   drivers/net/ethernet/intel/igc/igc_xdp.c  |  2 ++
> > > >   3 files changed, 40 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> > > > index eac0f966e0e4..b8111ad9a9a8 100644
> > > > --- a/drivers/net/ethernet/intel/igc/igc.h
> > > > +++ b/drivers/net/ethernet/intel/igc/igc.h
> > > > @@ -337,6 +337,8 @@ struct igc_adapter {
> > > >   	struct igc_led_classdev *leds;
> > > >   };
> > > > +void igc_set_queue_napi(struct igc_adapter *adapter, int q_idx,
> > > > +			struct napi_struct *napi);
> > > >   void igc_up(struct igc_adapter *adapter);
> > > >   void igc_down(struct igc_adapter *adapter);
> > > >   int igc_open(struct net_device *netdev);
> > > > diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> > > > index 7964bbedb16c..04aa216ef612 100644
> > > > --- a/drivers/net/ethernet/intel/igc/igc_main.c
> > > > +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> > > > @@ -4948,6 +4948,22 @@ static int igc_sw_init(struct igc_adapter *adapter)
> > > >   	return 0;
> > > >   }
> > > > +void igc_set_queue_napi(struct igc_adapter *adapter, int vector,
> > > > +			struct napi_struct *napi)
> > > > +{
> > > > +	struct igc_q_vector *q_vector = adapter->q_vector[vector];
> > > > +
> > > > +	if (q_vector->rx.ring)
> > > > +		netif_queue_set_napi(adapter->netdev,
> > > > +				     q_vector->rx.ring->queue_index,
> > > > +				     NETDEV_QUEUE_TYPE_RX, napi);
> > > > +
> > > > +	if (q_vector->tx.ring)
> > > > +		netif_queue_set_napi(adapter->netdev,
> > > > +				     q_vector->tx.ring->queue_index,
> > > > +				     NETDEV_QUEUE_TYPE_TX, napi);
> > > > +}
> > > > +
> > > >   /**
> > > >    * igc_up - Open the interface and prepare it to handle traffic
> > > >    * @adapter: board private structure
> > > > @@ -4955,6 +4971,7 @@ static int igc_sw_init(struct igc_adapter *adapter)
> > > >   void igc_up(struct igc_adapter *adapter)
> > > >   {
> > > >   	struct igc_hw *hw = &adapter->hw;
> > > > +	struct napi_struct *napi;
> > > >   	int i = 0;
> > > >   	/* hardware has been reset, we need to reload some things */
> > > > @@ -4962,8 +4979,11 @@ void igc_up(struct igc_adapter *adapter)
> > > >   	clear_bit(__IGC_DOWN, &adapter->state);
> > > > -	for (i = 0; i < adapter->num_q_vectors; i++)
> > > > -		napi_enable(&adapter->q_vector[i]->napi);
> > > > +	for (i = 0; i < adapter->num_q_vectors; i++) {
> > > > +		napi = &adapter->q_vector[i]->napi;
> > > > +		napi_enable(napi);
> > > > +		igc_set_queue_napi(adapter, i, napi);
> > > > +	}
> > > >   	if (adapter->msix_entries)
> > > >   		igc_configure_msix(adapter);
> > > > @@ -5192,6 +5212,7 @@ void igc_down(struct igc_adapter *adapter)
> > > >   	for (i = 0; i < adapter->num_q_vectors; i++) {
> > > >   		if (adapter->q_vector[i]) {
> > > >   			napi_synchronize(&adapter->q_vector[i]->napi);
> > > > +			igc_set_queue_napi(adapter, i, NULL);
> > > >   			napi_disable(&adapter->q_vector[i]->napi);
> > > >   		}
> > > >   	}
> > > > @@ -6021,6 +6042,7 @@ static int __igc_open(struct net_device *netdev, bool resuming)
> > > >   	struct igc_adapter *adapter = netdev_priv(netdev);
> > > >   	struct pci_dev *pdev = adapter->pdev;
> > > >   	struct igc_hw *hw = &adapter->hw;
> > > > +	struct napi_struct *napi;
> > > >   	int err = 0;
> > > >   	int i = 0;
> > > > @@ -6056,8 +6078,11 @@ static int __igc_open(struct net_device *netdev, bool resuming)
> > > >   	clear_bit(__IGC_DOWN, &adapter->state);
> > > > -	for (i = 0; i < adapter->num_q_vectors; i++)
> > > > -		napi_enable(&adapter->q_vector[i]->napi);
> > > > +	for (i = 0; i < adapter->num_q_vectors; i++) {
> > > > +		napi = &adapter->q_vector[i]->napi;
> > > > +		napi_enable(napi);
> > > > +		igc_set_queue_napi(adapter, i, napi);
> > > > +	}
> > > >   	/* Clear any pending interrupts. */
> > > >   	rd32(IGC_ICR);
> > > > @@ -7385,7 +7410,9 @@ static int igc_resume(struct device *dev)
> > > >   	wr32(IGC_WUS, ~0);
> > > >   	if (netif_running(netdev)) {
> > > > +		rtnl_lock();
> > > 
> > > This change will bring back the deadlock issue that was fixed in commit:
> > > 6f31d6b: "igc: Refactor runtime power management flow".
> > 
> > OK, thanks for letting me know.
> > 
> > I think I better understand what the issue is. It seems that:
> > 
> > - igc_resume can be called with rtnl held via ethtool (which I
> >   didn't know), which calls __igc_open
> > - __igc_open re-enables NAPIs and re-links queues to NAPI IDs (which
> >   requires rtnl)
> > 
> > so, it seems like the rtnl_lock() I've added to igc_resume is
> > unnecessary.
> > 
> > I suppose I don't know all of the paths where the pm functions can
> > be called -- are there others where RTNL is _not_ already held?
> > 
> > I looked at e1000e and it seems that driver does not re-enable NAPIs
> > in its resume path and thus does not suffer from the same issue as
> > igc.
> > 
> > So my questions are:
> > 
> >   1. Are there are other contexts where igc_resume is called where
> >      RTNL is not held?
> > 
> >   2. If the answer is that RTNL is always held when igc_resume is
> >      called, then I can send a v5 that removes the
> >      rtnl_lock/rtnl_unlock. What do you think?
> 
> I see, so it looks like there is:
>    - resume
>    - runtime_resume
> 
> The bug I am reintroducing is runtime_resume already holding RTNL
> before my added call to rtnl_lock.
> 
> OK.
> 
> Does resume also hold rtnl before the driver's igc_resume is called?
> I am asking because I don't know much about how PM works.
> 
> If resume does not hold RTNL (but runtime resume does, as the bug
> you pointed out shows), it seems like a wrapper can be added to tell
> the code whether rtnl should be held or not based on which resume is
> happening.
> 
> Does anyone know if: resume (not runtime_resume) already holds RTNL?
> I'll try to take a look and see, but I am not very familiar with PM.

Well, I took a look and I'm probably wrong, but here's my
assessment:

- runtime_suspend can happen via ethtool or netlink when rtnl is
  held, so rtnl_lock will deadlock as pointed out above

- suspend happens via device_suspend in kernel/power/main.c, so I
  think taking rtnl is safe for "regular" suspend. Other drivers
  (like bnxt) seem to take rtnl in their "regular" suspend
  callbacks.

If the above assessment is correct, I think this change should fix
the issue Vitaly mentioned and I'll submit this as part of v5. It
adds a wrapper to tell igc_resume to either hold rtnl or not
depending on whether it's called from runtime_suspend or suspend.

I'll submit this as v5 shortly, and my apologies on my lack of
knowledge of PM; I am happy to perform any sort of testing on my igc
device you folks think would help verify that this is working
properly.

diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 04aa216ef612..051a0cdb1143 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -7367,7 +7367,7 @@ static void igc_deliver_wake_packet(struct net_device *netdev)
        netif_rx(skb);
 }

-static int igc_resume(struct device *dev)
+static int __igc_do_resume(struct device *dev, bool need_rtnl)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
        struct net_device *netdev = pci_get_drvdata(pdev);
@@ -7410,9 +7410,11 @@ static int igc_resume(struct device *dev)
        wr32(IGC_WUS, ~0);

        if (netif_running(netdev)) {
-               rtnl_lock();
+               if (need_rtnl)
+                       rtnl_lock();
                err = __igc_open(netdev, true);
-               rtnl_unlock();
+               if (need_rtnl)
+                       rtnl_unlock();
                if (!err)
                        netif_device_attach(netdev);
        }
@@ -7420,9 +7422,14 @@ static int igc_resume(struct device *dev)
        return err;
 }

+static int igc_resume(struct device *dev)
+{
+       return __igc_do_resume(dev, true);
+}
+
 static int igc_runtime_resume(struct device *dev)
 {
-       return igc_resume(dev);
+       return __igc_do_resume(dev, false);
 }

 static int igc_suspend(struct device *dev)
Jacob Keller Oct. 28, 2024, 6:53 p.m. UTC | #5
On 10/28/2024 9:00 AM, Joe Damato wrote:
> 
> I see, so it looks like there is:
>    - resume
>    - runtime_resume
> 
> The bug I am reintroducing is runtime_resume already holding RTNL
> before my added call to rtnl_lock.
> 
> OK.
> 
> Does resume also hold rtnl before the driver's igc_resume is called?
> I am asking because I don't know much about how PM works.
> 
> If resume does not hold RTNL (but runtime resume does, as the bug
> you pointed out shows), it seems like a wrapper can be added to tell
> the code whether rtnl should be held or not based on which resume is
> happening.
> 
> Does anyone know if: resume (not runtime_resume) already holds RTNL?
> I'll try to take a look and see, but I am not very familiar with PM.

I believe the resume doesn't hold RTNL, as its part of the core device
code, which is not networking specific. It shouldn't be acquiring RTNL
since that is a network specific lock.

I believe the code you posted as v5 should resolve this, and makes sense
to me.

Thanks for digging into this :)

-Jake
Joe Damato Oct. 28, 2024, 6:59 p.m. UTC | #6
On Mon, Oct 28, 2024 at 11:53:55AM -0700, Jacob Keller wrote:
> 
> 
> On 10/28/2024 9:00 AM, Joe Damato wrote:
> > 
> > I see, so it looks like there is:
> >    - resume
> >    - runtime_resume
> > 
> > The bug I am reintroducing is runtime_resume already holding RTNL
> > before my added call to rtnl_lock.
> > 
> > OK.
> > 
> > Does resume also hold rtnl before the driver's igc_resume is called?
> > I am asking because I don't know much about how PM works.
> > 
> > If resume does not hold RTNL (but runtime resume does, as the bug
> > you pointed out shows), it seems like a wrapper can be added to tell
> > the code whether rtnl should be held or not based on which resume is
> > happening.
> > 
> > Does anyone know if: resume (not runtime_resume) already holds RTNL?
> > I'll try to take a look and see, but I am not very familiar with PM.
> 
> I believe the resume doesn't hold RTNL, as its part of the core device
> code, which is not networking specific. It shouldn't be acquiring RTNL
> since that is a network specific lock.
> 
> I believe the code you posted as v5 should resolve this, and makes sense
> to me.
> 
> Thanks for digging into this :)

No problem; sorry for all the back and forth on this one and I
really appreciate your patience and reviews.

Thanks,
Joe
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index eac0f966e0e4..b8111ad9a9a8 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -337,6 +337,8 @@  struct igc_adapter {
 	struct igc_led_classdev *leds;
 };
 
+void igc_set_queue_napi(struct igc_adapter *adapter, int q_idx,
+			struct napi_struct *napi);
 void igc_up(struct igc_adapter *adapter);
 void igc_down(struct igc_adapter *adapter);
 int igc_open(struct net_device *netdev);
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 7964bbedb16c..04aa216ef612 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -4948,6 +4948,22 @@  static int igc_sw_init(struct igc_adapter *adapter)
 	return 0;
 }
 
+void igc_set_queue_napi(struct igc_adapter *adapter, int vector,
+			struct napi_struct *napi)
+{
+	struct igc_q_vector *q_vector = adapter->q_vector[vector];
+
+	if (q_vector->rx.ring)
+		netif_queue_set_napi(adapter->netdev,
+				     q_vector->rx.ring->queue_index,
+				     NETDEV_QUEUE_TYPE_RX, napi);
+
+	if (q_vector->tx.ring)
+		netif_queue_set_napi(adapter->netdev,
+				     q_vector->tx.ring->queue_index,
+				     NETDEV_QUEUE_TYPE_TX, napi);
+}
+
 /**
  * igc_up - Open the interface and prepare it to handle traffic
  * @adapter: board private structure
@@ -4955,6 +4971,7 @@  static int igc_sw_init(struct igc_adapter *adapter)
 void igc_up(struct igc_adapter *adapter)
 {
 	struct igc_hw *hw = &adapter->hw;
+	struct napi_struct *napi;
 	int i = 0;
 
 	/* hardware has been reset, we need to reload some things */
@@ -4962,8 +4979,11 @@  void igc_up(struct igc_adapter *adapter)
 
 	clear_bit(__IGC_DOWN, &adapter->state);
 
-	for (i = 0; i < adapter->num_q_vectors; i++)
-		napi_enable(&adapter->q_vector[i]->napi);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		napi = &adapter->q_vector[i]->napi;
+		napi_enable(napi);
+		igc_set_queue_napi(adapter, i, napi);
+	}
 
 	if (adapter->msix_entries)
 		igc_configure_msix(adapter);
@@ -5192,6 +5212,7 @@  void igc_down(struct igc_adapter *adapter)
 	for (i = 0; i < adapter->num_q_vectors; i++) {
 		if (adapter->q_vector[i]) {
 			napi_synchronize(&adapter->q_vector[i]->napi);
+			igc_set_queue_napi(adapter, i, NULL);
 			napi_disable(&adapter->q_vector[i]->napi);
 		}
 	}
@@ -6021,6 +6042,7 @@  static int __igc_open(struct net_device *netdev, bool resuming)
 	struct igc_adapter *adapter = netdev_priv(netdev);
 	struct pci_dev *pdev = adapter->pdev;
 	struct igc_hw *hw = &adapter->hw;
+	struct napi_struct *napi;
 	int err = 0;
 	int i = 0;
 
@@ -6056,8 +6078,11 @@  static int __igc_open(struct net_device *netdev, bool resuming)
 
 	clear_bit(__IGC_DOWN, &adapter->state);
 
-	for (i = 0; i < adapter->num_q_vectors; i++)
-		napi_enable(&adapter->q_vector[i]->napi);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		napi = &adapter->q_vector[i]->napi;
+		napi_enable(napi);
+		igc_set_queue_napi(adapter, i, napi);
+	}
 
 	/* Clear any pending interrupts. */
 	rd32(IGC_ICR);
@@ -7385,7 +7410,9 @@  static int igc_resume(struct device *dev)
 	wr32(IGC_WUS, ~0);
 
 	if (netif_running(netdev)) {
+		rtnl_lock();
 		err = __igc_open(netdev, true);
+		rtnl_unlock();
 		if (!err)
 			netif_device_attach(netdev);
 	}
@@ -7440,14 +7467,18 @@  static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev,
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct igc_adapter *adapter = netdev_priv(netdev);
 
+	rtnl_lock();
 	netif_device_detach(netdev);
 
-	if (state == pci_channel_io_perm_failure)
+	if (state == pci_channel_io_perm_failure) {
+		rtnl_unlock();
 		return PCI_ERS_RESULT_DISCONNECT;
+	}
 
 	if (netif_running(netdev))
 		igc_down(adapter);
 	pci_disable_device(pdev);
+	rtnl_unlock();
 
 	/* Request a slot reset. */
 	return PCI_ERS_RESULT_NEED_RESET;
diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c
index e27af72aada8..4da633430b80 100644
--- a/drivers/net/ethernet/intel/igc/igc_xdp.c
+++ b/drivers/net/ethernet/intel/igc/igc_xdp.c
@@ -84,6 +84,7 @@  static int igc_xdp_enable_pool(struct igc_adapter *adapter,
 		napi_disable(napi);
 	}
 
+	igc_set_queue_napi(adapter, queue_id, NULL);
 	set_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
 	set_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
 
@@ -133,6 +134,7 @@  static int igc_xdp_disable_pool(struct igc_adapter *adapter, u16 queue_id)
 	xsk_pool_dma_unmap(pool, IGC_RX_DMA_ATTR);
 	clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
 	clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
+	igc_set_queue_napi(adapter, queue_id, napi);
 
 	if (needs_reset) {
 		napi_enable(napi);