
[RFC] Regression in linux 2.6.32 virtio_net seen with vhost-net

Message ID 200912171232.26743.rusty@rustcorp.com.au
State RFC, archived
Delegated to: David Miller

Commit Message

Rusty Russell Dec. 17, 2009, 2:02 a.m. UTC
On Wed, 16 Dec 2009 11:52:18 pm Michael S. Tsirkin wrote:
> On Wed, Dec 16, 2009 at 11:15:38PM +1030, Rusty Russell wrote:
> > +	struct virtnet_info *vi =
> > +		container_of(xmit_napi, struct virtnet_info, xmit_napi);
> > +
> > +	if (netif_queue_stopped(vi->dev)) {
> 
> I am a bit concerned here: for example, on link down
> you do netif_stop_queue, and start on link up.
> So is it enough to check netif_queue_stopped
> to verify that tx is not running, and that it stopped
> because it was out of capacity?
> 
> It would be very bad if this ran in parallel with TX ...

Yeah, I wasn't happy.  This version uses the tx lock (we're single-queued,
so I used the __netif_tx_lock variant).

virtio_net: use NAPI for xmit (UNTESTED)

This is closer to the way tg3 and ixgbe do it: use the NAPI framework to
free transmitted packets.  It neatens things a little as well.

Changes since last version:

1) Use the tx lock for the xmit_poll to synchronize against
   start_xmit; it might be overkill, but it's simple.
2) Don't wake queue if the carrier is gone.

(Note: a side effect of this is that we are lazier in freeing old xmit skbs.
 This might be a slight win).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>


Comments

Michael S. Tsirkin Dec. 17, 2009, 9:25 a.m. UTC | #1
On Thu, Dec 17, 2009 at 12:32:26PM +1030, Rusty Russell wrote:
> On Wed, 16 Dec 2009 11:52:18 pm Michael S. Tsirkin wrote:
> > On Wed, Dec 16, 2009 at 11:15:38PM +1030, Rusty Russell wrote:
> > > +	struct virtnet_info *vi =
> > > +		container_of(xmit_napi, struct virtnet_info, xmit_napi);
> > > +
> > > +	if (netif_queue_stopped(vi->dev)) {
> > 
> > I am a bit concerned here: for example, on link down
> > you do netif_stop_queue, and start on link up.
> > So is it enough to check netif_queue_stopped
> > to verify that tx is not running, and that it stopped
> > because it was out of capacity?
> > 
> > It would be very bad if this ran in parallel with TX ...
> 
> Yeah, I wasn't happy.  This version uses the tx lock (we're single-queued,
> so I used the __netif_tx_lock variant).
> 
> virtio_net: use NAPI for xmit (UNTESTED)
> 
> This is closer to the way tg3 and ixgbe do it: use the NAPI framework to
> free transmitted packets.  It neatens things a little as well.
> 
> Changes since last version:
> 
> 1) Use the tx lock for the xmit_poll to synchronize against
>    start_xmit; it might be overkill, but it's simple.
> 2) Don't wake queue if the carrier is gone.
> 
> (Note: a side effect of this is that we are lazier in freeing old xmit skbs.
>  This might be a slight win).
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

That's very clean. Some questions below:

> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -47,6 +47,9 @@ struct virtnet_info
>  	struct napi_struct napi;
>  	unsigned int status;
>  
> +	/* We free packets and decide whether to restart xmit here. */
> +	struct napi_struct xmit_napi;
> +
>  	/* Number of input buffers, and max we've ever had. */
>  	unsigned int num, max;
>  
> @@ -60,6 +63,9 @@ struct virtnet_info
>  	struct sk_buff_head recv;
>  	struct sk_buff_head send;
>  
> +	/* Capacity left in xmit queue. */
> +	unsigned int capacity;
> +
>  	/* Work struct for refilling if we run low on memory. */
>  	struct delayed_work refill;
>  
> @@ -111,11 +117,8 @@ static void skb_xmit_done(struct virtque
>  {
>  	struct virtnet_info *vi = svq->vdev->priv;
>  
> -	/* Suppress further interrupts. */
> -	svq->vq_ops->disable_cb(svq);
> -
>  	/* We were probably waiting for more output buffers. */
> -	netif_wake_queue(vi->dev);
> +	napi_schedule(&vi->xmit_napi);
>  }
>  
>  static void receive_skb(struct net_device *dev, struct sk_buff *skb,
> @@ -455,6 +458,29 @@ static unsigned int free_old_xmit_skbs(s
>  	return tot_sgs;
>  }
>  
> +static int virtnet_xmit_poll(struct napi_struct *xmit_napi, int budget)
> +{
> +	struct virtnet_info *vi =
> +		container_of(xmit_napi, struct virtnet_info, xmit_napi);
> +
> +	/* Don't access vq/capacity at same time as start_xmit. */
> +	__netif_tx_lock(netdev_get_tx_queue(vi->dev, 0), smp_processor_id());

So now that we are locking, we could build a variant of this
without NAPI (maybe with trylock: we can't spin on the xmit lock
from hard irq context, can we?).  Possibly, if we do, that would be
a small enough change to be applicable in 2.6.32.
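
For illustration, a rough sketch of what such a trylock variant might
look like (hypothetical, not from this thread; it reuses the patch's
vi->capacity accounting):

	/* Hypothetical: reap and wake directly from the vq callback.
	 * __netif_tx_trylock() never spins, so it is usable from hard
	 * irq context; if start_xmit holds the lock, we skip and rely
	 * on it to recheck capacity itself. */
	static void skb_xmit_done(struct virtqueue *svq)
	{
		struct virtnet_info *vi = svq->vdev->priv;
		struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, 0);

		if (!__netif_tx_trylock(txq))
			return;

		vi->capacity += free_old_xmit_skbs(vi);
		if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
			svq->vq_ops->disable_cb(svq);
			if (likely(netif_carrier_ok(vi->dev)))
				netif_wake_queue(vi->dev);
		}
		__netif_tx_unlock(txq);
	}

Whether a failed trylock can ever lose a required wakeup is exactly
what Rusty questions in his reply below.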

> +
> +	vi->capacity += free_old_xmit_skbs(vi);

Should we build a variant of free_old_xmit_skbs
that takes a budget, to avoid starving others
while we poll the vq?
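
A sketch of what that might look like (hypothetical; the loop body is
assumed to mirror the existing free_old_xmit_skbs):

	/* Hypothetical budgeted variant: reap at most 'budget' skbs, so a
	 * busy vq cannot monopolize the poll.  The caller would only call
	 * napi_complete() when *done < budget. */
	static unsigned int free_old_xmit_skbs_budget(struct virtnet_info *vi,
						      int budget, int *done)
	{
		struct sk_buff *skb;
		unsigned int len, tot_sgs = 0;

		*done = 0;
		while (*done < budget &&
		       (skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) {
			__skb_unlink(skb, &vi->send);
			vi->dev->stats.tx_bytes += skb->len;
			vi->dev->stats.tx_packets++;
			tot_sgs += skb_vnet_hdr(skb)->num_sg;
			kfree_skb(skb);
			(*done)++;
		}
		return tot_sgs;
	}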

> +	if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
> +		/* Suppress further xmit interrupts. */
> +		vi->svq->vq_ops->disable_cb(vi->svq);
> +		napi_complete(xmit_napi);
> +
> +		/* Don't wake it if link is down. */
> +		if (likely(netif_carrier_ok(vi->dev)))
> +			netif_wake_queue(vi->dev);
> +	}
> +
> +	__netif_tx_unlock(netdev_get_tx_queue(vi->dev, 0));
> +	return 1;
> +}
> +
>  static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
>  {
>  	struct scatterlist sg[2+MAX_SKB_FRAGS];
> @@ -509,10 +535,6 @@ static netdev_tx_t start_xmit(struct sk_
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	int capacity;
>  
> -again:
> -	/* Free up any pending old buffers before queueing new ones. */
> -	free_old_xmit_skbs(vi);
> -
>  	/* Try to transmit */
>  	capacity = xmit_skb(vi, skb);
>  
> @@ -520,14 +542,13 @@ again:
>  	if (unlikely(capacity < 0)) {
>  		netif_stop_queue(dev);
>  		dev_warn(&dev->dev, "Unexpected full queue\n");
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			vi->svq->vq_ops->disable_cb(vi->svq);
> -			netif_start_queue(dev);
> -			goto again;
> -		}
> +		/* If we missed an interrupt, we let virtnet_xmit_poll deal. */
> +		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
> +			napi_schedule(&vi->xmit_napi);
>  		return NETDEV_TX_BUSY;
>  	}
>  	vi->svq->vq_ops->kick(vi->svq);
> +	vi->capacity = capacity;
>  
>  	/*
>  	 * Put new one in send queue.  You'd expect we'd need this before
> @@ -545,14 +566,13 @@ again:
>  	/* Apparently nice girls don't return TX_BUSY; stop the queue
>  	 * before it gets out of hand.  Naturally, this wastes entries. */
>  	if (capacity < 2+MAX_SKB_FRAGS) {
> -		netif_stop_queue(dev);
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			/* More just got used, free them then recheck. */
> -			capacity += free_old_xmit_skbs(vi);
> -			if (capacity >= 2+MAX_SKB_FRAGS) {
> -				netif_start_queue(dev);
> -				vi->svq->vq_ops->disable_cb(vi->svq);
> -			}
> +		/* Free old skbs; might make more capacity. */
> +		vi->capacity = capacity + free_old_xmit_skbs(vi);
> +		if (unlikely(vi->capacity < 2+MAX_SKB_FRAGS)) {
> +			netif_stop_queue(dev);
> +			/* Missed xmit irq? virtnet_xmit_poll will deal. */
> +			if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
> +				napi_schedule(&vi->xmit_napi);
>  		}
>  	}
>  
> @@ -590,6 +610,7 @@ static int virtnet_open(struct net_devic
>  	struct virtnet_info *vi = netdev_priv(dev);
>  
>  	napi_enable(&vi->napi);
> +	napi_enable(&vi->xmit_napi);
>  
>  	/* If all buffers were filled by other side before we napi_enabled, we
>  	 * won't get another interrupt, so process any outstanding packets
> @@ -652,6 +673,7 @@ static int virtnet_close(struct net_devi
>  	struct virtnet_info *vi = netdev_priv(dev);
>  
>  	napi_disable(&vi->napi);
> +	napi_disable(&vi->xmit_napi);
>  
>  	return 0;
>  }
> @@ -818,9 +840,13 @@ static void virtnet_update_status(struct
>  
>  	if (vi->status & VIRTIO_NET_S_LINK_UP) {
>  		netif_carrier_on(vi->dev);
> -		netif_wake_queue(vi->dev);
> +		/* Make sure virtnet_xmit_poll sees carrier enabled. */
> +		wmb();

I think this should be smp_wmb(); we are not synchronising with hardware
here. Right?
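
And an smp_wmb() in the writer only orders anything if the reader pairs
it with a read barrier, e.g. (hypothetical placement):

	/* ...inside virtnet_xmit_poll(), before waking the queue: */
	smp_rmb();	/* pairs with smp_wmb() in virtnet_update_status */
	if (likely(netif_carrier_ok(vi->dev)))
		netif_wake_queue(vi->dev);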

> +		napi_schedule(&vi->xmit_napi);
>  	} else {
>  		netif_carrier_off(vi->dev);
> +		/* Make sure virtnet_xmit_poll sees carrier disabled. */
> +		wmb();

And here.

>  		netif_stop_queue(vi->dev);
>  	}
>  }
> @@ -883,6 +909,7 @@ static int virtnet_probe(struct virtio_d
>  	/* Set up our device-specific information */
>  	vi = netdev_priv(dev);
>  	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
> +	netif_napi_add(dev, &vi->xmit_napi, virtnet_xmit_poll, 64);
>  	vi->dev = dev;
>  	vi->vdev = vdev;
>  	vdev->priv = vi;

Rusty Russell Dec. 18, 2009, 1:55 a.m. UTC | #2
On Thu, 17 Dec 2009 07:55:31 pm Michael S. Tsirkin wrote:
> > +static int virtnet_xmit_poll(struct napi_struct *xmit_napi, int budget)
> > +{
> > +	struct virtnet_info *vi =
> > +		container_of(xmit_napi, struct virtnet_info, xmit_napi);
> > +
> > +	/* Don't access vq/capacity at same time as start_xmit. */
> > +	__netif_tx_lock(netdev_get_tx_queue(vi->dev, 0), smp_processor_id());
> 
> So now that we are locking, we could build a variant of this
> without NAPI (maybe with trylock: we can't spin on the xmit lock
> from hard irq context, can we?).  Possibly, if we do, that would be
> a small enough change to be applicable in 2.6.32.

We'd need a separate lock and irq disable; it's not obvious to me that
trylock would never cause us to miss a required wakeup.
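
For concreteness, a sketch of that alternative (hypothetical; the
tx_reap_lock field is invented for illustration, and start_xmit would
have to take it around all vq/capacity accesses):

	static void skb_xmit_done(struct virtqueue *svq)
	{
		struct virtnet_info *vi = svq->vdev->priv;
		unsigned long flags;

		/* Every holder disables irqs while holding this lock, so
		 * spinning on it here in hard irq context cannot deadlock,
		 * unlike the tx lock. */
		spin_lock_irqsave(&vi->tx_reap_lock, flags);
		vi->capacity += free_old_xmit_skbs(vi);
		if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
			svq->vq_ops->disable_cb(svq);
			if (likely(netif_carrier_ok(vi->dev)))
				netif_wake_queue(vi->dev);
		}
		spin_unlock_irqrestore(&vi->tx_reap_lock, flags);
	}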

I'd rather get this right and backport, than introduce YA random change.
The NAPI change is no more complex than this.

From what Sridhar has said, this doesn't even fix the issue.  I'm confused...

Rusty.

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -47,6 +47,9 @@  struct virtnet_info
 	struct napi_struct napi;
 	unsigned int status;
 
+	/* We free packets and decide whether to restart xmit here. */
+	struct napi_struct xmit_napi;
+
 	/* Number of input buffers, and max we've ever had. */
 	unsigned int num, max;
 
@@ -60,6 +63,9 @@  struct virtnet_info
 	struct sk_buff_head recv;
 	struct sk_buff_head send;
 
+	/* Capacity left in xmit queue. */
+	unsigned int capacity;
+
 	/* Work struct for refilling if we run low on memory. */
 	struct delayed_work refill;
 
@@ -111,11 +117,8 @@  static void skb_xmit_done(struct virtque
 {
 	struct virtnet_info *vi = svq->vdev->priv;
 
-	/* Suppress further interrupts. */
-	svq->vq_ops->disable_cb(svq);
-
 	/* We were probably waiting for more output buffers. */
-	netif_wake_queue(vi->dev);
+	napi_schedule(&vi->xmit_napi);
 }
 
 static void receive_skb(struct net_device *dev, struct sk_buff *skb,
@@ -455,6 +458,29 @@  static unsigned int free_old_xmit_skbs(s
 	return tot_sgs;
 }
 
+static int virtnet_xmit_poll(struct napi_struct *xmit_napi, int budget)
+{
+	struct virtnet_info *vi =
+		container_of(xmit_napi, struct virtnet_info, xmit_napi);
+
+	/* Don't access vq/capacity at same time as start_xmit. */
+	__netif_tx_lock(netdev_get_tx_queue(vi->dev, 0), smp_processor_id());
+
+	vi->capacity += free_old_xmit_skbs(vi);
+	if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
+		/* Suppress further xmit interrupts. */
+		vi->svq->vq_ops->disable_cb(vi->svq);
+		napi_complete(xmit_napi);
+
+		/* Don't wake it if link is down. */
+		if (likely(netif_carrier_ok(vi->dev)))
+			netif_wake_queue(vi->dev);
+	}
+
+	__netif_tx_unlock(netdev_get_tx_queue(vi->dev, 0));
+	return 1;
+}
+
 static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 {
 	struct scatterlist sg[2+MAX_SKB_FRAGS];
@@ -509,10 +535,6 @@  static netdev_tx_t start_xmit(struct sk_
 	struct virtnet_info *vi = netdev_priv(dev);
 	int capacity;
 
-again:
-	/* Free up any pending old buffers before queueing new ones. */
-	free_old_xmit_skbs(vi);
-
 	/* Try to transmit */
 	capacity = xmit_skb(vi, skb);
 
@@ -520,14 +542,13 @@  again:
 	if (unlikely(capacity < 0)) {
 		netif_stop_queue(dev);
 		dev_warn(&dev->dev, "Unexpected full queue\n");
-		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
-			vi->svq->vq_ops->disable_cb(vi->svq);
-			netif_start_queue(dev);
-			goto again;
-		}
+		/* If we missed an interrupt, we let virtnet_xmit_poll deal. */
+		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
+			napi_schedule(&vi->xmit_napi);
 		return NETDEV_TX_BUSY;
 	}
 	vi->svq->vq_ops->kick(vi->svq);
+	vi->capacity = capacity;
 
 	/*
 	 * Put new one in send queue.  You'd expect we'd need this before
@@ -545,14 +566,13 @@  again:
 	/* Apparently nice girls don't return TX_BUSY; stop the queue
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
-		netif_stop_queue(dev);
-		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
-			/* More just got used, free them then recheck. */
-			capacity += free_old_xmit_skbs(vi);
-			if (capacity >= 2+MAX_SKB_FRAGS) {
-				netif_start_queue(dev);
-				vi->svq->vq_ops->disable_cb(vi->svq);
-			}
+		/* Free old skbs; might make more capacity. */
+		vi->capacity = capacity + free_old_xmit_skbs(vi);
+		if (unlikely(vi->capacity < 2+MAX_SKB_FRAGS)) {
+			netif_stop_queue(dev);
+			/* Missed xmit irq? virtnet_xmit_poll will deal. */
+			if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
+				napi_schedule(&vi->xmit_napi);
 		}
 	}
 
@@ -590,6 +610,7 @@  static int virtnet_open(struct net_devic
 	struct virtnet_info *vi = netdev_priv(dev);
 
 	napi_enable(&vi->napi);
+	napi_enable(&vi->xmit_napi);
 
 	/* If all buffers were filled by other side before we napi_enabled, we
 	 * won't get another interrupt, so process any outstanding packets
@@ -652,6 +673,7 @@  static int virtnet_close(struct net_devi
 	struct virtnet_info *vi = netdev_priv(dev);
 
 	napi_disable(&vi->napi);
+	napi_disable(&vi->xmit_napi);
 
 	return 0;
 }
@@ -818,9 +840,13 @@  static void virtnet_update_status(struct
 
 	if (vi->status & VIRTIO_NET_S_LINK_UP) {
 		netif_carrier_on(vi->dev);
-		netif_wake_queue(vi->dev);
+		/* Make sure virtnet_xmit_poll sees carrier enabled. */
+		wmb();
+		napi_schedule(&vi->xmit_napi);
 	} else {
 		netif_carrier_off(vi->dev);
+		/* Make sure virtnet_xmit_poll sees carrier disabled. */
+		wmb();
 		netif_stop_queue(vi->dev);
 	}
 }
@@ -883,6 +909,7 @@  static int virtnet_probe(struct virtio_d
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
 	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
+	netif_napi_add(dev, &vi->xmit_napi, virtnet_xmit_poll, 64);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;