diff mbox

[RFC] Regression in linux 2.6.32 virtio_net seen with vhost-net

Message ID 200912162315.38802.rusty@rustcorp.com.au
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Rusty Russell Dec. 16, 2009, 12:45 p.m. UTC
On Wed, 16 Dec 2009 01:23:31 pm Herbert Xu wrote:
> On Wed, Dec 16, 2009 at 01:11:40PM +1030, Rusty Russell wrote:
> > 
> >    Thanks for the hint.  They seem to use NAPI for xmit cleanup, so that's
> > what we should do?  I'll try, but such a rewrite doesn't belong in 2.6.32.
> 
> Well it depends.  Real drivers can't touch the hardware so they're
> stuck with whatever the hardware does.  For virtio we do have the
> flexibility of modifying the backend.
> 
> Having said that, for existing backends that will signal when there
> is just a single free entry on the queue something like NAPI could
> reduce the overhead associated with the IRQs.

OK, this is unfortunately untested, but wanted to send it out tonight:

virtio_net: use NAPI for xmit (UNTESTED)

This is closer to the way tg3 and ixgbe do it: use the NAPI framework to
free transmitted packets.  It neatens things a little as well.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Michael S. Tsirkin Dec. 16, 2009, 1:22 p.m. UTC | #1
On Wed, Dec 16, 2009 at 11:15:38PM +1030, Rusty Russell wrote:
> On Wed, 16 Dec 2009 01:23:31 pm Herbert Xu wrote:
> > On Wed, Dec 16, 2009 at 01:11:40PM +1030, Rusty Russell wrote:
> > > 
> > >    Thanks for the hint.  They seem to use NAPI for xmit cleanup, so that's
> > > what we should do?  I'll try, but such a rewrite doesn't belong in 2.6.32.
> > 
> > Well it depends.  Real drivers can't touch the hardware so they're
> > stuck with whatever the hardware does.  For virtio we do have the
> > flexibility of modifying the backend.
> > 
> > Having said that, for existing backends that will signal when there
> > is just a single free entry on the queue something like NAPI could
> > reduce the overhead associated with the IRQs.
> 
> OK, this is unfortunately untested, but wanted to send it out tonight:
> 
> virtio_net: use NAPI for xmit (UNTESTED)
> 
> This is closer to the way tg3 and ixgbe do it: use the NAPI framework to
> free transmitted packets.  It neatens things a little as well.

Looks very neat.

> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -47,6 +47,9 @@ struct virtnet_info
>  	struct napi_struct napi;
>  	unsigned int status;
>  
> +	/* We free packets and decide whether to restart xmit here. */
> +	struct napi_struct xmit_napi;
> +
>  	/* Number of input buffers, and max we've ever had. */
>  	unsigned int num, max;
>  
> @@ -60,6 +63,9 @@ struct virtnet_info
>  	struct sk_buff_head recv;
>  	struct sk_buff_head send;
>  
> +	/* Capacity left in xmit queue. */
> +	unsigned int capacity;
> +
>  	/* Work struct for refilling if we run low on memory. */
>  	struct delayed_work refill;
>  
> @@ -111,11 +117,8 @@ static void skb_xmit_done(struct virtque
>  {
>  	struct virtnet_info *vi = svq->vdev->priv;
>  
> -	/* Suppress further interrupts. */
> -	svq->vq_ops->disable_cb(svq);
> -
>  	/* We were probably waiting for more output buffers. */
> -	netif_wake_queue(vi->dev);
> +	napi_schedule(&vi->xmit_napi);
>  }
>  
>  static void receive_skb(struct net_device *dev, struct sk_buff *skb,
> @@ -446,7 +449,7 @@ static unsigned int free_old_xmit_skbs(s
>  
>  	while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) {
>  		pr_debug("Sent skb %p\n", skb);
> -		__skb_unlink(skb, &vi->send);
> +		skb_unlink(skb, &vi->send);
>  		vi->dev->stats.tx_bytes += skb->len;
>  		vi->dev->stats.tx_packets++;
>  		tot_sgs += skb_vnet_hdr(skb)->num_sg;
> @@ -455,6 +458,23 @@ static unsigned int free_old_xmit_skbs(s
>  	return tot_sgs;
>  }
>  
> +static int virtnet_xmit_poll(struct napi_struct *xmit_napi, int budget)
> +{
> +	struct virtnet_info *vi =
> +		container_of(xmit_napi, struct virtnet_info, xmit_napi);
> +
> +	if (netif_queue_stopped(vi->dev)) {

I am a bit concerned here: for example, on link down
you do netif_stop_queue, and start on link up.
So is it enough to check netif_queue_stopped
to verify that tx is not running and that this is because
it was out of capacity?

It would be very bad if this run in parallel with TX ...

If this is all safe, maybe add a comment explaining why ...

> +		vi->capacity += free_old_xmit_skbs(vi);
> +		if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
> +			/* Suppress further xmit interrupts. */
> +			vi->svq->vq_ops->disable_cb(vi->svq);
> +			napi_complete(xmit_napi);
> +			netif_wake_queue(vi->dev);
> +		}
> +	}
> +	return 1;
> +}
> +

One concern here: 
we are ignoring budget, is this an issue?


>  static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
>  {
>  	struct scatterlist sg[2+MAX_SKB_FRAGS];
> @@ -509,34 +529,22 @@ static netdev_tx_t start_xmit(struct sk_
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	int capacity;
>  
> -again:
> -	/* Free up any pending old buffers before queueing new ones. */
> -	free_old_xmit_skbs(vi);
> -
>  	/* Try to transmit */
> +	skb_queue_head(&vi->send, skb);
>  	capacity = xmit_skb(vi, skb);
>  
>  	/* This can happen with OOM and indirect buffers. */
>  	if (unlikely(capacity < 0)) {
> +		skb_unlink(skb, &vi->send);
>  		netif_stop_queue(dev);
>  		dev_warn(&dev->dev, "Unexpected full queue\n");
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			vi->svq->vq_ops->disable_cb(vi->svq);
> -			netif_start_queue(dev);
> -			goto again;
> -		}
> +		/* If we missed an interrupt, we let virtnet_xmit_poll deal. */
> +		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
> +			napi_schedule(&vi->xmit_napi);
>  		return NETDEV_TX_BUSY;
>  	}
>  	vi->svq->vq_ops->kick(vi->svq);
> -
> -	/*
> -	 * Put new one in send queue.  You'd expect we'd need this before
> -	 * xmit_skb calls add_buf(), since the callback can be triggered
> -	 * immediately after that.  But since the callback just triggers
> -	 * another call back here, normal network xmit locking prevents the
> -	 * race.
> -	 */
> -	__skb_queue_head(&vi->send, skb);
> +	vi->capacity = capacity;
>  
>  	/* Don't wait up for transmitted skbs to be freed. */
>  	skb_orphan(skb);
> @@ -544,15 +552,16 @@ again:
>  
>  	/* Apparently nice girls don't return TX_BUSY; stop the queue
>  	 * before it gets out of hand.  Naturally, this wastes entries. */
> -	if (capacity < 2+MAX_SKB_FRAGS) {
> -		netif_stop_queue(dev);
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			/* More just got used, free them then recheck. */
> -			capacity += free_old_xmit_skbs(vi);
> -			if (capacity >= 2+MAX_SKB_FRAGS) {
> -				netif_start_queue(dev);
> -				vi->svq->vq_ops->disable_cb(vi->svq);
> -			}
> +	if (unlikely(capacity < 2+MAX_SKB_FRAGS)) {
> +		/* Free old skbs; might make more capacity. */
> +		vi->capacity = capacity + free_old_xmit_skbs(vi);
> +		if (unlikely(vi->capacity < 2+MAX_SKB_FRAGS)) {
> +			/* Make sure virtnet_xmit_poll sees updated capacity */
> +			wmb();
> +			netif_stop_queue(dev);

Doesn't netif_stop_queue include an atomic op?
If yes, does not atomic imply wmb already?

> +			/* Missed xmit irq? virtnet_xmit_poll will deal. */
> +			if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
> +				napi_schedule(&vi->xmit_napi);
>  		}
>  	}
>  
> @@ -590,6 +599,7 @@ static int virtnet_open(struct net_devic
>  	struct virtnet_info *vi = netdev_priv(dev);
>  
>  	napi_enable(&vi->napi);
> +	napi_enable(&vi->xmit_napi);
>  
>  	/* If all buffers were filled by other side before we napi_enabled, we
>  	 * won't get another interrupt, so process any outstanding packets
> @@ -652,6 +662,7 @@ static int virtnet_close(struct net_devi
>  	struct virtnet_info *vi = netdev_priv(dev);
>  
>  	napi_disable(&vi->napi);
> +	napi_disable(&vi->xmit_napi);
>  
>  	return 0;
>  }
> @@ -883,6 +894,7 @@ static int virtnet_probe(struct virtio_d
>  	/* Set up our device-specific information */
>  	vi = netdev_priv(dev);
>  	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
> +	netif_napi_add(dev, &vi->xmit_napi, virtnet_xmit_poll, 64);
>  	vi->dev = dev;
>  	vi->vdev = vdev;
>  	vdev->priv = vi;
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 16, 2009, 1:30 p.m. UTC | #2
On Wed, Dec 16, 2009 at 11:15:38PM +1030, Rusty Russell wrote:
> 
> virtio_net: use NAPI for xmit (UNTESTED)
> 
> This is closer to the way tg3 and ixgbe do it: use the NAPI framework to
> free transmitted packets.  It neatens things a little as well.

Looks good to me.

> @@ -544,15 +552,16 @@ again:
>  
>  	/* Apparently nice girls don't return TX_BUSY; stop the queue
>  	 * before it gets out of hand.  Naturally, this wastes entries. */
> -	if (capacity < 2+MAX_SKB_FRAGS) {
> -		netif_stop_queue(dev);
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			/* More just got used, free them then recheck. */
> -			capacity += free_old_xmit_skbs(vi);
> -			if (capacity >= 2+MAX_SKB_FRAGS) {
> -				netif_start_queue(dev);
> -				vi->svq->vq_ops->disable_cb(vi->svq);
> -			}
> +	if (unlikely(capacity < 2+MAX_SKB_FRAGS)) {
> +		/* Free old skbs; might make more capacity. */
> +		vi->capacity = capacity + free_old_xmit_skbs(vi);
> +		if (unlikely(vi->capacity < 2+MAX_SKB_FRAGS)) {
> +			/* Make sure virtnet_xmit_poll sees updated capacity */
> +			wmb();

You only need smp_wmb() since this is protecting against the
guest only as opposed to the host.

Cheers,
Herbert Xu Dec. 16, 2009, 1:35 p.m. UTC | #3
On Wed, Dec 16, 2009 at 03:22:18PM +0200, Michael S. Tsirkin wrote:
>
> Doesn't netif_stop_queue include an atomic op?
> If yes, does not atomic imply wmb already?

Unfortunately it doesn't.

Cheers,
Michael S. Tsirkin Dec. 16, 2009, 1:38 p.m. UTC | #4
On Wed, Dec 16, 2009 at 09:35:12PM +0800, Herbert Xu wrote:
> On Wed, Dec 16, 2009 at 03:22:18PM +0200, Michael S. Tsirkin wrote:
> >
> > Doesn't netif_stop_queue include an atomic op?
> > If yes, does not atomic imply wmb already?
> 
> Unfortunately it doesn't.

So do we need an rmb after testing it as well?
Lock would be simpler :)

> Cheers,
> -- 
> Visit Openswan at http://www.openswan.org/
> Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 16, 2009, 1:48 p.m. UTC | #5
On Wed, Dec 16, 2009 at 03:38:07PM +0200, Michael S. Tsirkin wrote:
> On Wed, Dec 16, 2009 at 09:35:12PM +0800, Herbert Xu wrote:
> > On Wed, Dec 16, 2009 at 03:22:18PM +0200, Michael S. Tsirkin wrote:
> > >
> > > Doesn't netif_stop_queue include an atomic op?
> > > If yes, does not atomic imply wmb already?
> > 
> > Unfortunately it doesn't.
> 
> So do we need an rmb after testing it as well?

You're right.  We should have an smp_rmb on the other path.

Cheers,
Sridhar Samudrala Dec. 17, 2009, 1:43 a.m. UTC | #6
On Wed, 2009-12-16 at 23:15 +1030, Rusty Russell wrote:
> On Wed, 16 Dec 2009 01:23:31 pm Herbert Xu wrote:
> > On Wed, Dec 16, 2009 at 01:11:40PM +1030, Rusty Russell wrote:
> > > 
> > >    Thanks for the hint.  They seem to use NAPI for xmit cleanup, so that's
> > > what we should do?  I'll try, but such a rewrite doesn't belong in 2.6.32.
> > 
> > Well it depends.  Real drivers can't touch the hardware so they're
> > stuck with whatever the hardware does.  For virtio we do have the
> > flexibility of modifying the backend.
> > 
> > Having said that, for existing backends that will signal when there
> > is just a single free entry on the queue something like NAPI could
> > reduce the overhead associated with the IRQs.
> 
> OK, this is unfortunately untested, but wanted to send it out tonight:
> 
> virtio_net: use NAPI for xmit (UNTESTED)
> 
> This is closer to the way tg3 and ixgbe do it: use the NAPI framework to
> free transmitted packets.  It neatens things a little as well.

I had to change virtnet_xmit_poll() to get it working. See below.
With this change, i don't see any 'queue full' warnings, but requeues
are still happening at the qdisc level (sch_direct_xmit() finds that
tx queue is stopped and does requeues).

Not sure why the host is not able to keep up with the guest.

Thanks
Sridhar

> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -47,6 +47,9 @@ struct virtnet_info
>  	struct napi_struct napi;
>  	unsigned int status;
> 
> +	/* We free packets and decide whether to restart xmit here. */
> +	struct napi_struct xmit_napi;
> +
>  	/* Number of input buffers, and max we've ever had. */
>  	unsigned int num, max;
> 
> @@ -60,6 +63,9 @@ struct virtnet_info
>  	struct sk_buff_head recv;
>  	struct sk_buff_head send;
> 
> +	/* Capacity left in xmit queue. */
> +	unsigned int capacity;
> +
>  	/* Work struct for refilling if we run low on memory. */
>  	struct delayed_work refill;
> 
> @@ -111,11 +117,8 @@ static void skb_xmit_done(struct virtque
>  {
>  	struct virtnet_info *vi = svq->vdev->priv;
> 
> -	/* Suppress further interrupts. */
> -	svq->vq_ops->disable_cb(svq);
> -
>  	/* We were probably waiting for more output buffers. */
> -	netif_wake_queue(vi->dev);
> +	napi_schedule(&vi->xmit_napi);
>  }
> 
>  static void receive_skb(struct net_device *dev, struct sk_buff *skb,
> @@ -446,7 +449,7 @@ static unsigned int free_old_xmit_skbs(s
> 
>  	while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) {
>  		pr_debug("Sent skb %p\n", skb);
> -		__skb_unlink(skb, &vi->send);
> +		skb_unlink(skb, &vi->send);
>  		vi->dev->stats.tx_bytes += skb->len;
>  		vi->dev->stats.tx_packets++;
>  		tot_sgs += skb_vnet_hdr(skb)->num_sg;
> @@ -455,6 +458,23 @@ static unsigned int free_old_xmit_skbs(s
>  	return tot_sgs;
>  }
> 
> +static int virtnet_xmit_poll(struct napi_struct *xmit_napi, int budget)
> +{
> +	struct virtnet_info *vi =
> +		container_of(xmit_napi, struct virtnet_info, xmit_napi);
> +
> +	if (netif_queue_stopped(vi->dev)) {
> +		vi->capacity += free_old_xmit_skbs(vi);
> +		if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
> +			/* Suppress further xmit interrupts. */
> +			vi->svq->vq_ops->disable_cb(vi->svq);
> +			napi_complete(xmit_napi);
> +			netif_wake_queue(vi->dev);
> +		}
> +	}
> +	return 1;
> +}


+static int virtnet_xmit_poll(struct napi_struct *xmit_napi, int budget)
+{
+	struct virtnet_info *vi =
+		container_of(xmit_napi, struct virtnet_info, xmit_napi);
+
+	vi->capacity += free_old_xmit_skbs(vi);
+
+	if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
+		/* Suppress further xmit interrupts. */
+		vi->svq->vq_ops->disable_cb(vi->svq);
+		napi_complete(xmit_napi);
+		if (netif_queue_stopped(vi->dev))
+			netif_wake_queue(vi->dev);
+	}
+	return 1;
+}



> +
>  static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
>  {
>  	struct scatterlist sg[2+MAX_SKB_FRAGS];
> @@ -509,34 +529,22 @@ static netdev_tx_t start_xmit(struct sk_
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	int capacity;
> 
> -again:
> -	/* Free up any pending old buffers before queueing new ones. */
> -	free_old_xmit_skbs(vi);
> -
>  	/* Try to transmit */
> +	skb_queue_head(&vi->send, skb);
>  	capacity = xmit_skb(vi, skb);
> 
>  	/* This can happen with OOM and indirect buffers. */
>  	if (unlikely(capacity < 0)) {
> +		skb_unlink(skb, &vi->send);
>  		netif_stop_queue(dev);
>  		dev_warn(&dev->dev, "Unexpected full queue\n");
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			vi->svq->vq_ops->disable_cb(vi->svq);
> -			netif_start_queue(dev);
> -			goto again;
> -		}
> +		/* If we missed an interrupt, we let virtnet_xmit_poll deal. */
> +		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
> +			napi_schedule(&vi->xmit_napi);
>  		return NETDEV_TX_BUSY;
>  	}
>  	vi->svq->vq_ops->kick(vi->svq);
> -
> -	/*
> -	 * Put new one in send queue.  You'd expect we'd need this before
> -	 * xmit_skb calls add_buf(), since the callback can be triggered
> -	 * immediately after that.  But since the callback just triggers
> -	 * another call back here, normal network xmit locking prevents the
> -	 * race.
> -	 */
> -	__skb_queue_head(&vi->send, skb);
> +	vi->capacity = capacity;
> 
>  	/* Don't wait up for transmitted skbs to be freed. */
>  	skb_orphan(skb);
> @@ -544,15 +552,16 @@ again:
> 
>  	/* Apparently nice girls don't return TX_BUSY; stop the queue
>  	 * before it gets out of hand.  Naturally, this wastes entries. */
> -	if (capacity < 2+MAX_SKB_FRAGS) {
> -		netif_stop_queue(dev);
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			/* More just got used, free them then recheck. */
> -			capacity += free_old_xmit_skbs(vi);
> -			if (capacity >= 2+MAX_SKB_FRAGS) {
> -				netif_start_queue(dev);
> -				vi->svq->vq_ops->disable_cb(vi->svq);
> -			}
> +	if (unlikely(capacity < 2+MAX_SKB_FRAGS)) {
> +		/* Free old skbs; might make more capacity. */
> +		vi->capacity = capacity + free_old_xmit_skbs(vi);
> +		if (unlikely(vi->capacity < 2+MAX_SKB_FRAGS)) {
> +			/* Make sure virtnet_xmit_poll sees updated capacity */
> +			wmb();
> +			netif_stop_queue(dev);
> +			/* Missed xmit irq? virtnet_xmit_poll will deal. */
> +			if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
> +				napi_schedule(&vi->xmit_napi);
>  		}
>  	}
> 
> @@ -590,6 +599,7 @@ static int virtnet_open(struct net_devic
>  	struct virtnet_info *vi = netdev_priv(dev);
> 
>  	napi_enable(&vi->napi);
> +	napi_enable(&vi->xmit_napi);
> 
>  	/* If all buffers were filled by other side before we napi_enabled, we
>  	 * won't get another interrupt, so process any outstanding packets
> @@ -652,6 +662,7 @@ static int virtnet_close(struct net_devi
>  	struct virtnet_info *vi = netdev_priv(dev);
> 
>  	napi_disable(&vi->napi);
> +	napi_disable(&vi->xmit_napi);
> 
>  	return 0;
>  }
> @@ -883,6 +894,7 @@ static int virtnet_probe(struct virtio_d
>  	/* Set up our device-specific information */
>  	vi = netdev_priv(dev);
>  	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
> +	netif_napi_add(dev, &vi->xmit_napi, virtnet_xmit_poll, 64);
>  	vi->dev = dev;
>  	vi->vdev = vdev;
>  	vdev->priv = vi;

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 17, 2009, 3:12 a.m. UTC | #7
On Wed, Dec 16, 2009 at 05:43:00PM -0800, Sridhar Samudrala wrote:
>
> I had to change virtnet_xmit_poll() to get it working. See below.
> With this change, i don't see any 'queue full' warnings, but requeues
> are still happening at the qdisc level (sch_direct_xmit() finds that
> tx queue is stopped and does requeues).

More importantly how does the performance look?

Thanks,
Herbert Xu Dec. 17, 2009, 3:15 a.m. UTC | #8
On Wed, Dec 16, 2009 at 05:43:00PM -0800, Sridhar Samudrala wrote:
>
> I had to change virtnet_xmit_poll() to get it working. See below.
> With this change, i don't see any 'queue full' warnings, but requeues
> are still happening at the qdisc level (sch_direct_xmit() finds that
> tx queue is stopped and does requeues).

Actually this makes no sense.  The queue should only be stopped
at the end of a xmit run, in which case sch_direct_xmit should
return 0 so we should never see a requeue.

So if we're still seeing requeues then it hasn't been fixed
properly.

Cheers,
Sridhar Samudrala Dec. 17, 2009, 5:02 a.m. UTC | #9
Herbert Xu wrote:
> On Wed, Dec 16, 2009 at 05:43:00PM -0800, Sridhar Samudrala wrote:
>   
>> I had to change virtnet_xmit_poll() to get it working. See below.
>> With this change, i don't see any 'queue full' warnings, but requeues
>> are still happening at the qdisc level (sch_direct_xmit() finds that
>> tx queue is stopped and does requeues).
>>     
>
> More importantly how does the performance look?
>   
TCP stream throughput is around the same 3200Mb/sec

Thanks
Sridhar

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala Dec. 17, 2009, 5:05 a.m. UTC | #10
Herbert Xu wrote:
> On Wed, Dec 16, 2009 at 05:43:00PM -0800, Sridhar Samudrala wrote:
>   
>> I had to change virtnet_xmit_poll() to get it working. See below.
>> With this change, i don't see any 'queue full' warnings, but requeues
>> are still happening at the qdisc level (sch_direct_xmit() finds that
>> tx queue is stopped and does requeues).
>>     
>
> Actually this makes no sense.  The queue should only be stopped
> at the end of a xmit run, in which case sch_direct_xmit should
> return 0 so we should never see a requeue.
>   
I think sch_direct_xmit() is not even calling dev_hard_start_xmit() as 
the tx queue is stopped
and does a dev_requeue_skb() and returns NETDEV_TX_BUSY.

> So if we're still seeing requeues then it hasn't been fixed
> properly.
>
> Cheers,
>   


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 17, 2009, 6:28 a.m. UTC | #11
On Wed, Dec 16, 2009 at 09:05:32PM -0800, Sridhar Samudrala wrote:
>
> I think sch_direct_xmit() is not even calling dev_hard_start_xmit() as  
> the tx queue is stopped
> and does a dev_requeue_skb() and returns NETDEV_TX_BUSY.

Yes but if the queue was stopped then we shouldn't even get into
sch_direct_xmit.  While inside sch_direct_xmit, then only way for
the queue to be stopped is through dev_hard_start_xmit, and when
that happens sch_direct_xmit will exit so we should never see a
requeue if the driver is working properly.

Cheers,
Sridhar Samudrala Dec. 17, 2009, 6:45 a.m. UTC | #12
Herbert Xu wrote:
> On Wed, Dec 16, 2009 at 09:05:32PM -0800, Sridhar Samudrala wrote:
>   
>> I think sch_direct_xmit() is not even calling dev_hard_start_xmit() as  
>> the tx queue is stopped
>> and does a dev_requeue_skb() and returns NETDEV_TX_BUSY.
>>     
>
> Yes but if the queue was stopped then we shouldn't even get into
> sch_direct_xmit.  
I don't see any checks for txq_stopped in the callers of sch_direct_xmit() :
__dev_xmit_skb() and qdisc_restart().  Both these routines get the txq 
and call
sch_direct_xmit() which checks if tx queue is stopped or frozen.

Am i missing something?

Thanks
Sridhar
> While inside sch_direct_xmit, then only way for
> the queue to be stopped is through dev_hard_start_xmit, and when
> that happens sch_direct_xmit will exit so we should never see a
> requeue if the driver is working properly.
>
> Cheers,
>   


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krishna Kumar Dec. 17, 2009, 10:03 a.m. UTC | #13
> Sridhar Samudrala <sri@us.ibm.com>
>
> Re: [RFC PATCH] Regression in linux 2.6.32 virtio_net seen with vhost-net
>
> Herbert Xu wrote:
> > On Wed, Dec 16, 2009 at 09:05:32PM -0800, Sridhar Samudrala wrote:
> >
> >> I think sch_direct_xmit() is not even calling dev_hard_start_xmit() as

> >> the tx queue is stopped
> >> and does a dev_requeue_skb() and returns NETDEV_TX_BUSY.
> >>
> >
> > Yes but if the queue was stopped then we shouldn't even get into
> > sch_direct_xmit.
> I don't see any checks for txq_stopped in the callers of
sch_direct_xmit() :
> __dev_xmit_skb() and qdisc_restart().  Both these routines get the txq
> and call
> sch_direct_xmit() which checks if tx queue is stopped or frozen.
>
> Am i missing something?

Yes - dequeue_skb.

The final skb, before the queue was stopped, is transmitted by
the driver. The next time sch_direct_xmit is called, it gets a
skb and finds the device is stopped and requeue's the skb. For
all subsequent xmits, dequeue_skb returns NULL (and the other
caller - __dev_xmit_skb can never be called since qdisc_qlen is
true) and thus requeue's will not happen. This also means that
the number of requeues you see (eg 283K in one run) is the number
of times the queue was stopped and restarted. So it looks like
driver either:

1. didn't stop the queue when xmiting a packet successfully (the
      condition being that it would not be possible to xmit the
      next skb). But this doesn't seem to be the case.
2. wrongly restarted the queue. Possible - since a few places
      use both the start & wake queue api's.

Thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Dec. 17, 2009, 11:27 a.m. UTC | #14
On 17-12-2009 11:03, Krishna Kumar2 wrote:
>> Sridhar Samudrala <sri@us.ibm.com>
>>
>> Re: [RFC PATCH] Regression in linux 2.6.32 virtio_net seen with vhost-net
>>
>> Herbert Xu wrote:
>>> On Wed, Dec 16, 2009 at 09:05:32PM -0800, Sridhar Samudrala wrote:
>>>
>>>> I think sch_direct_xmit() is not even calling dev_hard_start_xmit() as
> 
>>>> the tx queue is stopped
>>>> and does a dev_requeue_skb() and returns NETDEV_TX_BUSY.
>>>>
>>> Yes but if the queue was stopped then we shouldn't even get into
>>> sch_direct_xmit.
>> I don't see any checks for txq_stopped in the callers of
> sch_direct_xmit() :
>> __dev_xmit_skb() and qdisc_restart().  Both these routines get the txq
>> and call
>> sch_direct_xmit() which checks if tx queue is stopped or frozen.
>>
>> Am i missing something?
> 
> Yes - dequeue_skb.
> 
> The final skb, before the queue was stopped, is transmitted by
> the driver. The next time sch_direct_xmit is called, it gets a
> skb and finds the device is stopped and requeue's the skb.

So we _should_ get into sch_direct_xmit when the queue was stopped...
I guess Herbert might forget the multiqueue change, and Sridhar isn't
missing much. ;-)

Thanks,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 17, 2009, 11:45 a.m. UTC | #15
On Thu, Dec 17, 2009 at 11:27:55AM +0000, Jarek Poplawski wrote:
>
> So we _should_ get into sch_direct_xmit when the queue was stopped...
> I guess Herbert might forget the multiqueue change, and Sridhar isn't
> missing much. ;-)

Hmm, if that's the case then this new multiqueue code is seriously
broken.  It means that once the queue is stopped, every new tx packet
will cause an unnecessary dequeue/requeue, up to 1000.

Cheers,
Herbert Xu Dec. 17, 2009, 11:49 a.m. UTC | #16
On Thu, Dec 17, 2009 at 07:45:35PM +0800, Herbert Xu wrote:
> On Thu, Dec 17, 2009 at 11:27:55AM +0000, Jarek Poplawski wrote:
> >
> > So we _should_ get into sch_direct_xmit when the queue was stopped...
> > I guess Herbert might forget the multiqueue change, and Sridhar isn't
> > missing much. ;-)
> 
> Hmm, if that's the case then this new multiqueue code is seriously
> broken.  It means that once the queue is stopped, every new tx packet
> will cause an unnecessary dequeue/requeue, up to 1000.

And indeed, this new requeue behaviour was introduced in 2.6.32.

Sridhar, please build the 2.6.32 virtio-net driver under a 2.6.31
kernel and see if the regression persists.  Alternatively, build
the virtio-net driver under 2.6.32 and see whether it still performs
as well.

Thanks,
Krishna Kumar Dec. 17, 2009, 11:56 a.m. UTC | #17
> Jarek Poplawski <jarkao2@gmail.com>
>
> >>> On Wed, Dec 16, 2009 at 09:05:32PM -0800, Sridhar Samudrala wrote:
> >>>
> >>>> I think sch_direct_xmit() is not even calling dev_hard_start_xmit()
as
> >
> >>>> the tx queue is stopped
> >>>> and does a dev_requeue_skb() and returns NETDEV_TX_BUSY.
> >>>>
> >>> Yes but if the queue was stopped then we shouldn't even get into
> >>> sch_direct_xmit.
> >> I don't see any checks for txq_stopped in the callers of
> > sch_direct_xmit() :
> >> __dev_xmit_skb() and qdisc_restart().  Both these routines get the txq
> >> and call
> >> sch_direct_xmit() which checks if tx queue is stopped or frozen.
> >>
> >> Am i missing something?
> >
> > Yes - dequeue_skb.
> >
> > The final skb, before the queue was stopped, is transmitted by
> > the driver. The next time sch_direct_xmit is called, it gets a
> > skb and finds the device is stopped and requeue's the skb.
>
> So we _should_ get into sch_direct_xmit when the queue was stopped...
> I guess Herbert might forget the multiqueue change, and Sridhar isn't
> missing much. ;-)

I meant his question on who is checking tx queue stopped before
calling driver xmit. In stopped queue case, qdisc_restart makes
sure sch_direct_xmit is not called for all subsequent skbs.

Sridhar is seeing 280K requeue's, and that probably implies device
was stopped and wrongly restarted immediately. So the next xmit in
the kernel found the txq is not stopped and called the xmit handler,
get a BUSY, requeue, and so on. That would also explain why his BW
drops so much - all false starts (besides 19% of all skbs being
requeued). I assume that each time when we check:

      if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
            ret = dev_hard_start_xmit(skb, dev, txq);
it passes the check and dev_hard_start_xmit is called wrongly.

#Requeues: 283575
#total skbs: 1469482
Percentage requeued: 19.29%

Thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krishna Kumar Dec. 17, 2009, 11:59 a.m. UTC | #18
> Herbert Xu <herbert@gondor.apana.org.au>
> Sent by: netdev-owner@vger.kernel.org
>
> 12/17/2009 05:15 PM
>
> To
>
> Jarek Poplawski <jarkao2@gmail.com>
>
> cc
>
> Krishna Kumar2/India/IBM@IBMIN, Sridhar Samudrala <sri@us.ibm.com>,
> mst@redhat.com, netdev@vger.kernel.org, Rusty Russell
> <rusty@rustcorp.com.au>, "David S. Miller" <davem@davemloft.net>
>
> Subject
>
> Re: [RFC PATCH] Regression in linux 2.6.32 virtio_net seen with vhost-net
>
> On Thu, Dec 17, 2009 at 11:27:55AM +0000, Jarek Poplawski wrote:
> >
> > So we _should_ get into sch_direct_xmit when the queue was stopped...
> > I guess Herbert might forget the multiqueue change, and Sridhar isn't
> > missing much. ;-)
>
> Hmm, if that's the case then this new multiqueue code is seriously
> broken.  It means that once the queue is stopped, every new tx packet
> will cause an unnecessary dequeue/requeue, up to 1000.

I think it is fine. The first skb after the stop is requeue'd. All
subsequent new skbs get screened out at qdisc_restart() -> dequeue_skb
-> NULL -> no work to do. Hope I didn't miss something now :-)

Thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 17, 2009, 12:08 p.m. UTC | #19
On Thu, Dec 17, 2009 at 07:49:37PM +0800, Herbert Xu wrote:
>
> And indeed, this new requeue behaviour was introduced in 2.6.32.

Actually no, it was merely rearranged.  It would appear that this
behaviour has been around for over a year.

It is even worse than I thought.  Not only would new tx packets
trigger this unnecessary queue run, but once it is triggered, it
would consume 100% CPU as dev_requeue_skb unconditionally reschedules
the queue!

Tell me this is not true please...

Thanks,
Jarek Poplawski Dec. 17, 2009, 12:19 p.m. UTC | #20
On Thu, Dec 17, 2009 at 07:45:35PM +0800, Herbert Xu wrote:
> On Thu, Dec 17, 2009 at 11:27:55AM +0000, Jarek Poplawski wrote:
> >
> > So we _should_ get into sch_direct_xmit when the queue was stopped...
> > I guess Herbert might forget the multiqueue change, and Sridhar isn't
> > missing much. ;-)
> 
> Hmm, if that's the case then this new multiqueue code is seriously
> broken.  It means that once the queue is stopped, every new tx packet
> will cause an unnecessary dequeue/requeue, up to 1000.

Hmm, I can even remember who inspired this change ;-) But I'm not sure
this really broken: it seems it's "optimized" for non-stopped case.

Btw, IIRC, Krishna worked on changing this for uniqueues - I wonder if
there could be much difference in testing?

Cheers,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krishna Kumar Dec. 17, 2009, 12:27 p.m. UTC | #21
Herbert Xu <herbert@gondor.apana.org.au> wrote on 12/17/2009 05:38:56 PM:

> > And indeed, this new requeue behaviour was introduced in 2.6.32.
>
> Actually no, it was merely rearranged.  It would appear that this
> behaviour has been around for over a year.
>
> It is even worse than I thought.  Not only would new tx packets
> trigger this unnecessary queue run, but once it is triggered, it
> would consume 100% CPU as dev_requeue_skb unconditionally reschedules
> the queue!
>
> Tell me this is not true please...

Hi Herbert,

I am confused. Isn't dequeue_skb returning NULL for 2nd - nth skbs
till the queue is restarted, so how is it broken?

__dev_xmit_skb calls sch_direct_xmit only once after the device is
stopped, and this call results in skb being re-queued. Next call to
__dev_xmit_skb calls qdisc_restart which will always bail out since
dev_dequeue returns NULL. We also avoid rescheduling the queue after
the first resched. I also think the resched in requeue is probably
not required, since the device will call netif_tx_wake_queue anyway.

dev_dequeue()
{
      if (unlikely(skb)) {
            struct net_device *dev = qdisc_dev(q);
            struct netdev_queue *txq;

            /* check the reason of requeuing without tx lock first */
            txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
            if (!netif_tx_queue_stopped(txq) &&
                !netif_tx_queue_frozen(txq)) {
                        q->gso_skb = NULL;
                        q->q.qlen--;
            } else
                  skb = NULL;
}

Thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Dec. 17, 2009, 12:42 p.m. UTC | #22
On Thu, Dec 17, 2009 at 05:57:15PM +0530, Krishna Kumar2 wrote:
> the first resched. I also think the resched in requeue is probably
> not required, since the device will call netif_tx_wake_queue anyway.

IIRC there were mentioned some possibly broken drivers around these
changes. So it would require a little bit of reviewing...

Thanks,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 17, 2009, 12:56 p.m. UTC | #23
On Thu, Dec 17, 2009 at 12:42:54PM +0000, Jarek Poplawski wrote:
> On Thu, Dec 17, 2009 at 05:57:15PM +0530, Krishna Kumar2 wrote:
> > the first resched. I also think the resched in requeue is probably
> > not required, since the device will call netif_tx_wake_queue anyway.

Actually, the requeue is there for the case where we hit lock
contention on the device TX lock so it is needed for that at
least.

However, for the case where the device queue is stopped you're
right we don't need to reschedule.

Cheers,
Krishna Kumar Dec. 17, 2009, 1:04 p.m. UTC | #24
Jarek Poplawski <jarkao2@gmail.com> wrote on 12/17/2009 06:12:54 PM:
>
> On Thu, Dec 17, 2009 at 05:57:15PM +0530, Krishna Kumar2 wrote:
> > the first resched. I also think the resched in requeue is probably
> > not required, since the device will call netif_tx_wake_queue anyway.
>
> IIRC there were mentioned some possibly broken drivers around these
> changes. So it would require a little bit of reviewing...

Yes, that was the reason I dropped that patch before submitting. I
was not sure of that optimization since I didn't check that all,
esp antique drivers, would resched correctly, but that was only a
minor optimization (avoiding one call of resched per stop).

However, the qdisc_restart/dev_dequeue path is clean since the stopped
txq case is handled very early, and no actual dequeue/requeue/resched
happens till the device restarts the txq.

Thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Dec. 17, 2009, 1:17 p.m. UTC | #25
On Thu, Dec 17, 2009 at 05:26:37PM +0530, Krishna Kumar2 wrote:
> Sridhar is seeing 280K requeue's, and that probably implies device
> was stopped and wrongly restarted immediately. So the next xmit in
> the kernel found the txq is not stopped and called the xmit handler,
> get a BUSY, requeue, and so on. That would also explain why his BW
> drops so much - all false starts (besides 19% of all skbs being
> requeued). I assume that each time when we check:
> 
>       if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
>             ret = dev_hard_start_xmit(skb, dev, txq);
> it passes the check and dev_hard_start_xmit is called wrongly.
> 
> #Requeues: 283575
> #total skbs: 1469482
> Percentage requeued: 19.29%

I haven't followed this thread, so I'm not sure what are you looking
for, but can't these requeues/drops mean some hardware limits were
reached? I wonder why there are compared linux-2.6.32 vs. 2.6.31.6
with different test conditions (avg. packet sizes: 16800 vs. 64400)?

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krishna Kumar Dec. 17, 2009, 1:22 p.m. UTC | #26
Herbert Xu <herbert@gondor.apana.org.au> wrote on 12/17/2009 06:26:54 PM:

>
> > On Thu, Dec 17, 2009 at 05:57:15PM +0530, Krishna Kumar2 wrote:
> > > the first resched. I also think the resched in requeue is probably
> > > not required, since the device will call netif_tx_wake_queue anyway.
>
> Actually, the requeue is there for the case where we hit lock
> contention on the device TX lock so it is needed for that at
> least.
>
> However, for the case where the device queue is stopped you're
> right we don't need to reschedule.

Yes, my patch had a handle for both cases by adding another arg
to dev_requeue_skb, something like:

static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q,
                                  int reason)
{
      q->gso_skb = skb;
      q->qstats.requeues++;
      q->q.qlen++;    /* it's still part of the queue */

      if (reason != NETDEV_TX_BUSY)
            __netif_schedule(q);

      return 0;
}

and pass the reason in the two places that call requeue. I didn't
submit it since it felt a small optimization - avoiding one resched
per stopped txq (not for every skb), plus I was not sure if every
driver would properly call netif_tx_wake_queue.

thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 17, 2009, 1:44 p.m. UTC | #27
On Thu, Dec 17, 2009 at 05:57:15PM +0530, Krishna Kumar2 wrote:
> 
> I am confused. Isn't dequeue_skb returning NULL for 2nd - nth skbs
> till the queue is restarted, so how is it broken?

Sorry I didn't read dev_dequeue carefully enough.  Indeed it
correctly checks the queue status so the loop that I thought
was there doesn't exist.

The requeues are probably caused by the driver still.  Was Sridhar
testing Rusty's latest patch?

Thanks,
Krishna Kumar Dec. 17, 2009, 2:10 p.m. UTC | #28
Jarek Poplawski <jarkao2@gmail.com> wrote on 12/17/2009 06:47:09 PM:

> On Thu, Dec 17, 2009 at 05:26:37PM +0530, Krishna Kumar2 wrote:
> > Sridhar is seeing 280K requeue's, and that probably implies device
> > was stopped and wrongly restarted immediately. So the next xmit in
> > the kernel found the txq is not stopped and called the xmit handler,
> > get a BUSY, requeue, and so on. That would also explain why his BW
> > drops so much - all false starts (besides 19% of all skbs being
> > requeued). I assume that each time when we check:
> >
> >       if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
> >             ret = dev_hard_start_xmit(skb, dev, txq);
> > it passes the check and dev_hard_start_xmit is called wrongly.
> >
> > #Requeues: 283575
> > #total skbs: 1469482
> > Percentage requeued: 19.29%
>
> I haven't followed this thread, so I'm not sure what are you looking
> for, but can't these requeues/drops mean some hardware limits were
> reached? I wonder why there are compared linux-2.6.32 vs. 2.6.31.6
> with different test conditions (avg. packet sizes: 16800 vs. 64400)?

Hi Jarek,

That is a good point. I am not sure why the avg packet sizes are
so different in the bstats. Did GSO change in these two versions?

I took the numbers from Sridhar's mail before the NAPI patch. I think
having 280K requeue's in 1 min means that the driver is waking up the
queue when it should not. The NAPI patch fixes that, but he still
reported seeing requeue's.

thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 17, 2009, 2:16 p.m. UTC | #29
On Thu, Dec 17, 2009 at 07:40:32PM +0530, Krishna Kumar2 wrote:
> 
> I took the numbers from Sridhar's mail before the NAPI patch. I think
> having 280K requeue's in 1 min means that the driver is waking up the
> queue when it should not. The NAPI patch fixes that, but he still
> reported seeing requeue's.

Since we're still in the dark as to why this is happening, it
would be really helpful if we could test 2.6.31's virtio-net
under 2.6.32 or vice versa to determine whether it is a driver
change or a core change that's creating this regression.

Cheers,
Krishna Kumar Dec. 17, 2009, 2:35 p.m. UTC | #30
Herbert Xu <herbert@gondor.apana.org.au> wrote on 12/17/2009 07:14:08 PM:

> > I am confused. Isn't dequeue_skb returning NULL for 2nd - nth skbs
> > till the queue is restarted, so how is it broken?
>
> Sorry I didn't read dev_dequeue carefully enough.  Indeed it
> correctly checks the queue status so the loop that I thought
> was there doesn't exist.
>
> The requeues are probably caused by the driver still.  Was Sridhar
> testing Rusty's latest patch?

I was using numbers from his first test run. He has not posted
the requeue numbers for the NAPI run. But he ran with NAPI, and
said:

"I had to change virtnet_xmit_poll() to get it working. See below.
With this change, i don't see any 'queue full' warnings, but requeues
are still happening at the qdisc level (sch_direct_xmit() finds that
tx queue is stopped and does requeues)".

I think the bug is in this check:

+     if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
+           /* Suppress further xmit interrupts. */
+           vi->svq->vq_ops->disable_cb(vi->svq);
+           napi_complete(xmit_napi);
+
+           /* Don't wake it if link is down. */
+           if (likely(netif_carrier_ok(vi->vdev)))
+                 netif_wake_queue(vi->dev);
+     }

We wake up too fast, just enough space for one more skb to be sent
before the queue is stopped again. And hence no more messages about
queue full, but lot of requeues. The qdisc code is doing the correct
thing, but we need to increase the limit here.

Can we try with some big number, 64, 128?

Thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu Dec. 17, 2009, 2:36 p.m. UTC | #31
On Thu, Dec 17, 2009 at 08:05:02PM +0530, Krishna Kumar2 wrote:
>
> I think the bug is in this check:
> 
> +     if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
> +           /* Suppress further xmit interrupts. */
> +           vi->svq->vq_ops->disable_cb(vi->svq);
> +           napi_complete(xmit_napi);
> +
> +           /* Don't wake it if link is down. */
> +           if (likely(netif_carrier_ok(vi->vdev)))
> +                 netif_wake_queue(vi->dev);
> +     }
> 
> We wake up too fast, just enough space for one more skb to be sent
> before the queue is stopped again. And hence no more messages about
> queue full, but lot of requeues. The qdisc code is doing the correct
> thing, but we need to increase the limit here.
> 
> Can we try with some big number, 64, 128?

Good point.  Sridhar, could you please test doubling or quadrupling
the threshold? 128 is probably a bit too much though as the ring
only has 256 entries.

Cheers,
Sridhar Samudrala Dec. 17, 2009, 9:50 p.m. UTC | #32
On Thu, 2009-12-17 at 22:36 +0800, Herbert Xu wrote:
> On Thu, Dec 17, 2009 at 08:05:02PM +0530, Krishna Kumar2 wrote:
> >
> > I think the bug is in this check:
> > 
> > +     if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
> > +           /* Suppress further xmit interrupts. */
> > +           vi->svq->vq_ops->disable_cb(vi->svq);
> > +           napi_complete(xmit_napi);
> > +
> > +           /* Don't wake it if link is down. */
> > +           if (likely(netif_carrier_ok(vi->vdev)))
> > +                 netif_wake_queue(vi->dev);
> > +     }
> > 
> > We wake up too fast, just enough space for one more skb to be sent
> > before the queue is stopped again. And hence no more messages about
> > queue full, but lot of requeues. The qdisc code is doing the correct
> > thing, but we need to increase the limit here.
> > 
> > Can we try with some big number, 64, 128?
> 
> Good point.  Sridhar, could you please test doubling or quadrupling
> the threshold? 128 is probably a bit too much though as the ring
> only has 256 entries.

Increasing the wakeup threshold value reduced the number of requeues, but
didn't eliminate them. The throughput improved a little, but the CPU utilization
went up.
I don't see any 'queue full' warning messages from the driver and hence the driver
is not returning NETDEV_TX_BUSY. The requeues are happening in sch_direct_xmit()
as it is finding that the tx queue is stopped.

I could not get 2.6.31 virtio-net driver to work with 2.6.32 kernel by simply
replacing virtio-net.c. The compile and build went through fine, but the guest
is not seeing the virtio-net device when it comes up. 
I think it is a driver issue, not a core issue as I am able to get good results
by not stopping the queue early in start_xmit() and dropping the skb when xmit_skb()
fails even with 2.6.32 kernel. I think this behavior is somewhat similar to 2.6.31
virtio-net driver as it caches 1 skb and drops any further skbs when ring is full
in its start_xmit routine. 

2.6.32 + Rusty's xmit_napi v2 patch
-----------------------------------
$ ./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    60.03      3255.22   87.16    82.57    2.193   4.156  
$ tc -s qdisc show dev eth0 
qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 24524210050 bytes 1482737 pkt (dropped 0, overlimits 0 requeues 339101) 
 rate 0bit 0pps backlog 0b 0p requeues 339101 

2.6.32 + Rusty's xmit_napi v2 patch + wakeup threshold=64
---------------------------------------------------------
$ ./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    60.03      3356.71   95.41    89.56    2.329   4.372  
[sridhar@localhost ~]$ tc -s qdisc show dev eth0 
qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 25290227186 bytes 1555119 pkt (dropped 0, overlimits 0 requeues 78179) 
 rate 0bit 0pps backlog 0b 0p requeues 78179 

2.6.32 + Rusty's xmit_napi v2 patch + wakeup threshold=128
----------------------------------------------------------
$./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    60.03      3413.79   96.30    89.79    2.311   4.309  
[sridhar@localhost ~]$ tc -s qdisc show dev eth0 
qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 25719585472 bytes 1579448 pkt (dropped 0, overlimits 0 requeues 40299) 
 rate 0bit 0pps backlog 0b 0p requeues 40299 

2.6.32 + Rusty's xmit_napi v2 patch + don't stop early & drop skb on fail patch
-------------------------------------------------------------------------------
$./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    60.03      7741.65   70.09    72.84    0.742   1.542  
[sridhar@localhost ~]$ tc -s qdisc show dev eth0 
qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 58149531018 bytes 897991 pkt (dropped 0, overlimits 0 requeues 1) 
 rate 0bit 0pps backlog 0b 0p requeues 1



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala Dec. 17, 2009, 10:28 p.m. UTC | #33
On Thu, 2009-12-17 at 13:50 -0800, Sridhar Samudrala wrote:
> On Thu, 2009-12-17 at 22:36 +0800, Herbert Xu wrote:
> > On Thu, Dec 17, 2009 at 08:05:02PM +0530, Krishna Kumar2 wrote:
> > >
> > > I think the bug is in this check:
> > > 
> > > +     if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
> > > +           /* Suppress further xmit interrupts. */
> > > +           vi->svq->vq_ops->disable_cb(vi->svq);
> > > +           napi_complete(xmit_napi);
> > > +
> > > +           /* Don't wake it if link is down. */
> > > +           if (likely(netif_carrier_ok(vi->vdev)))
> > > +                 netif_wake_queue(vi->dev);
> > > +     }
> > > 
> > > We wake up too fast, just enough space for one more skb to be sent
> > > before the queue is stopped again. And hence no more messages about
> > > queue full, but lot of requeues. The qdisc code is doing the correct
> > > thing, but we need to increase the limit here.
> > > 
> > > Can we try with some big number, 64, 128?
> > 
> > Good point.  Sridhar, could you please test doubling or quadrupling
> > the threshold? 128 is probably a bit too much though as the ring
> > only has 256 entries.
> 
> Increasing the wakeup threshold value reduced the number of requeues, but
> didn't eliminate them. The throughput improved a little, but the CPU utilization
> went up.
> I don't see any 'queue full' warning messages from the driver and hence the driver
> is not returning NETDEV_TX_BUSY. The requeues are happening in sch_direct_xmit()
> as it is finding that the tx queue is stopped.
> 
> I could not get 2.6.31 virtio-net driver to work with 2.6.32 kernel by simply
> replacing virtio-net.c. The compile and build went through fine, but the guest
> is not seeing the virtio-net device when it comes up. 
> I think it is a driver issue, not a core issue as I am able to get good results
> by not stopping the queue early in start_xmit() and dropping the skb when xmit_skb()
> fails even with 2.6.32 kernel. I think this behavior is somewhat similar to 2.6.31
> virtio-net driver as it caches 1 skb and drops any further skbs when ring is full
> in its start_xmit routine. 

Another interesting observation that Jarek pointed out is that if we drop skb's, i guess
it is causing TCP to backoff and enabling TCP GSO code to send much larger packets(64K 
compared to 16K). So although the throughput is higher, packets/sec is less in that
case.

> 
> 2.6.32 + Rusty's xmit_napi v2 patch
> -----------------------------------
> $ ./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384  16384    60.03      3255.22   87.16    82.57    2.193   4.156  
> $ tc -s qdisc show dev eth0 
> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>  Sent 24524210050 bytes 1482737 pkt (dropped 0, overlimits 0 requeues 339101) 
>  rate 0bit 0pps backlog 0b 0p requeues 339101 
> 
> 2.6.32 + Rusty's xmit_napi v2 patch + wakeup threshold=64
> ---------------------------------------------------------
> $ ./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384  16384    60.03      3356.71   95.41    89.56    2.329   4.372  
> [sridhar@localhost ~]$ tc -s qdisc show dev eth0 
> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>  Sent 25290227186 bytes 1555119 pkt (dropped 0, overlimits 0 requeues 78179) 
>  rate 0bit 0pps backlog 0b 0p requeues 78179 
> 
> 2.6.32 + Rusty's xmit_napi v2 patch + wakeup threshold=128
> ----------------------------------------------------------
> $./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384  16384    60.03      3413.79   96.30    89.79    2.311   4.309  
> [sridhar@localhost ~]$ tc -s qdisc show dev eth0 
> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>  Sent 25719585472 bytes 1579448 pkt (dropped 0, overlimits 0 requeues 40299) 
>  rate 0bit 0pps backlog 0b 0p requeues 40299 
> 
> 2.6.32 + Rusty's xmit_napi v2 patch + don't stop early & drop skb on fail patch
> -------------------------------------------------------------------------------
> $./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384  16384    60.03      7741.65   70.09    72.84    0.742   1.542  
> [sridhar@localhost ~]$ tc -s qdisc show dev eth0 
> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>  Sent 58149531018 bytes 897991 pkt (dropped 0, overlimits 0 requeues 1) 
>  rate 0bit 0pps backlog 0b 0p requeues 1
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Dec. 17, 2009, 10:41 p.m. UTC | #34
On Thu, Dec 17, 2009 at 02:28:10PM -0800, Sridhar Samudrala wrote:
> Another interesting observation that Jarek pointed out is that if we drop skb's, i guess
> it is causing TCP to backoff and enabling TCP GSO code to send much larger packets(64K 
> compared to 16K). So although the throughput is higher, packets/sec is less in that
> case.

Maybe you're right, but there would be useful to have TCP-independent
tests to compare, or at least without such impact, so with GSO disabled.

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krishna Kumar Dec. 18, 2009, 1:46 p.m. UTC | #35
Sridhar Samudrala <sri@us.ibm.com> wrote on 12/18/2009 03:20:08 AM:

> Increasing the wakeup threshold value reduced the number of requeues, but
> didn't eliminate them. The throughput improved a little, but the CPU
> utilization
> went up.
> I don't see any 'queue full' warning messages from the driver and
> hence the driver
> is not returning NETDEV_TX_BUSY. The requeues are happening in
> sch_direct_xmit()
> as it is finding that the tx queue is stopped.
>
> I could not get 2.6.31 virtio-net driver to work with 2.6.32 kernel by
simply
> replacing virtio-net.c. The compile and build went through fine, butthe
guest
> is not seeing the virtio-net device when it comes up.
> I think it is a driver issue, not a core issue as I am able to get
> good results
> by not stopping the queue early in start_xmit() and dropping the skb
> when xmit_skb()
> fails even with 2.6.32 kernel. I think this behavior is somewhat
> similar to 2.6.31
> virtio-net driver as it caches 1 skb and drops any further skbs when
> ring is full
> in its start_xmit routine.
>
> 2.6.32 + Rusty's xmit_napi v2 patch
> -----------------------------------
> $ ./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.
> 122.1 (192.168.122.1) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service
Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send
Recv
> Size   Size    Size     Time     Throughput  local    remote   local
remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB
us/KB
>
>  87380  16384  16384    60.03      3255.22   87.16    82.57    2.193
4.156
> $ tc -s qdisc show dev eth0
> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1
> 1 1 1 1 1 1 1
>  Sent 24524210050 bytes 1482737 pkt (dropped 0, overlimits 0 requeues
339101)
>  rate 0bit 0pps backlog 0b 0p requeues 339101
>
> 2.6.32 + Rusty's xmit_napi v2 patch + wakeup threshold=64
> ---------------------------------------------------------
> $ ./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.
> 122.1 (192.168.122.1) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service
Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send
Recv
> Size   Size    Size     Time     Throughput  local    remote   local
remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB
us/KB
>
>  87380  16384  16384    60.03      3356.71   95.41    89.56    2.329
4.372
> [sridhar@localhost ~]$ tc -s qdisc show dev eth0
> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1
> 1 1 1 1 1 1 1
>  Sent 25290227186 bytes 1555119 pkt (dropped 0, overlimits 0 requeues
78179)
>  rate 0bit 0pps backlog 0b 0p requeues 78179
>
> 2.6.32 + Rusty's xmit_napi v2 patch + wakeup threshold=128
> ----------------------------------------------------------
> $./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.
> 122.1 (192.168.122.1) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service
Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send
Recv
> Size   Size    Size     Time     Throughput  local    remote   local
remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB
us/KB
>
>  87380  16384  16384    60.03      3413.79   96.30    89.79    2.311
4.309
> [sridhar@localhost ~]$ tc -s qdisc show dev eth0
> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1
> 1 1 1 1 1 1 1
>  Sent 25719585472 bytes 1579448 pkt (dropped 0, overlimits 0 requeues
40299)
>  rate 0bit 0pps backlog 0b 0p requeues 40299
>
> 2.6.32 + Rusty's xmit_napi v2 patch + don't stop early & drop skb onfail
patch
>
-------------------------------------------------------------------------------

> $./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.
> 122.1 (192.168.122.1) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service
Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send
Recv
> Size   Size    Size     Time     Throughput  local    remote   local
remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB
us/KB
>
>  87380  16384  16384    60.03      7741.65   70.09    72.84    0.742
1.542
> [sridhar@localhost ~]$ tc -s qdisc show dev eth0
> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0
1__dev_xmit_skb
> 1 1 1 1 1 1 1
>  Sent 58149531018 bytes 897991 pkt (dropped 0, overlimits 0 requeues 1)
>  rate 0bit 0pps backlog 0b 0p requeues 1

Is the "drop skb" patch doing:
-                 return NETDEV_TX_BUSY;
+
+                 /* drop the skb under stress. */
+                 vi->dev->stats.tx_dropped++;
+                 kfree_skb(skb);
+                 return NETDEV_TX_OK;

Why is dropped count zero in the last test case?

sch_direct_xmit is called from two places, and if it finds
the txq stopped, it was called from __dev_xmit_skb (where
the previous sucessful xmit had stopped the queue). This
means the device is still stopping and restarting 1000's
of times a min, and each restart fills up the device h/w
queue with the backlogged skbs resulting in another stop.
Isn't the txqlen set to 1000 in ether_setup? Can you
increase the restart limit to a really high value, like
1/2 or 3/4th of the queue should be empty? Another thing
to test is to simultaneously set txqueuelen to a big value.

Requeue does not seem to be the reason for BW drop since
it barely improved when requeue's reduced from 340K to 40K.
So, as Jarek suggested, GSO could be reason. You could try
testing with 64K I/O size (with GSO enabled) to get
comparable results.

thanks,

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala Dec. 18, 2009, 7:13 p.m. UTC | #36
On Fri, 2009-12-18 at 19:16 +0530, Krishna Kumar2 wrote:

> >
> > 2.6.32 + Rusty's xmit_napi v2 patch + don't stop early & drop skb onfail
> patch
> >
> -------------------------------------------------------------------------------
> 
> > $./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60
> > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.
> > 122.1 (192.168.122.1) port 0 AF_INET
> > Recv   Send    Send                          Utilization       Service
> Demand
> > Socket Socket  Message  Elapsed              Send     Recv     Send
> Recv
> > Size   Size    Size     Time     Throughput  local    remote   local
> remote
> > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB
> us/KB
> >
> >  87380  16384  16384    60.03      7741.65   70.09    72.84    0.742
> 1.542
> > [sridhar@localhost ~]$ tc -s qdisc show dev eth0
> > qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0
> 1__dev_xmit_skb
> > 1 1 1 1 1 1 1
> >  Sent 58149531018 bytes 897991 pkt (dropped 0, overlimits 0 requeues 1)
> >  rate 0bit 0pps backlog 0b 0p requeues 1
> 
> Is the "drop skb" patch doing:
> -                 return NETDEV_TX_BUSY;
> +
> +                 /* drop the skb under stress. */
> +                 vi->dev->stats.tx_dropped++;
> +                 kfree_skb(skb);
> +                 return NETDEV_TX_OK;

Yes. This is the patch i used with plain 2.6.32. But with Rusty's patch,
i also commented out the if condition that stops the queue early in
start_xmit().
> 
> Why is dropped count zero in the last test case?

The dropped count reported by 'tc' are drops at the qdisc level and are
counted via qdisc_drop(). The drops at the driver level are counted as
net_device stats and are reported by ip -s link command. I see a few drops(5-6)
in a 60sec run with 2.6.31 kernel.
> 
> sch_direct_xmit is called from two places, and if it finds
> the txq stopped, it was called from __dev_xmit_skb (where
> the previous sucessful xmit had stopped the queue). This
> means the device is still stopping and restarting 1000's
> of times a min, and each restart fills up the device h/w
> queue with the backlogged skbs resulting in another stop.
> Isn't the txqlen set to 1000 in ether_setup? Can you
> increase the restart limit to a really high value, like
> 1/2 or 3/4th of the queue should be empty? Another thing
> to test is to simultaneously set txqueuelen to a big value.

txqueuelen limits the qdisc queue, not the device transmit queue.
The device tx queue length is set by qemu and defaults to 256 for
virtio-net. So a reasonable wakeup threshhold could be 64/128 and
it does reduce the number of requeues.

> 
> Requeue does not seem to be the reason for BW drop since
> it barely improved when requeue's reduced from 340K to 40K.
> So, as Jarek suggested, GSO could be reason. You could try
> testing with 64K I/O size (with GSO enabled) to get
> comparable results.

Yes. with 64K messages, i am getting comparable thruput, in fact
slightly better although cpu utilization is higher. So it looks like
the better thruput with 2.6.31 kernel with 16K message size is a 
side-effect of the drops.

I think Rusty's patch with 1/4 of tx ring as wakeup threshold is the first
step to address the queue full warnings in 2.6.32. With further tuning
it may be possible to eliminate the requeues.

2.6.32 + Rusty's xmit_napi_v2 patch

$ ./netperf -c -C -H 192.168.122.1 -t TCP_STREAM  -l60 -- -m 65536
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.1 (192.168.122.1) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  65536    60.03      8200.80   92.52    91.63    0.924   1.831  
$ tc -s qdisc show dev eth0 
qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 61613336514 bytes 1208233 pkt (dropped 0, overlimits 0 requeues 237750) 
 rate 0bit 0pps backlog 0b 0p requeues 237750 
$ ip -s link show dev eth0
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN qlen 1000
    link/ether 54:52:00:35:e3:74 brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast   
    59348763   899170   0       0       0       0      
    TX: bytes  packets  errors  dropped carrier collsns 
    1483793932 1208230  0       0       0       0      

Thanks
Sridhar

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -47,6 +47,9 @@  struct virtnet_info
 	struct napi_struct napi;
 	unsigned int status;
 
+	/* We free packets and decide whether to restart xmit here. */
+	struct napi_struct xmit_napi;
+
 	/* Number of input buffers, and max we've ever had. */
 	unsigned int num, max;
 
@@ -60,6 +63,9 @@  struct virtnet_info
 	struct sk_buff_head recv;
 	struct sk_buff_head send;
 
+	/* Capacity left in xmit queue. */
+	unsigned int capacity;
+
 	/* Work struct for refilling if we run low on memory. */
 	struct delayed_work refill;
 
@@ -111,11 +117,8 @@  static void skb_xmit_done(struct virtque
 {
 	struct virtnet_info *vi = svq->vdev->priv;
 
-	/* Suppress further interrupts. */
-	svq->vq_ops->disable_cb(svq);
-
 	/* We were probably waiting for more output buffers. */
-	netif_wake_queue(vi->dev);
+	napi_schedule(&vi->xmit_napi);
 }
 
 static void receive_skb(struct net_device *dev, struct sk_buff *skb,
@@ -446,7 +449,7 @@  static unsigned int free_old_xmit_skbs(s
 
 	while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) {
 		pr_debug("Sent skb %p\n", skb);
-		__skb_unlink(skb, &vi->send);
+		skb_unlink(skb, &vi->send);
 		vi->dev->stats.tx_bytes += skb->len;
 		vi->dev->stats.tx_packets++;
 		tot_sgs += skb_vnet_hdr(skb)->num_sg;
@@ -455,6 +458,23 @@  static unsigned int free_old_xmit_skbs(s
 	return tot_sgs;
 }
 
+static int virtnet_xmit_poll(struct napi_struct *xmit_napi, int budget)
+{
+	struct virtnet_info *vi =
+		container_of(xmit_napi, struct virtnet_info, xmit_napi);
+
+	if (netif_queue_stopped(vi->dev)) {
+		vi->capacity += free_old_xmit_skbs(vi);
+		if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
+			/* Suppress further xmit interrupts. */
+			vi->svq->vq_ops->disable_cb(vi->svq);
+			napi_complete(xmit_napi);
+			netif_wake_queue(vi->dev);
+		}
+	}
+	return 1;
+}
+
 static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 {
 	struct scatterlist sg[2+MAX_SKB_FRAGS];
@@ -509,34 +529,22 @@  static netdev_tx_t start_xmit(struct sk_
 	struct virtnet_info *vi = netdev_priv(dev);
 	int capacity;
 
-again:
-	/* Free up any pending old buffers before queueing new ones. */
-	free_old_xmit_skbs(vi);
-
 	/* Try to transmit */
+	skb_queue_head(&vi->send, skb);
 	capacity = xmit_skb(vi, skb);
 
 	/* This can happen with OOM and indirect buffers. */
 	if (unlikely(capacity < 0)) {
+		skb_unlink(skb, &vi->send);
 		netif_stop_queue(dev);
 		dev_warn(&dev->dev, "Unexpected full queue\n");
-		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
-			vi->svq->vq_ops->disable_cb(vi->svq);
-			netif_start_queue(dev);
-			goto again;
-		}
+		/* If we missed an interrupt, we let virtnet_xmit_poll deal. */
+		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
+			napi_schedule(&vi->xmit_napi);
 		return NETDEV_TX_BUSY;
 	}
 	vi->svq->vq_ops->kick(vi->svq);
-
-	/*
-	 * Put new one in send queue.  You'd expect we'd need this before
-	 * xmit_skb calls add_buf(), since the callback can be triggered
-	 * immediately after that.  But since the callback just triggers
-	 * another call back here, normal network xmit locking prevents the
-	 * race.
-	 */
-	__skb_queue_head(&vi->send, skb);
+	vi->capacity = capacity;
 
 	/* Don't wait up for transmitted skbs to be freed. */
 	skb_orphan(skb);
@@ -544,15 +552,16 @@  again:
 
 	/* Apparently nice girls don't return TX_BUSY; stop the queue
 	 * before it gets out of hand.  Naturally, this wastes entries. */
-	if (capacity < 2+MAX_SKB_FRAGS) {
-		netif_stop_queue(dev);
-		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
-			/* More just got used, free them then recheck. */
-			capacity += free_old_xmit_skbs(vi);
-			if (capacity >= 2+MAX_SKB_FRAGS) {
-				netif_start_queue(dev);
-				vi->svq->vq_ops->disable_cb(vi->svq);
-			}
+	if (unlikely(capacity < 2+MAX_SKB_FRAGS)) {
+		/* Free old skbs; might make more capacity. */
+		vi->capacity = capacity + free_old_xmit_skbs(vi);
+		if (unlikely(vi->capacity < 2+MAX_SKB_FRAGS)) {
+			/* Make sure virtnet_xmit_poll sees updated capacity */
+			wmb();
+			netif_stop_queue(dev);
+			/* Missed xmit irq? virtnet_xmit_poll will deal. */
+			if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
+				napi_schedule(&vi->xmit_napi);
 		}
 	}
 
@@ -590,6 +599,7 @@  static int virtnet_open(struct net_devic
 	struct virtnet_info *vi = netdev_priv(dev);
 
 	napi_enable(&vi->napi);
+	napi_enable(&vi->xmit_napi);
 
 	/* If all buffers were filled by other side before we napi_enabled, we
 	 * won't get another interrupt, so process any outstanding packets
@@ -652,6 +662,7 @@  static int virtnet_close(struct net_devi
 	struct virtnet_info *vi = netdev_priv(dev);
 
 	napi_disable(&vi->napi);
+	napi_disable(&vi->xmit_napi);
 
 	return 0;
 }
@@ -883,6 +894,7 @@  static int virtnet_probe(struct virtio_d
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
 	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
+	netif_napi_add(dev, &vi->xmit_napi, virtnet_xmit_poll, 64);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;