diff mbox

[2/2] virtio_net: remove send completion interrupts and avoid TX queue overrun through packet drop

Message ID 1300320775.3255.34.camel@localhost.localdomain
State Not Applicable, archived
Delegated to: David Miller
Headers show

Commit Message

Shirley Ma March 17, 2011, 12:12 a.m. UTC
Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 drivers/net/virtio_net.c |   39 ++++++++++++++++++++-------------------
 1 files changed, 20 insertions(+), 19 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Michael S. Tsirkin March 17, 2011, 5:02 a.m. UTC | #1
On Wed, Mar 16, 2011 at 05:12:55PM -0700, Shirley Ma wrote:
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>  drivers/net/virtio_net.c |   39 ++++++++++++++++++++-------------------
>  1 files changed, 20 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 82dba5a..c603daa 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -23,6 +23,7 @@
>  #include <linux/module.h>
>  #include <linux/virtio.h>
>  #include <linux/virtio_net.h>
> +#include <linux/virtio_ring.h>
>  #include <linux/scatterlist.h>
>  #include <linux/if_vlan.h>
>  #include <linux/slab.h>
> @@ -509,19 +510,18 @@ again:
>  	return received;
>  }
>  
> -static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
> +static void free_old_xmit_skbs(struct virtnet_info *vi)
>  {
>  	struct sk_buff *skb;
> -	unsigned int len, tot_sgs = 0;
> +	unsigned int len;
>  
>  	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
>  		pr_debug("Sent skb %p\n", skb);
>  		vi->dev->stats.tx_bytes += skb->len;
>  		vi->dev->stats.tx_packets++;
> -		tot_sgs += skb_vnet_hdr(skb)->num_sg;
>  		dev_kfree_skb_any(skb);
>  	}
> -	return tot_sgs;
> +	return;
>  }
>  
>  static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
> @@ -574,11 +574,26 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
>  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> +	bool drop;
> +	bool indirect = virtio_has_feature(vi->vdev,
> +					   VIRTIO_RING_F_INDIRECT_DESC);
>  	int capacity;
>  
>  	/* Free up any pending old buffers before queueing new ones. */
>  	free_old_xmit_skbs(vi);
> -
> +	capacity = virtqueue_get_capacity(vi->svq);
> +
> +	/* Drop packet instead of stop queue for better performance */
> +	drop = (capacity == 0 && indirect) ||
> +	       ((capacity < MAX_SKB_FRAGS + 2) && !indirect);
> +	if (unlikely(drop)) {
> +		dev->stats.tx_fifo_errors++;
> +		dev_warn(&dev->dev, "Unexpected TX queue failure: %d\n",
> +			 capacity);
> +		dev->stats.tx_dropped++;
> +		kfree_skb(skb);
> +		return NETDEV_TX_OK;
> +	}
>  	/* Try to transmit */
>  	capacity = xmit_skb(vi, skb);

So, this just tries to make sure there's enough space for
max packet in the ring, if not - drop and return OK.
Why bother checking beforehand though?
If that's what we want to do, we can just call add_buf and see
if it fails?


> @@ -605,20 +620,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	skb_orphan(skb);
>  	nf_reset(skb);
>  
> -	/* Apparently nice girls don't return TX_BUSY; stop the queue
> -	 * before it gets out of hand.  Naturally, this wastes entries. */
> -	if (capacity < 2+MAX_SKB_FRAGS) {
> -		netif_stop_queue(dev);
> -		if (unlikely(!virtqueue_enable_cb(vi->svq))) {
> -			/* More just got used, free them then recheck. */
> -			capacity += free_old_xmit_skbs(vi);
> -			if (capacity >= 2+MAX_SKB_FRAGS) {
> -				netif_start_queue(dev);
> -				virtqueue_disable_cb(vi->svq);
> -			}
> -		}
> -	}
> -
>  	return NETDEV_TX_OK;
>  }
>  
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell March 17, 2011, 5:10 a.m. UTC | #2
On Wed, 16 Mar 2011 17:12:55 -0700, Shirley Ma <mashirle@us.ibm.com> wrote:
> Signed-off-by: Shirley Ma <xma@us.ibm.com>

This is fascinating... and deeply weird.

OK, what's the difference between calling xmit_skb and ignoring failure,
and this patch which figures out it's going to fail before calling
xmit_skb?

ie. what if you *just* delete this:

> @@ -605,20 +620,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	skb_orphan(skb);
>  	nf_reset(skb);
>  
> -	/* Apparently nice girls don't return TX_BUSY; stop the queue
> -	 * before it gets out of hand.  Naturally, this wastes entries. */
> -	if (capacity < 2+MAX_SKB_FRAGS) {
> -		netif_stop_queue(dev);
> -		if (unlikely(!virtqueue_enable_cb(vi->svq))) {
> -			/* More just got used, free them then recheck. */
> -			capacity += free_old_xmit_skbs(vi);
> -			if (capacity >= 2+MAX_SKB_FRAGS) {
> -				netif_start_queue(dev);
> -				virtqueue_disable_cb(vi->svq);
> -			}
> -		}
> -	}
> -
>  	return NETDEV_TX_OK;
>  }

Thanks!
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma March 17, 2011, 3:10 p.m. UTC | #3
On Thu, 2011-03-17 at 15:40 +1030, Rusty Russell wrote:
> This is fascinating... and deeply weird.
> 
> OK, what's the difference between calling xmit_skb and ignoring
> failure,
> and this patch which figures out it's going to fail before calling
> xmit_skb?
> 
> ie. what if you *just* delete this:

Somehow what was in my mind was that add_buf is more expensive than simply
checking the ring capacity. I will retest it with your and Michael's
suggestions here.

Thanks
Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma March 17, 2011, 3:18 p.m. UTC | #4
On Thu, 2011-03-17 at 07:02 +0200, Michael S. Tsirkin wrote:
> So, this just tries to make sure there's enough space for
> max packet in the ring, if not - drop and return OK.
> Why bother checking beforehand though?
> If that's what we want to do, we can just call add_buf and see
> if it fails?

In add_buf, there is an additional kick, see below. I added check
capacity to avoid this, thought it would be better performance. I will
retest it w/i add_buf to see the performance difference.

        if (vq->num_free < out + in) {
                pr_debug("Can't add buf len %i - avail = %i\n",
                         out + in, vq->num_free);
                /* FIXME: for historical reasons, we force a notify here if
                 * there are outgoing parts to the buffer.  Presumably the
                 * host should service the ring ASAP. */
                if (out)
                        vq->notify(&vq->vq);
                END_USE(vq);
                return -ENOSPC;
        }

Thanks
Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma March 18, 2011, 3:28 a.m. UTC | #5
On Thu, 2011-03-17 at 08:18 -0700, Shirley Ma wrote:
> On Thu, 2011-03-17 at 07:02 +0200, Michael S. Tsirkin wrote:
> > So, this just tries to make sure there's enough space for
> > max packet in the ring, if not - drop and return OK.
> > Why bother checking beforehand though?
> > If that's what we want to do, we can just call add_buf and see
> > if it fails?
> 
> In add_buf, there is an additional kick, see below. I added check
> capacity to avoid this, thought it would be better performance. I will
> retest it w/i add_buf to see the performance difference.
> 
>         if (vq->num_free < out + in) {
>                 pr_debug("Can't add buf len %i - avail = %i\n",
>                          out + in, vq->num_free);
>                 /* FIXME: for historical reasons, we force a notify here if
>                  * there are outgoing parts to the buffer.  Presumably the
>                  * host should service the ring ASAP. */
>                 if (out)
>                         vq->notify(&vq->vq);
>                 END_USE(vq);
>                 return -ENOSPC;
>         }
> 

More test results:

UDP_STREAM test results (% is guest vcpu, guest has 2 vcpus):

Send(netperf)
----

size	2.6.38-rc8	2.6.38-rc8+	2.6.38-rc8
			addbuf failure	check capacity
-----------------------------------------------------
1K	1541.0/50.14%	2169.1/50.03%	3018.9/50%
2K	1649.7/33.74%	3362.6/50.18%	4518.8/50.47%	
4K	2957.8/44.83%	5965.9/50.03%	9592.5/50%
8K	3788/39.01%	9852.8/50.25%	15483.8/50%
16K	4736.1/34.13%	14946.5/50.01%	21645.0/50%

Recv(netserver w/i recv errors)
----
1K	1541/8.36%	1554.4/9.67%	1675.1/11.26%
2K	1649.7/33.4%	1945.5/5.59%	2160.6/5.99%
4K	2556.3/5.07%	3044.8/7.12%	3118.9/7.86%
8K	3775/39.01%	4361.9/9.14%	4017.1/7.89%
16K	4466.4/8.56%	4435.2/10.95%	4446.8/9.92%

TCP_STREAM test results (% is guest vcpu, guest has two vcpus):

size	2.6.38-rc8	2.6.38-rc8+	2.6.38-rc8
			addbuf failure	check capacity
------------------------------------------------------
1K	2381.10/42.49%	5686.01/50.08%	5599.15/50.42%
2K	3205.08/49.18%	8675.93/48.59%	9241.86/48.42%	
4K	5231.24/34.12%	9309.67/42.07%	9321.87/40.94%
8K	7952.74/35.85%	9001.95/38.26%	9265.45/37.63%
16K	8260.68/35.07%	7911.52/34.35%	8310.29/34.28%
64K	9103.75/28.98%	9219.12/31.52%	9094.38/29.44%

qemu-kvm host cpu utilization also saved for both addbuf failure and
check capacity patches, vhost cpu utilization are similar in all case.

Looks like the additional guest notify in add_buf doesn't cost that much
than I thought to be.

Thanks
Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin March 18, 2011, 1:15 p.m. UTC | #6
On Thu, Mar 17, 2011 at 08:28:47PM -0700, Shirley Ma wrote:
> On Thu, 2011-03-17 at 08:18 -0700, Shirley Ma wrote:
> > On Thu, 2011-03-17 at 07:02 +0200, Michael S. Tsirkin wrote:
> > > So, this just tries to make sure there's enough space for
> > > max packet in the ring, if not - drop and return OK.
> > > Why bother checking beforehand though?
> > > If that's what we want to do, we can just call add_buf and see
> > > if it fails?
> > 
> > In add_buf, there is an additional kick, see below. I added check
> > capacity to avoid this, thought it would be better performance. I will
> > retest it w/i add_buf to see the performance difference.
> > 
> >         if (vq->num_free < out + in) {
> >                 pr_debug("Can't add buf len %i - avail = %i\n",
> >                          out + in, vq->num_free);
> >                 /* FIXME: for historical reasons, we force a notify here if
> >                  * there are outgoing parts to the buffer.  Presumably the
> >                  * host should service the ring ASAP. */
> >                 if (out)
> >                         vq->notify(&vq->vq);
> >                 END_USE(vq);
> >                 return -ENOSPC;
> >         }
> > 

Rusty, could you pls clarify what are the historical reasons here?
Are they still valid?
If yes we could dedicate a feature flag to disabling this,
or guess that the host is new by looking at some
other feature flag.

> More test results:
> 
> UDP_STREAM test results (% is guest vcpu, guest has 2 vcpus):
> 
> Send(netperf)
> ----
> 
> size	2.6.38-rc8	2.6.38-rc8+	2.6.38-rc8
> 			addbuf failure	check capacity
> -----------------------------------------------------
> 1K	1541.0/50.14%	2169.1/50.03%	3018.9/50%
> 2K	1649.7/33.74%	3362.6/50.18%	4518.8/50.47%	
> 4K	2957.8/44.83%	5965.9/50.03%	9592.5/50%
> 8K	3788/39.01%	9852.8/50.25%	15483.8/50%
> 16K	4736.1/34.13%	14946.5/50.01%	21645.0/50%

Is this the local or remote throughput?
With UDP_STREAM you are mostly interested in
remote throughput, local one can be pretty high
while most packets get dropped.

> Looks like the additional guest notify in add_buf doesn't cost that much
> than I thought to be.
> 
> Thanks
> Shirley
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu March 18, 2011, 1:33 p.m. UTC | #7
Shirley Ma <mashirle@us.ibm.com> wrote:
>
> +       /* Drop packet instead of stop queue for better performance */

I would like to see some justification as to why this is the right
way to go and not just papering over the real problem.

Thanks,
Shirley Ma March 18, 2011, 4:54 p.m. UTC | #8
On Fri, 2011-03-18 at 15:15 +0200, Michael S. Tsirkin wrote:
> Is this the local or remote throughput?
> With UDP_STREAM you are mostly interested in
> remote throughput, local one can be pretty high
> while most packets get dropped. 

This is local throughput. Remote is called recv(netserver) data. With
default wmem/rmem, there are so many recv errors. I haven't tuned the
results yet.

Thanks
Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma March 19, 2011, 1:41 a.m. UTC | #9
On Fri, 2011-03-18 at 08:33 -0500, Herbert Xu wrote:
> Shirley Ma <mashirle@us.ibm.com> wrote:
> >
> > +       /* Drop packet instead of stop queue for better performance
> */
> 
> I would like to see some justification as to why this is the right
> way to go and not just papering over the real problem. 

Fair. KVM guest virtio_net TX queue stop/restart is pretty expensive,
which involves:

1. Guest enable callback: one memory barrier, interrupt flag set

2. Host signals guest: one memory barrier, and a TX interrupt from host
to KVM guest through eventfd_signal.
to KVM guest through evenfd_signal.


Most of the workload so far we barely see TX over run, except for small
messages TCP_STREAM. 

For small message size TCP_STREAM workload, no matter how big the TX
queue size is, it always causes overrun. I even re-enable the TX queue
when it's empty, it still hits TX overrun again and again.

Somehow KVM guest and host is not in pace on processing small packets. I
tried to pin each thread to different CPU, it didn't help. So it didn't
seem to be scheduling related.

From the performance results, we can see a dramatic performance gain
with this patch.

I would like to dig out the real reason why the host can't keep pace with
the guest, but haven't figured it out in a month; that's the reason I held
this patch for a while. However if anyone can give me any ideas on how
to debug the real problem, I am willing to try it out. 

Thanks
Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma March 21, 2011, 6:03 p.m. UTC | #10
On Fri, 2011-03-18 at 18:41 -0700, Shirley Ma wrote:
> > > +       /* Drop packet instead of stop queue for better
> performance
> > */
> > 
> > I would like to see some justification as to why this is the right
> > way to go and not just papering over the real problem. 
> 
> Fair. KVM guest virtio_net TX queue stop/restart is pretty expensive,
> which involves:
> 
> 1. Guest enable callback: one memory barrier, interrupt flag set

Missed this cost: for historical reasons, it also involves a guest exit from
I/O write (PCI_QUEUE_NOTIFY).

> 2. Host signals guest: one memory barrier, and a TX interrupt from
> host
> to KVM guest through evenfd_signal.
> 
> 
> Most of the workload so far we barely see TX over run, except for
> small
> messages TCP_STREAM. 
> 
> For small message size TCP_STREAM workload, no matter how big the TX
> queue size is, it always causes overrun. I even re-enable the TX queue
> when it's empty, it still hits TX overrun again and again.
> 
> Somehow KVM guest and host is not in pace on processing small packets.
> I
> tried to pin each thread to different CPU, it didn't help. So it
> didn't
> seem to be scheduling related.
> 
> From the performance results, we can see a dramatic performance gain
> with this patch.
> 
> I would like to dig out the real reason why the host can't keep pace with
> the guest, but haven't figured it out in a month; that's the reason I held
> this patch for a while. However if anyone can give me any ideas on how
> to debug the real problem, I am willing to try it out. 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 82dba5a..c603daa 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -23,6 +23,7 @@ 
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_net.h>
+#include <linux/virtio_ring.h>
 #include <linux/scatterlist.h>
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
@@ -509,19 +510,18 @@  again:
 	return received;
 }
 
-static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
+static void free_old_xmit_skbs(struct virtnet_info *vi)
 {
 	struct sk_buff *skb;
-	unsigned int len, tot_sgs = 0;
+	unsigned int len;
 
 	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
 		pr_debug("Sent skb %p\n", skb);
 		vi->dev->stats.tx_bytes += skb->len;
 		vi->dev->stats.tx_packets++;
-		tot_sgs += skb_vnet_hdr(skb)->num_sg;
 		dev_kfree_skb_any(skb);
 	}
-	return tot_sgs;
+	return;
 }
 
 static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
@@ -574,11 +574,26 @@  static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	bool drop;
+	bool indirect = virtio_has_feature(vi->vdev,
+					   VIRTIO_RING_F_INDIRECT_DESC);
 	int capacity;
 
 	/* Free up any pending old buffers before queueing new ones. */
 	free_old_xmit_skbs(vi);
-
+	capacity = virtqueue_get_capacity(vi->svq);
+
+	/* Drop packet instead of stop queue for better performance */
+	drop = (capacity == 0 && indirect) ||
+	       ((capacity < MAX_SKB_FRAGS + 2) && !indirect);
+	if (unlikely(drop)) {
+		dev->stats.tx_fifo_errors++;
+		dev_warn(&dev->dev, "Unexpected TX queue failure: %d\n",
+			 capacity);
+		dev->stats.tx_dropped++;
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
 	/* Try to transmit */
 	capacity = xmit_skb(vi, skb);
 
@@ -605,20 +620,6 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_orphan(skb);
 	nf_reset(skb);
 
-	/* Apparently nice girls don't return TX_BUSY; stop the queue
-	 * before it gets out of hand.  Naturally, this wastes entries. */
-	if (capacity < 2+MAX_SKB_FRAGS) {
-		netif_stop_queue(dev);
-		if (unlikely(!virtqueue_enable_cb(vi->svq))) {
-			/* More just got used, free them then recheck. */
-			capacity += free_old_xmit_skbs(vi);
-			if (capacity >= 2+MAX_SKB_FRAGS) {
-				netif_start_queue(dev);
-				virtqueue_disable_cb(vi->svq);
-			}
-		}
-	}
-
 	return NETDEV_TX_OK;
 }