Message ID | 1300320775.3255.34.camel@localhost.localdomain |
---|---|
State | Not Applicable, archived |
Delegated to: | David Miller |
Headers | show |
On Wed, Mar 16, 2011 at 05:12:55PM -0700, Shirley Ma wrote: > Signed-off-by: Shirley Ma <xma@us.ibm.com> > --- > drivers/net/virtio_net.c | 39 ++++++++++++++++++++------------------- > 1 files changed, 20 insertions(+), 19 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 82dba5a..c603daa 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -23,6 +23,7 @@ > #include <linux/module.h> > #include <linux/virtio.h> > #include <linux/virtio_net.h> > +#include <linux/virtio_ring.h> > #include <linux/scatterlist.h> > #include <linux/if_vlan.h> > #include <linux/slab.h> > @@ -509,19 +510,18 @@ again: > return received; > } > > -static unsigned int free_old_xmit_skbs(struct virtnet_info *vi) > +static void free_old_xmit_skbs(struct virtnet_info *vi) > { > struct sk_buff *skb; > - unsigned int len, tot_sgs = 0; > + unsigned int len; > > while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) { > pr_debug("Sent skb %p\n", skb); > vi->dev->stats.tx_bytes += skb->len; > vi->dev->stats.tx_packets++; > - tot_sgs += skb_vnet_hdr(skb)->num_sg; > dev_kfree_skb_any(skb); > } > - return tot_sgs; > + return; > } > > static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) > @@ -574,11 +574,26 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) > static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > { > struct virtnet_info *vi = netdev_priv(dev); > + bool drop; > + bool indirect = virtio_has_feature(vi->vdev, > + VIRTIO_RING_F_INDIRECT_DESC); > int capacity; > > /* Free up any pending old buffers before queueing new ones. 
*/ > free_old_xmit_skbs(vi); > - > + capacity = virtqueue_get_capacity(vi->svq); > + > + /* Drop packet instead of stop queue for better performance */ > + drop = (capacity == 0 && indirect) || > + ((capacity < MAX_SKB_FRAGS + 2) && !indirect); > + if (unlikely(drop)) { > + dev->stats.tx_fifo_errors++; > + dev_warn(&dev->dev, "Unexpected TX queue failure: %d\n", > + capacity); > + dev->stats.tx_dropped++; > + kfree_skb(skb); > + return NETDEV_TX_OK; > + } > /* Try to transmit */ > capacity = xmit_skb(vi, skb); So, this just tries to make sure there's enough space for max packet in the ring, if not - drop and return OK. Why bother checking beforehand though? If that's what we want to do, we can just call add_buf and see if it fails? > @@ -605,20 +620,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > skb_orphan(skb); > nf_reset(skb); > > - /* Apparently nice girls don't return TX_BUSY; stop the queue > - * before it gets out of hand. Naturally, this wastes entries. */ > - if (capacity < 2+MAX_SKB_FRAGS) { > - netif_stop_queue(dev); > - if (unlikely(!virtqueue_enable_cb(vi->svq))) { > - /* More just got used, free them then recheck. */ > - capacity += free_old_xmit_skbs(vi); > - if (capacity >= 2+MAX_SKB_FRAGS) { > - netif_start_queue(dev); > - virtqueue_disable_cb(vi->svq); > - } > - } > - } > - > return NETDEV_TX_OK; > } > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 16 Mar 2011 17:12:55 -0700, Shirley Ma <mashirle@us.ibm.com> wrote: > Signed-off-by: Shirley Ma <xma@us.ibm.com> This is fascinating... and deeply weird. OK, what's the difference between calling xmit_skb and ignoring failure, and this patch which figures out it's going to fail before calling xmit_skb? ie. what if you *just* delete this: > @@ -605,20 +620,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > skb_orphan(skb); > nf_reset(skb); > > - /* Apparently nice girls don't return TX_BUSY; stop the queue > - * before it gets out of hand. Naturally, this wastes entries. */ > - if (capacity < 2+MAX_SKB_FRAGS) { > - netif_stop_queue(dev); > - if (unlikely(!virtqueue_enable_cb(vi->svq))) { > - /* More just got used, free them then recheck. */ > - capacity += free_old_xmit_skbs(vi); > - if (capacity >= 2+MAX_SKB_FRAGS) { > - netif_start_queue(dev); > - virtqueue_disable_cb(vi->svq); > - } > - } > - } > - > return NETDEV_TX_OK; > } Thanks! Rusty. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2011-03-17 at 15:40 +1030, Rusty Russell wrote: > This is fascinating... and deeply weird. > > OK, what's the difference between calling xmit_skb and ignoring > failure, > and this patch which figures out it's going to fail before calling > xmit_skb? > > ie. what if you *just* delete this: Somehow what was in my mind was that add_buf is more expensive than simply checking ring capacity. I will retest it with your and Michael's suggestions here. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2011-03-17 at 07:02 +0200, Michael S. Tsirkin wrote: > So, this just tries to make sure there's enough space for > max packet in the ring, if not - drop and return OK. > Why bother checking beforehand though? > If that's what we want to do, we can just call add_buf and see > if it fails? In add_buf, there is an additional kick, see below. I added check capacity to avoid this, thought it would be better performance. I will retest it w/i add_buf to see the performance difference. if (vq->num_free < out + in) { pr_debug("Can't add buf len %i - avail = %i\n", out + in, vq->num_free); /* FIXME: for historical reasons, we force a notify here if * there are outgoing parts to the buffer. Presumably the * host should service the ring ASAP. */ if (out) vq->notify(&vq->vq); END_USE(vq); return -ENOSPC; } Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2011-03-17 at 08:18 -0700, Shirley Ma wrote: > On Thu, 2011-03-17 at 07:02 +0200, Michael S. Tsirkin wrote: > > So, this just tries to make sure there's enough space for > > max packet in the ring, if not - drop and return OK. > > Why bother checking beforehand though? > > If that's what we want to do, we can just call add_buf and see > > if it fails? > > In add_buf, there is an additional kick, see below. I added check > capacity to avoid this, thought it would be better performance. I will > retest it w/i add_buf to see the performance difference. > > if (vq->num_free < out + in) { > pr_debug("Can't add buf len %i - avail = %i\n", > out + in, vq->num_free); > /* FIXME: for historical reasons, we force a notify > here > if > * there are outgoing parts to the buffer. Presumably > the > * host should service the ring ASAP. */ > if (out) > vq->notify(&vq->vq); > END_USE(vq); > return -ENOSPC; > } > More test results: UDP_STREAM test results (% is guest vcpu, guest has 2 vcpus): Send(netperf) ---- size 2.6.38-rc8 2.6.38-rc8+ 2.6.38-rc8 addbuf failure check capacity ----------------------------------------------------- 1K 1541.0/50.14% 2169.1/50.03% 3018.9/50% 2K 1649.7/33.74% 3362.6/50.18% 4518.8/50.47% 4K 2957.8/44.83% 5965.9/50.03% 9592.5/50% 8K 3788/39.01% 9852.8/50.25% 15483.8/50% 16K 4736.1/34.13% 14946.5/50.01% 21645.0/50% Recv(netserver w/i recv errors) ---- 1K 1541/8.36% 1554.4/9.67% 1675.1/11.26% 2K 1649.7/33.4% 1945.5/5.59% 2160.6/5.99% 4K 2556.3/5.07% 3044.8/7.12% 3118.9/7.86% 8K 3775/39.01% 4361.9/9.14% 4017.1/7.89% 16K 4466.4/8.56% 4435.2/10.95% 4446.8/9.92% TCP_STREAM test results (% is guest vcpu, guest has two vcpus): size 2.6.38-rc8 2.6.38-rc8+ 2.6.38-rc8 addbuf failure check capacity ------------------------------------------------------ 1K 2381.10/42.49% 5686.01/50.08% 5599.15/50.42% 2K 3205.08/49.18% 8675.93/48.59% 9241.86/48.42% 4K 5231.24/34.12% 9309.67/42.07% 9321.87/40.94% 8K 7952.74/35.85% 9001.95/38.26% 9265.45/37.63% 16K 
8260.68/35.07% 7911.52/34.35% 8310.29/34.28% 64K 9103.75/28.98% 9219.12/31.52% 9094.38/29.44% qemu-kvm host cpu utilization is also saved for both the addbuf failure and check capacity patches; vhost cpu utilization is similar in all cases. Looks like the additional guest notify in add_buf doesn't cost as much as I thought it would. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Mar 17, 2011 at 08:28:47PM -0700, Shirley Ma wrote: > On Thu, 2011-03-17 at 08:18 -0700, Shirley Ma wrote: > > On Thu, 2011-03-17 at 07:02 +0200, Michael S. Tsirkin wrote: > > > So, this just tries to make sure there's enough space for > > > max packet in the ring, if not - drop and return OK. > > > Why bother checking beforehand though? > > > If that's what we want to do, we can just call add_buf and see > > > if it fails? > > > > In add_buf, there is an additional kick, see below. I added check > > capacity to avoid this, thought it would be better performance. I will > > retest it w/i add_buf to see the performance difference. > > > > if (vq->num_free < out + in) { > > pr_debug("Can't add buf len %i - avail = %i\n", > > out + in, vq->num_free); > > /* FIXME: for historical reasons, we force a notify > > here > > if > > * there are outgoing parts to the buffer. Presumably > > the > > * host should service the ring ASAP. */ > > if (out) > > vq->notify(&vq->vq); > > END_USE(vq); > > return -ENOSPC; > > } > > Rusty, could you pls clarify what are the historical reasons here? Are they still valid? If yes we could dedicate a feature flag to disabling this, or guess that the host is new by looking at some other feature flag. > More test results: > > UDP_STREAM test results (% is guest vcpu, guest has 2 vpus): > > Send(netperf) > ---- > > size 2.6.38-rc8 2.6.38-rc8+ 2.6.38-rc8 > addbuf failure check capacity > ----------------------------------------------------- > 1K 1541.0/50.14% 2169.1/50.03% 3018.9/50% > 2K 1649.7/33.74% 3362.6/50.18% 4518.8/50.47% > 4K 2957.8/44.83% 5965.9/50.03% 9592.5/50% > 8K 3788/39.01% 9852.8/50.25% 15483.8/50% > 16K 4736.1/34.13% 14946.5/50.01% 21645.0/50% Is this the local or remote throughput? With UDP_STREAM you are mostly interested in remote throughput, local one can be pretty high while most packets get dropped. > Looks like the additional guest notify in add_buf doesn't cost that much > than I thought to be. 
> > Thanks > Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Shirley Ma <mashirle@us.ibm.com> wrote: > > + /* Drop packet instead of stop queue for better performance */ I would like to see some justification as to why this is the right way to go and not just papering over the real problem. Thanks,
On Fri, 2011-03-18 at 15:15 +0200, Michael S. Tsirkin wrote: > Is this the local or remote throughput? > With UDP_STREAM you are mostly interested in > remote throughput, local one can be pretty high > while most packets get dropped. This is local throughput. Remote is called recv(netserver) data. With default wmem/rmem, there are so many recv errors. I haven't tuned the results yet. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2011-03-18 at 08:33 -0500, Herbert Xu wrote: > Shirley Ma <mashirle@us.ibm.com> wrote: > > > > + /* Drop packet instead of stop queue for better performance > */ > > I would like to see some justification as to why this is the right > way to go and not just papering over the real problem. Fair. KVM guest virtio_net TX queue stop/restart is pretty expensive, which involves: 1. Guest enable callback: one memory barrier, interrupt flag set 2. Host signals guest: one memory barrier, and a TX interrupt from host to KVM guest through eventfd_signal. For most of the workloads so far we barely see TX overrun, except for small-message TCP_STREAM. For the small message size TCP_STREAM workload, no matter how big the TX queue size is, it always causes overrun. Even when I re-enable the TX queue only when it's empty, it still hits TX overrun again and again. Somehow the KVM guest and host are not in pace on processing small packets. I tried to pin each thread to a different CPU, but it didn't help, so it didn't seem to be scheduling related. From the performance results, we can see a dramatic performance gain with this patch. I would like to dig out the real reason why the host can't keep pace with the guest, but I haven't figured it out in a month; that's the reason I held this patch for a while. However, if anyone can give me any ideas on how to debug the real problem, I am willing to try it out. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2011-03-18 at 18:41 -0700, Shirley Ma wrote: > > > + /* Drop packet instead of stop queue for better > performance > > */ > > > > I would like to see some justification as to why this is the right > > way to go and not just papering over the real problem. > > Fair. KVM guest virtio_net TX queue stop/restart is pretty expensive, > which involves: > > 1. Guest enable callback: one memory barrier, interrupt flag set Missed this cost: for history reason, it also involves a guest exit from I/O write (PCI_QUEUE_NOTIFY). > 2. Host signals guest: one memory barrier, and a TX interrupt from > host > to KVM guest through evenfd_signal. > > > Most of the workload so far we barely see TX over run, except for > small > messages TCP_STREAM. > > For small message size TCP_STREAM workload, no matter how big the TX > queue size is, it always causes overrun. I even re-enable the TX queue > when it's empty, it still hits TX overrun again and again. > > Somehow KVM guest and host is not in pace on processing small packets. > I > tried to pin each thread to different CPU, it didn't help. So it > didn't > seem to be scheduling related. > > >From the performance results, we can see dramatically performance > gain > with this patch. > > I would like to dig out the real reason why host can't in pace with > guest, but haven't figured it out in month, that's the reason I held > this patch for a while. However if anyone can give me any ideas on how > to debug the real problem, I am willing to try it out. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 82dba5a..c603daa 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_net.h> +#include <linux/virtio_ring.h> #include <linux/scatterlist.h> #include <linux/if_vlan.h> #include <linux/slab.h> @@ -509,19 +510,18 @@ again: return received; } -static unsigned int free_old_xmit_skbs(struct virtnet_info *vi) +static void free_old_xmit_skbs(struct virtnet_info *vi) { struct sk_buff *skb; - unsigned int len, tot_sgs = 0; + unsigned int len; while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) { pr_debug("Sent skb %p\n", skb); vi->dev->stats.tx_bytes += skb->len; vi->dev->stats.tx_packets++; - tot_sgs += skb_vnet_hdr(skb)->num_sg; dev_kfree_skb_any(skb); } - return tot_sgs; + return; } static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) @@ -574,11 +574,26 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); + bool drop; + bool indirect = virtio_has_feature(vi->vdev, + VIRTIO_RING_F_INDIRECT_DESC); int capacity; /* Free up any pending old buffers before queueing new ones. 
*/ free_old_xmit_skbs(vi); - + capacity = virtqueue_get_capacity(vi->svq); + + /* Drop packet instead of stop queue for better performance */ + drop = (capacity == 0 && indirect) || + ((capacity < MAX_SKB_FRAGS + 2) && !indirect); + if (unlikely(drop)) { + dev->stats.tx_fifo_errors++; + dev_warn(&dev->dev, "Unexpected TX queue failure: %d\n", + capacity); + dev->stats.tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; + } /* Try to transmit */ capacity = xmit_skb(vi, skb); @@ -605,20 +620,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) skb_orphan(skb); nf_reset(skb); - /* Apparently nice girls don't return TX_BUSY; stop the queue - * before it gets out of hand. Naturally, this wastes entries. */ - if (capacity < 2+MAX_SKB_FRAGS) { - netif_stop_queue(dev); - if (unlikely(!virtqueue_enable_cb(vi->svq))) { - /* More just got used, free them then recheck. */ - capacity += free_old_xmit_skbs(vi); - if (capacity >= 2+MAX_SKB_FRAGS) { - netif_start_queue(dev); - virtqueue_disable_cb(vi->svq); - } - } - } - return NETDEV_TX_OK; }
Signed-off-by: Shirley Ma <xma@us.ibm.com> --- drivers/net/virtio_net.c | 39 ++++++++++++++++++++------------------- 1 files changed, 20 insertions(+), 19 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html