Message ID | 1270229480.13897.8.camel@w-sridhar.beaverton.ibm.com |
---|---|
State | Not Applicable, archived |
Delegated to: | David Miller |
Headers | show |
On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: > Make vhost scalable by creating a separate vhost thread per vhost > device. This provides better scaling across multiple guests and with > multiple interfaces in a guest. Thanks for looking into this. An alternative approach is to simply replace create_singlethread_workqueue with create_workqueue which would get us a thread per host CPU. It seems that in theory this should be the optimal approach wrt CPU locality, however, in practice a single thread seems to get better numbers. I have a TODO to investigate this. Could you try looking into this? > > I am seeing better aggregated througput/latency when running netperf > across multiple guests or multiple interfaces in a guest in parallel > with this patch. Any numbers? What happens to CPU utilization? > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index a6a88df..29aa80f 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -339,8 +339,10 @@ static int vhost_net_open(struct inode *inode, struct file *f) > return r; > } > > - vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); > - vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); > + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, > + &n->dev); > + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, > + &n->dev); > n->tx_poll_state = VHOST_NET_POLL_DISABLED; > > f->private_data = n; > @@ -643,25 +645,14 @@ static struct miscdevice vhost_net_misc = { > > int vhost_net_init(void) > { > - int r = vhost_init(); > - if (r) > - goto err_init; > - r = misc_register(&vhost_net_misc); > - if (r) > - goto err_reg; > - return 0; > -err_reg: > - vhost_cleanup(); > -err_init: > - return r; > - > + return misc_register(&vhost_net_misc); > } > + > module_init(vhost_net_init); > > void vhost_net_exit(void) > { > misc_deregister(&vhost_net_misc); > - vhost_cleanup(); > } > 
module_exit(vhost_net_exit); > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 7bd7a1e..243f4d3 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -36,8 +36,6 @@ enum { > VHOST_MEMORY_F_LOG = 0x1, > }; > > -static struct workqueue_struct *vhost_workqueue; > - > static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, > poll_table *pt) > { > @@ -56,18 +54,19 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, > if (!((unsigned long)key & poll->mask)) > return 0; > > - queue_work(vhost_workqueue, &poll->work); > + queue_work(poll->dev->wq, &poll->work); > return 0; > } > > /* Init poll structure */ > void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > - unsigned long mask) > + unsigned long mask, struct vhost_dev *dev) > { > INIT_WORK(&poll->work, func); > init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); > init_poll_funcptr(&poll->table, vhost_poll_func); > poll->mask = mask; > + poll->dev = dev; > } > > /* Start polling a file. We add ourselves to file's wait queue. 
The caller must > @@ -96,7 +95,7 @@ void vhost_poll_flush(struct vhost_poll *poll) > > void vhost_poll_queue(struct vhost_poll *poll) > { > - queue_work(vhost_workqueue, &poll->work); > + queue_work(poll->dev->wq, &poll->work); > } > > static void vhost_vq_reset(struct vhost_dev *dev, > @@ -128,6 +127,11 @@ long vhost_dev_init(struct vhost_dev *dev, > struct vhost_virtqueue *vqs, int nvqs) > { > int i; > + > + dev->wq = create_singlethread_workqueue("vhost"); > + if (!dev->wq) > + return -ENOMEM; > + > dev->vqs = vqs; > dev->nvqs = nvqs; > mutex_init(&dev->mutex); > @@ -143,7 +147,7 @@ long vhost_dev_init(struct vhost_dev *dev, > if (dev->vqs[i].handle_kick) > vhost_poll_init(&dev->vqs[i].poll, > dev->vqs[i].handle_kick, > - POLLIN); > + POLLIN, dev); > } > return 0; > } > @@ -216,6 +220,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev) > if (dev->mm) > mmput(dev->mm); > dev->mm = NULL; > + > + destroy_workqueue(dev->wq); > } > > static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) > @@ -1095,16 +1101,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq) > vq_err(vq, "Failed to enable notification at %p: %d\n", > &vq->used->flags, r); > } > - > -int vhost_init(void) > -{ > - vhost_workqueue = create_singlethread_workqueue("vhost"); > - if (!vhost_workqueue) > - return -ENOMEM; > - return 0; > -} > - > -void vhost_cleanup(void) > -{ > - destroy_workqueue(vhost_workqueue); > -} > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index 44591ba..60fefd0 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -29,10 +29,11 @@ struct vhost_poll { > /* struct which will handle all actual work. 
*/ > struct work_struct work; > unsigned long mask; > + struct vhost_dev *dev; > }; > > void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > - unsigned long mask); > + unsigned long mask, struct vhost_dev *dev); > void vhost_poll_start(struct vhost_poll *poll, struct file *file); > void vhost_poll_stop(struct vhost_poll *poll); > void vhost_poll_flush(struct vhost_poll *poll); > @@ -110,6 +111,7 @@ struct vhost_dev { > int nvqs; > struct file *log_file; > struct eventfd_ctx *log_ctx; > + struct workqueue_struct *wq; > }; > > long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); > @@ -136,9 +138,6 @@ bool vhost_enable_notify(struct vhost_virtqueue *); > int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, > unsigned int log_num, u64 len); > > -int vhost_init(void); > -void vhost_cleanup(void); > - > #define vq_err(vq, fmt, ...) do { \ > pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ > if ((vq)->error_ctx) \ > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: > On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: > > Make vhost scalable by creating a separate vhost thread per vhost > > device. This provides better scaling across multiple guests and with > > multiple interfaces in a guest. > > Thanks for looking into this. An alternative approach is > to simply replace create_singlethread_workqueue with > create_workqueue which would get us a thread per host CPU. > > It seems that in theory this should be the optimal approach > wrt CPU locality, however, in practice a single thread > seems to get better numbers. I have a TODO to investigate this. > Could you try looking into this? Yes. I tried using create_workqueue(), but the results were not good at least when the number of guest interfaces is less than the number of CPUs. I didn't try more than 8 guests. Creating a separate thread per guest interface seems to be more scalable based on the testing I have done so far. I will try some more tests and get some numbers to compare the following 3 options. - single vhost thread - vhost thread per cpu - vhost thread per guest virtio interface Thanks Sridhar > > > > > I am seeing better aggregated througput/latency when running netperf > > across multiple guests or multiple interfaces in a guest in parallel > > with this patch. > > Any numbers? What happens to CPU utilization? 
> > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com> > > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > > index a6a88df..29aa80f 100644 > > --- a/drivers/vhost/net.c > > +++ b/drivers/vhost/net.c > > @@ -339,8 +339,10 @@ static int vhost_net_open(struct inode *inode, struct file *f) > > return r; > > } > > > > - vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); > > - vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); > > + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, > > + &n->dev); > > + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, > > + &n->dev); > > n->tx_poll_state = VHOST_NET_POLL_DISABLED; > > > > f->private_data = n; > > @@ -643,25 +645,14 @@ static struct miscdevice vhost_net_misc = { > > > > int vhost_net_init(void) > > { > > - int r = vhost_init(); > > - if (r) > > - goto err_init; > > - r = misc_register(&vhost_net_misc); > > - if (r) > > - goto err_reg; > > - return 0; > > -err_reg: > > - vhost_cleanup(); > > -err_init: > > - return r; > > - > > + return misc_register(&vhost_net_misc); > > } > > + > > module_init(vhost_net_init); > > > > void vhost_net_exit(void) > > { > > misc_deregister(&vhost_net_misc); > > - vhost_cleanup(); > > } > > module_exit(vhost_net_exit); > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > index 7bd7a1e..243f4d3 100644 > > --- a/drivers/vhost/vhost.c > > +++ b/drivers/vhost/vhost.c > > @@ -36,8 +36,6 @@ enum { > > VHOST_MEMORY_F_LOG = 0x1, > > }; > > > > -static struct workqueue_struct *vhost_workqueue; > > - > > static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, > > poll_table *pt) > > { > > @@ -56,18 +54,19 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, > > if (!((unsigned long)key & poll->mask)) > > return 0; > > > > - queue_work(vhost_workqueue, &poll->work); > > + queue_work(poll->dev->wq, &poll->work); > > return 0; > > } > > > > /* Init poll structure */ > > 
void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > > - unsigned long mask) > > + unsigned long mask, struct vhost_dev *dev) > > { > > INIT_WORK(&poll->work, func); > > init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); > > init_poll_funcptr(&poll->table, vhost_poll_func); > > poll->mask = mask; > > + poll->dev = dev; > > } > > > > /* Start polling a file. We add ourselves to file's wait queue. The caller must > > @@ -96,7 +95,7 @@ void vhost_poll_flush(struct vhost_poll *poll) > > > > void vhost_poll_queue(struct vhost_poll *poll) > > { > > - queue_work(vhost_workqueue, &poll->work); > > + queue_work(poll->dev->wq, &poll->work); > > } > > > > static void vhost_vq_reset(struct vhost_dev *dev, > > @@ -128,6 +127,11 @@ long vhost_dev_init(struct vhost_dev *dev, > > struct vhost_virtqueue *vqs, int nvqs) > > { > > int i; > > + > > + dev->wq = create_singlethread_workqueue("vhost"); > > + if (!dev->wq) > > + return -ENOMEM; > > + > > dev->vqs = vqs; > > dev->nvqs = nvqs; > > mutex_init(&dev->mutex); > > @@ -143,7 +147,7 @@ long vhost_dev_init(struct vhost_dev *dev, > > if (dev->vqs[i].handle_kick) > > vhost_poll_init(&dev->vqs[i].poll, > > dev->vqs[i].handle_kick, > > - POLLIN); > > + POLLIN, dev); > > } > > return 0; > > } > > @@ -216,6 +220,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev) > > if (dev->mm) > > mmput(dev->mm); > > dev->mm = NULL; > > + > > + destroy_workqueue(dev->wq); > > } > > > > static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) > > @@ -1095,16 +1101,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq) > > vq_err(vq, "Failed to enable notification at %p: %d\n", > > &vq->used->flags, r); > > } > > - > > -int vhost_init(void) > > -{ > > - vhost_workqueue = create_singlethread_workqueue("vhost"); > > - if (!vhost_workqueue) > > - return -ENOMEM; > > - return 0; > > -} > > - > > -void vhost_cleanup(void) > > -{ > > - destroy_workqueue(vhost_workqueue); > > -} > > diff --git a/drivers/vhost/vhost.h 
b/drivers/vhost/vhost.h > > index 44591ba..60fefd0 100644 > > --- a/drivers/vhost/vhost.h > > +++ b/drivers/vhost/vhost.h > > @@ -29,10 +29,11 @@ struct vhost_poll { > > /* struct which will handle all actual work. */ > > struct work_struct work; > > unsigned long mask; > > + struct vhost_dev *dev; > > }; > > > > void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > > - unsigned long mask); > > + unsigned long mask, struct vhost_dev *dev); > > void vhost_poll_start(struct vhost_poll *poll, struct file *file); > > void vhost_poll_stop(struct vhost_poll *poll); > > void vhost_poll_flush(struct vhost_poll *poll); > > @@ -110,6 +111,7 @@ struct vhost_dev { > > int nvqs; > > struct file *log_file; > > struct eventfd_ctx *log_ctx; > > + struct workqueue_struct *wq; > > }; > > > > long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); > > @@ -136,9 +138,6 @@ bool vhost_enable_notify(struct vhost_virtqueue *); > > int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, > > unsigned int log_num, u64 len); > > > > -int vhost_init(void); > > -void vhost_cleanup(void); > > - > > #define vq_err(vq, fmt, ...) do { \ > > pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ > > if ((vq)->error_ctx) \ > > > > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 04/05/2010 08:35 PM, Sridhar Samudrala wrote: > On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: > >> On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: >> >>> Make vhost scalable by creating a separate vhost thread per vhost >>> device. This provides better scaling across multiple guests and with >>> multiple interfaces in a guest. >>> >> Thanks for looking into this. An alternative approach is >> to simply replace create_singlethread_workqueue with >> create_workqueue which would get us a thread per host CPU. >> >> It seems that in theory this should be the optimal approach >> wrt CPU locality, however, in practice a single thread >> seems to get better numbers. I have a TODO to investigate this. >> Could you try looking into this? >> > Yes. I tried using create_workqueue(), but the results were not good > atleast when the number of guest interfaces is less than the number > of CPUs. I didn't try more than 8 guests. > Creating a separate thread per guest interface seems to be more > scalable based on the testing i have done so far. > Thread per guest is also easier to account. I'm worried about guests impacting other guests' performance outside scheduler control by extensive use of vhost.
On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote: > On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: > > On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: > > > Make vhost scalable by creating a separate vhost thread per vhost > > > device. This provides better scaling across multiple guests and with > > > multiple interfaces in a guest. > > > > Thanks for looking into this. An alternative approach is > > to simply replace create_singlethread_workqueue with > > create_workqueue which would get us a thread per host CPU. > > > > It seems that in theory this should be the optimal approach > > wrt CPU locality, however, in practice a single thread > > seems to get better numbers. I have a TODO to investigate this. > > Could you try looking into this? > > Yes. I tried using create_workqueue(), but the results were not good > atleast when the number of guest interfaces is less than the number > of CPUs. I didn't try more than 8 guests. > Creating a separate thread per guest interface seems to be more > scalable based on the testing i have done so far. > > I will try some more tests and get some numbers to compare the following > 3 options. > - single vhost thread > - vhost thread per cpu > - vhost thread per guest virtio interface Here are the results with netperf TCP_STREAM 64K guest to host on a 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host CPU utilization. 
Current default single vhost thread ----------------------------------- 1 guest: 12500 37% 2 guests: 12800 46% 3 guests: 12600 47% 4 guests: 12200 47% 5 guests: 12000 47% 6 guests: 11700 47% 7 guests: 11340 47% 8 guests: 11200 48% vhost thread per cpu -------------------- 1 guest: 4900 25% 2 guests: 10800 49% 3 guests: 17100 67% 4 guests: 20400 84% 5 guests: 21000 90% 6 guests: 22500 92% 7 guests: 23500 96% 8 guests: 24500 99% vhost thread per guest interface -------------------------------- 1 guest: 12500 37% 2 guests: 21000 72% 3 guests: 21600 79% 4 guests: 21600 85% 5 guests: 22500 89% 6 guests: 22800 94% 7 guests: 24500 98% 8 guests: 26400 99% Thanks Sridhar -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Here are the results with netperf TCP_STREAM 64K guest to host on a > 8-cpu Nehalem system. I presume you mean 8 core Nehalem-EP, or did you mean 8 processor Nehalem-EX? Don't get me wrong, I *like* the netperf 64K TCP_STREAM test, I like it a lot!-) but I find it incomplete and also like to run things like single-instance TCP_RR and multiple-instance, multiple "transaction" (./configure --enable-burst) TCP_RR tests, particularly when concerned with "scaling" issues. happy benchmarking, rick jones > It shows cumulative bandwidth in Mbps and host > CPU utilization. > > Current default single vhost thread > ----------------------------------- > 1 guest: 12500 37% > 2 guests: 12800 46% > 3 guests: 12600 47% > 4 guests: 12200 47% > 5 guests: 12000 47% > 6 guests: 11700 47% > 7 guests: 11340 47% > 8 guests: 11200 48% > > vhost thread per cpu > -------------------- > 1 guest: 4900 25% > 2 guests: 10800 49% > 3 guests: 17100 67% > 4 guests: 20400 84% > 5 guests: 21000 90% > 6 guests: 22500 92% > 7 guests: 23500 96% > 8 guests: 24500 99% > > vhost thread per guest interface > -------------------------------- > 1 guest: 12500 37% > 2 guests: 21000 72% > 3 guests: 21600 79% > 4 guests: 21600 85% > 5 guests: 22500 89% > 6 guests: 22800 94% > 7 guests: 24500 98% > 8 guests: 26400 99% > > Thanks > Sridhar > > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2010-04-08 at 17:14 -0700, Rick Jones wrote: > > Here are the results with netperf TCP_STREAM 64K guest to host on a > > 8-cpu Nehalem system. > > I presume you mean 8 core Nehalem-EP, or did you mean 8 processor Nehalem-EX? Yes. It is a 2 socket quad-core Nehalem. so i guess it is a 8 core Nehalem-EP. > > Don't get me wrong, I *like* the netperf 64K TCP_STREAM test, I lik it a lot!-) > but I find it incomplete and also like to run things like single-instance TCP_RR > and multiple-instance, multiple "transaction" (./configure --enable-burst) > TCP_RR tests, particularly when concerned with "scaling" issues. Can we run multiple instance and multiple transaction tests with a single netperf commandline? Is there any easy way to get consolidated throughput when a netserver on the host is servicing netperf clients from multiple guests? Thanks Sridhar > > happy benchmarking, > > rick jones > > > It shows cumulative bandwidth in Mbps and host > > CPU utilization. > > > > Current default single vhost thread > > ----------------------------------- > > 1 guest: 12500 37% > > 2 guests: 12800 46% > > 3 guests: 12600 47% > > 4 guests: 12200 47% > > 5 guests: 12000 47% > > 6 guests: 11700 47% > > 7 guests: 11340 47% > > 8 guests: 11200 48% > > > > vhost thread per cpu > > -------------------- > > 1 guest: 4900 25% > > 2 guests: 10800 49% > > 3 guests: 17100 67% > > 4 guests: 20400 84% > > 5 guests: 21000 90% > > 6 guests: 22500 92% > > 7 guests: 23500 96% > > 8 guests: 24500 99% > > > > vhost thread per guest interface > > -------------------------------- > > 1 guest: 12500 37% > > 2 guests: 21000 72% > > 3 guests: 21600 79% > > 4 guests: 21600 85% > > 5 guests: 22500 89% > > 6 guests: 22800 94% > > 7 guests: 24500 98% > > 8 guests: 26400 99% > > > > Thanks > > Sridhar > > > > > > -- > > To unsubscribe from this list: send the line "unsubscribe netdev" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at 
http://vger.kernel.org/majordomo-info.html > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala wrote: > On Thu, 2010-04-08 at 17:14 -0700, Rick Jones wrote: > >>>Here are the results with netperf TCP_STREAM 64K guest to host on a >>>8-cpu Nehalem system. >> >>I presume you mean 8 core Nehalem-EP, or did you mean 8 processor Nehalem-EX? > > > Yes. It is a 2 socket quad-core Nehalem. so i guess it is a 8 core > Nehalem-EP. > >>Don't get me wrong, I *like* the netperf 64K TCP_STREAM test, I lik it a lot!-) >>but I find it incomplete and also like to run things like single-instance TCP_RR >>and multiple-instance, multiple "transaction" (./configure --enable-burst) >>TCP_RR tests, particularly when concerned with "scaling" issues. > > > Can we run multiple instance and multiple transaction tests with a > single netperf commandline? Do you count a shell for loop as a single command line? > Is there any easy way to get consolidated throughput when a netserver on > the host is servicing netperf clients from multiple guests? I tend to use a script such as: ftp://ftp.netperf.org/netperf/misc/runemomniagg2.sh which presumes that netperf/netserver have been built with: ./configure --enable-omni --enable-burst ... and uses the CSV output format of the omni tests. When I want sums I then turn to a spreadsheet, or I suppose I could turn to awk etc. The TCP_RR test can be flipped around request size for response size etc, so when I have a single system under test, I initiate the netperf commands on it, targeting netservers on the clients. If I want inbound bulk throughput I use the TCP_MAERTS test rather than the TCP_STREAM test. happy benchmarking, rick jones -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Apr 08, 2010 at 05:05:42PM -0700, Sridhar Samudrala wrote: > On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote: > > On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: > > > On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: > > > > Make vhost scalable by creating a separate vhost thread per vhost > > > > device. This provides better scaling across multiple guests and with > > > > multiple interfaces in a guest. > > > > > > Thanks for looking into this. An alternative approach is > > > to simply replace create_singlethread_workqueue with > > > create_workqueue which would get us a thread per host CPU. > > > > > > It seems that in theory this should be the optimal approach > > > wrt CPU locality, however, in practice a single thread > > > seems to get better numbers. I have a TODO to investigate this. > > > Could you try looking into this? > > > > Yes. I tried using create_workqueue(), but the results were not good > > atleast when the number of guest interfaces is less than the number > > of CPUs. I didn't try more than 8 guests. > > Creating a separate thread per guest interface seems to be more > > scalable based on the testing i have done so far. > > > > I will try some more tests and get some numbers to compare the following > > 3 options. > > - single vhost thread > > - vhost thread per cpu > > - vhost thread per guest virtio interface > > Here are the results with netperf TCP_STREAM 64K guest to host on a > 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host > CPU utilization. 
> > Current default single vhost thread > ----------------------------------- > 1 guest: 12500 37% > 2 guests: 12800 46% > 3 guests: 12600 47% > 4 guests: 12200 47% > 5 guests: 12000 47% > 6 guests: 11700 47% > 7 guests: 11340 47% > 8 guests: 11200 48% > > vhost thread per cpu > -------------------- > 1 guest: 4900 25% > 2 guests: 10800 49% > 3 guests: 17100 67% > 4 guests: 20400 84% > 5 guests: 21000 90% > 6 guests: 22500 92% > 7 guests: 23500 96% > 8 guests: 24500 99% > > vhost thread per guest interface > -------------------------------- > 1 guest: 12500 37% > 2 guests: 21000 72% > 3 guests: 21600 79% > 4 guests: 21600 85% > 5 guests: 22500 89% > 6 guests: 22800 94% > 7 guests: 24500 98% > 8 guests: 26400 99% > > Thanks > Sridhar Consider using Ingo's perf tool to get error bars, but looks good overall. One thing I note though is that we seem to be able to consume up to 99% CPU now. So I think with this approach we can no longer claim that we are just like some other parts of networking stack, doing work outside any cgroup, and we should make the vhost thread inherit the cgroup and cpu mask from the process calling SET_OWNER.
On Thu, Apr 08, 2010 at 05:05:42PM -0700, Sridhar Samudrala wrote: > On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote: > > On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: > > > On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: > > > > Make vhost scalable by creating a separate vhost thread per vhost > > > > device. This provides better scaling across multiple guests and with > > > > multiple interfaces in a guest. > > > > > > Thanks for looking into this. An alternative approach is > > > to simply replace create_singlethread_workqueue with > > > create_workqueue which would get us a thread per host CPU. > > > > > > It seems that in theory this should be the optimal approach > > > wrt CPU locality, however, in practice a single thread > > > seems to get better numbers. I have a TODO to investigate this. > > > Could you try looking into this? > > > > Yes. I tried using create_workqueue(), but the results were not good > > atleast when the number of guest interfaces is less than the number > > of CPUs. I didn't try more than 8 guests. > > Creating a separate thread per guest interface seems to be more > > scalable based on the testing i have done so far. > > > > I will try some more tests and get some numbers to compare the following > > 3 options. > > - single vhost thread > > - vhost thread per cpu > > - vhost thread per guest virtio interface > > Here are the results with netperf TCP_STREAM 64K guest to host on a > 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host > CPU utilization. 
> > Current default single vhost thread > ----------------------------------- > 1 guest: 12500 37% > 2 guests: 12800 46% > 3 guests: 12600 47% > 4 guests: 12200 47% > 5 guests: 12000 47% > 6 guests: 11700 47% > 7 guests: 11340 47% > 8 guests: 11200 48% > > vhost thread per cpu > -------------------- > 1 guest: 4900 25% > 2 guests: 10800 49% > 3 guests: 17100 67% > 4 guests: 20400 84% > 5 guests: 21000 90% > 6 guests: 22500 92% > 7 guests: 23500 96% > 8 guests: 24500 99% > > vhost thread per guest interface > -------------------------------- > 1 guest: 12500 37% > 2 guests: 21000 72% > 3 guests: 21600 79% > 4 guests: 21600 85% > 5 guests: 22500 89% > 6 guests: 22800 94% > 7 guests: 24500 98% > 8 guests: 26400 99% We can also have a thread per vq. Does it help? > Thanks > Sridhar > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2010-04-11 at 18:47 +0300, Michael S. Tsirkin wrote: > On Thu, Apr 08, 2010 at 05:05:42PM -0700, Sridhar Samudrala wrote: > > On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote: > > > On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: > > > > On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: > > > > > Make vhost scalable by creating a separate vhost thread per vhost > > > > > device. This provides better scaling across multiple guests and with > > > > > multiple interfaces in a guest. > > > > > > > > Thanks for looking into this. An alternative approach is > > > > to simply replace create_singlethread_workqueue with > > > > create_workqueue which would get us a thread per host CPU. > > > > > > > > It seems that in theory this should be the optimal approach > > > > wrt CPU locality, however, in practice a single thread > > > > seems to get better numbers. I have a TODO to investigate this. > > > > Could you try looking into this? > > > > > > Yes. I tried using create_workqueue(), but the results were not good > > > atleast when the number of guest interfaces is less than the number > > > of CPUs. I didn't try more than 8 guests. > > > Creating a separate thread per guest interface seems to be more > > > scalable based on the testing i have done so far. > > > > > > I will try some more tests and get some numbers to compare the following > > > 3 options. > > > - single vhost thread > > > - vhost thread per cpu > > > - vhost thread per guest virtio interface > > > > Here are the results with netperf TCP_STREAM 64K guest to host on a > > 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host > > CPU utilization. 
> > > > Current default single vhost thread > > ----------------------------------- > > 1 guest: 12500 37% > > 2 guests: 12800 46% > > 3 guests: 12600 47% > > 4 guests: 12200 47% > > 5 guests: 12000 47% > > 6 guests: 11700 47% > > 7 guests: 11340 47% > > 8 guests: 11200 48% > > > > vhost thread per cpu > > -------------------- > > 1 guest: 4900 25% > > 2 guests: 10800 49% > > 3 guests: 17100 67% > > 4 guests: 20400 84% > > 5 guests: 21000 90% > > 6 guests: 22500 92% > > 7 guests: 23500 96% > > 8 guests: 24500 99% > > > > vhost thread per guest interface > > -------------------------------- > > 1 guest: 12500 37% > > 2 guests: 21000 72% > > 3 guests: 21600 79% > > 4 guests: 21600 85% > > 5 guests: 22500 89% > > 6 guests: 22800 94% > > 7 guests: 24500 98% > > 8 guests: 26400 99% > > > > Thanks > > Sridhar > > > Consider using Ingo's perf tool to get error bars, but looks good > overall. What do you mean by getting error bars? > One thing I note though is that we seem to be able to > consume up to 99% CPU now. So I think with this approach > we can no longer claim that we are just like some other parts of > networking stack, doing work outside any cgroup, and we should > make the vhost thread inherit the cgroup and cpu mask > from the process calling SET_OWNER. Yes. I am not sure what is the right interface to do this, but this should also allow binding qemu to a set of cpus and automatically having vhost thread inherit the same cpu mask. Thanks Sridhar -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 12, 2010 at 10:35:31AM -0700, Sridhar Samudrala wrote: > On Sun, 2010-04-11 at 18:47 +0300, Michael S. Tsirkin wrote: > > On Thu, Apr 08, 2010 at 05:05:42PM -0700, Sridhar Samudrala wrote: > > > On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote: > > > > On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: > > > > > On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: > > > > > > Make vhost scalable by creating a separate vhost thread per vhost > > > > > > device. This provides better scaling across multiple guests and with > > > > > > multiple interfaces in a guest. > > > > > > > > > > Thanks for looking into this. An alternative approach is > > > > > to simply replace create_singlethread_workqueue with > > > > > create_workqueue which would get us a thread per host CPU. > > > > > > > > > > It seems that in theory this should be the optimal approach > > > > > wrt CPU locality, however, in practice a single thread > > > > > seems to get better numbers. I have a TODO to investigate this. > > > > > Could you try looking into this? > > > > > > > > Yes. I tried using create_workqueue(), but the results were not good > > > > atleast when the number of guest interfaces is less than the number > > > > of CPUs. I didn't try more than 8 guests. > > > > Creating a separate thread per guest interface seems to be more > > > > scalable based on the testing i have done so far. > > > > > > > > I will try some more tests and get some numbers to compare the following > > > > 3 options. > > > > - single vhost thread > > > > - vhost thread per cpu > > > > - vhost thread per guest virtio interface > > > > > > Here are the results with netperf TCP_STREAM 64K guest to host on a > > > 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host > > > CPU utilization. 
> > > > > > Current default single vhost thread > > > ----------------------------------- > > > 1 guest: 12500 37% > > > 2 guests: 12800 46% > > > 3 guests: 12600 47% > > > 4 guests: 12200 47% > > > 5 guests: 12000 47% > > > 6 guests: 11700 47% > > > 7 guests: 11340 47% > > > 8 guests: 11200 48% > > > > > > vhost thread per cpu > > > -------------------- > > > 1 guest: 4900 25% > > > 2 guests: 10800 49% > > > 3 guests: 17100 67% > > > 4 guests: 20400 84% > > > 5 guests: 21000 90% > > > 6 guests: 22500 92% > > > 7 guests: 23500 96% > > > 8 guests: 24500 99% > > > > > > vhost thread per guest interface > > > -------------------------------- > > > 1 guest: 12500 37% > > > 2 guests: 21000 72% > > > 3 guests: 21600 79% > > > 4 guests: 21600 85% > > > 5 guests: 22500 89% > > > 6 guests: 22800 94% > > > 7 guests: 24500 98% > > > 8 guests: 26400 99% > > > > > > Thanks > > > Sridhar > > > > > > Consider using Ingo's perf tool to get error bars, but looks good > > overall. > > What do you mean by getting error bars? How noisy are the numbers? I'd like to see something along the lines of 85% +- 2% > > One thing I note though is that we seem to be able to > > consume up to 99% CPU now. So I think with this approach > > we can no longer claim that we are just like some other parts of > > networking stack, doing work outside any cgroup, and we should > > make the vhost thread inherit the cgroup and cpu mask > > from the process calling SET_OWNER. > > Yes. I am not sure what is the right interface to do this, I think we'll have to extend work queue API for this. > but this should also allow binding qemu to a set of cpus and > automatically having vhost thread inherit the same cpu mask. For numa, yes. Also need to inherit cgroup. > Thanks > Sridhar
>>> >>>Consider using Ingo's perf tool to get error bars, but looks good >>>overall. >> >>What do you mean by getting error bars? > > > How noisy are the numbers? > I'd like to see something along the lines of 85% +- 2% In netperf terms that would be adding the confidence intervals calculations to the results - which will be done by that "runemomniagg2.sh" script I mentioned. When running multiple instance tests, it is very important to set the min and max iterations to the same value so no instance thinks to finish early. The script does that, just want to make sure that those leveraging it do the same. happy benchmarking, rick jones -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index a6a88df..29aa80f 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -339,8 +339,10 @@ static int vhost_net_open(struct inode *inode, struct file *f) return r; } - vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); - vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, + &n->dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, + &n->dev); n->tx_poll_state = VHOST_NET_POLL_DISABLED; f->private_data = n; @@ -643,25 +645,14 @@ static struct miscdevice vhost_net_misc = { int vhost_net_init(void) { - int r = vhost_init(); - if (r) - goto err_init; - r = misc_register(&vhost_net_misc); - if (r) - goto err_reg; - return 0; -err_reg: - vhost_cleanup(); -err_init: - return r; - + return misc_register(&vhost_net_misc); } + module_init(vhost_net_init); void vhost_net_exit(void) { misc_deregister(&vhost_net_misc); - vhost_cleanup(); } module_exit(vhost_net_exit); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 7bd7a1e..243f4d3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -36,8 +36,6 @@ enum { VHOST_MEMORY_F_LOG = 0x1, }; -static struct workqueue_struct *vhost_workqueue; - static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { @@ -56,18 +54,19 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, if (!((unsigned long)key & poll->mask)) return 0; - queue_work(vhost_workqueue, &poll->work); + queue_work(poll->dev->wq, &poll->work); return 0; } /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, work_func_t func, - unsigned long mask) + unsigned long mask, struct vhost_dev *dev) { INIT_WORK(&poll->work, func); init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); init_poll_funcptr(&poll->table, vhost_poll_func); poll->mask = mask; + poll->dev = dev; } /* Start polling a 
file. We add ourselves to file's wait queue. The caller must @@ -96,7 +95,7 @@ void vhost_poll_flush(struct vhost_poll *poll) void vhost_poll_queue(struct vhost_poll *poll) { - queue_work(vhost_workqueue, &poll->work); + queue_work(poll->dev->wq, &poll->work); } static void vhost_vq_reset(struct vhost_dev *dev, @@ -128,6 +127,11 @@ long vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue *vqs, int nvqs) { int i; + + dev->wq = create_singlethread_workqueue("vhost"); + if (!dev->wq) + return -ENOMEM; + dev->vqs = vqs; dev->nvqs = nvqs; mutex_init(&dev->mutex); @@ -143,7 +147,7 @@ long vhost_dev_init(struct vhost_dev *dev, if (dev->vqs[i].handle_kick) vhost_poll_init(&dev->vqs[i].poll, dev->vqs[i].handle_kick, - POLLIN); + POLLIN, dev); } return 0; } @@ -216,6 +220,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev) if (dev->mm) mmput(dev->mm); dev->mm = NULL; + + destroy_workqueue(dev->wq); } static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) @@ -1095,16 +1101,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq) vq_err(vq, "Failed to enable notification at %p: %d\n", &vq->used->flags, r); } - -int vhost_init(void) -{ - vhost_workqueue = create_singlethread_workqueue("vhost"); - if (!vhost_workqueue) - return -ENOMEM; - return 0; -} - -void vhost_cleanup(void) -{ - destroy_workqueue(vhost_workqueue); -} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 44591ba..60fefd0 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -29,10 +29,11 @@ struct vhost_poll { /* struct which will handle all actual work. 
*/ struct work_struct work; unsigned long mask; + struct vhost_dev *dev; }; void vhost_poll_init(struct vhost_poll *poll, work_func_t func, - unsigned long mask); + unsigned long mask, struct vhost_dev *dev); void vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_flush(struct vhost_poll *poll); @@ -110,6 +111,7 @@ struct vhost_dev { int nvqs; struct file *log_file; struct eventfd_ctx *log_ctx; + struct workqueue_struct *wq; }; long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); @@ -136,9 +138,6 @@ bool vhost_enable_notify(struct vhost_virtqueue *); int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len); -int vhost_init(void); -void vhost_cleanup(void); - #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ if ((vq)->error_ctx) \
Make vhost scalable by creating a separate vhost thread per vhost device. This provides better scaling across multiple guests and with multiple interfaces in a guest. I am seeing better aggregated throughput/latency when running netperf across multiple guests or multiple interfaces in a guest in parallel with this patch. Signed-off-by: Sridhar Samudrala <sri@us.ibm.com> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html