Message ID | 20100505205814.GA7090@redhat.com |
---|---|
State | New |
Headers | show |
On 05/05/2010 11:58 PM, Michael S. Tsirkin wrote: > Generally, the Host end of the virtio ring doesn't need to see where > Guest is up to in consuming the ring. However, to completely understand > what's going on from the outside, this information must be exposed. > For example, host can reduce the number of interrupts by detecting > that the guest is currently handling previous buffers. > > Fortunately, we have room to expand: the ring is always a whole number > of pages and there's hundreds of bytes of padding after the avail ring > and the used ring, whatever the number of descriptors (which must be a > power of 2). > > We add a feature bit so the guest can tell the host that it's writing > out the current value there, if it wants to use that. > > This is based on a patch by Rusty Russell, with the main difference > being that we dedicate a feature bit to guest to tell the host it is > writing the used index. This way we don't need to force host to publish > the last available index until we have a use for it. > > Signed-off-by: Rusty Russell<rusty@rustcorp.com.au> > Signed-off-by: Michael S. Tsirkin<mst@redhat.com> > --- > > Rusty, > this is a simplified form of a patch you posted in the past. > I have a vhost patch that, using this feature, shows external > to host bandwidth grow from 5 to 7 GB/s, by avoiding You mean external to guest I guess. We have a similar issue with virtio-blk - when using very fast multi-spindle storage on the host side, there are too many irq injection events. This patch should probably reduce them allot. The principle exactly matches the Xen ring. > an interrupt in the window after previous interrupt > was sent and before interrupts were disabled for the vq. > With vhost under some external to host loads I see > this window being hit about 30% sometimes. > > I'm finalizing the host bits and plan to send > the final version for inclusion when all's ready, > but I'd like to hear comments meanwhile. 
> > drivers/virtio/virtio_ring.c | 28 +++++++++++++++++----------- > include/linux/virtio_ring.h | 14 +++++++++++++- > 2 files changed, 30 insertions(+), 12 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 1ca8890..7729aba 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -89,9 +89,6 @@ struct vring_virtqueue > /* Number we've added since last sync. */ > unsigned int num_added; > > - /* Last used index we've seen. */ > - u16 last_used_idx; > - > /* How to notify other side. FIXME: commonalize hcalls! */ > void (*notify)(struct virtqueue *vq); > > @@ -285,12 +282,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) > > static inline bool more_used(const struct vring_virtqueue *vq) > { > - return vq->last_used_idx != vq->vring.used->idx; > + return *vq->vring.last_used_idx != vq->vring.used->idx; > } > > void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > { > struct vring_virtqueue *vq = to_vvq(_vq); > + struct vring_used_elem *u; > void *ret; > unsigned int i; > > @@ -307,12 +305,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > return NULL; > } > > - /* Only get used array entries after they have been exposed by host. */ > - virtio_rmb(); > - > - i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; > - *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; > + /* Only get used array entries after they have been exposed by host. > + * Need mb(), not just rmb() because we write last_used_idx below. */ > + virtio_mb(); > > + u =&vq->vring.used->ring[*vq->vring.last_used_idx % vq->vring.num]; > + i = u->id; > + *len = u->len; > if (unlikely(i>= vq->vring.num)) { > BAD_RING(vq, "id %u out of range\n", i); > return NULL; > @@ -325,7 +324,8 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > /* detach_buf clears data, so grab it now. 
*/ > ret = vq->data[i]; > detach_buf(vq, i); > - vq->last_used_idx++; > + (*vq->vring.last_used_idx)++; > + > END_USE(vq); > return ret; > } > @@ -431,7 +431,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, > vq->vq.name = name; > vq->notify = notify; > vq->broken = false; > - vq->last_used_idx = 0; > + *vq->vring.last_used_idx = 0; > vq->num_added = 0; > list_add_tail(&vq->vq.list,&vdev->vqs); > #ifdef DEBUG > @@ -440,6 +440,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, > > vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); > > + /* We publish used index whether Host offers it or not: if not, it's > + * junk space anyway. But calling this acknowledges the feature. */ > + virtio_has_feature(vdev, VIRTIO_RING_F_PUBLISH_USED); > + > /* No callback? Tell other side not to bother us. */ > if (!callback) > vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; > @@ -473,6 +477,8 @@ void vring_transport_features(struct virtio_device *vdev) > switch (i) { > case VIRTIO_RING_F_INDIRECT_DESC: > break; > + case VIRTIO_RING_F_PUBLISH_INDICES: > + break; > default: > /* We don't understand this bit. */ > clear_bit(i, vdev->features); > diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h > index e4d144b..9d01de9 100644 > --- a/include/linux/virtio_ring.h > +++ b/include/linux/virtio_ring.h > @@ -29,6 +29,9 @@ > /* We support indirect buffer descriptors */ > #define VIRTIO_RING_F_INDIRECT_DESC 28 > > +/* The Guest publishes last-seen used index at the end of the avail ring. */ > +#define VIRTIO_RING_F_PUBLISH_USED 29 > + > /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ > struct vring_desc { > /* Address (guest-physical). */ > @@ -69,6 +72,8 @@ struct vring { > struct vring_avail *avail; > > struct vring_used *used; > + /* Last used index seen by the Guest. 
*/ > + __u16 *last_used_idx; > }; > > /* The standard layout for the ring is a continuous chunk of memory which looks > @@ -83,6 +88,7 @@ struct vring { > * __u16 avail_flags; > * __u16 avail_idx; > * __u16 available[num]; > + * __u16 last_used_idx; > * > * // Padding to the next align boundary. > * char pad[]; > @@ -101,11 +107,17 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, > vr->avail = p + num*sizeof(struct vring_desc); > vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + align-1) > & ~(align - 1)); > + /* We publish the last-seen used index at the end of the available ring. > + * It is at the end for backwards compatibility. */ > + vr->last_used_idx =&(vr)->avail->ring[num]; > + /* Verify that last used index does not spill over the used ring. */ > + BUG_ON((void *)vr->last_used_idx + > + sizeof *vr->last_used_idx> (void *)vr->used); > } > > static inline unsigned vring_size(unsigned int num, unsigned long align) > { > - return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) > + return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num) > + align - 1)& ~(align - 1)) > + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num; > }
On Thu, 6 May 2010 06:28:14 am Michael S. Tsirkin wrote: > Rusty, > this is a simplified form of a patch you posted in the past. > I have a vhost patch that, using this feature, shows external > to host bandwidth grow from 5 to 7 GB/s, by avoiding > an interrupt in the window after previous interrupt > was sent and before interrupts were disabled for the vq. > With vhost under some external to host loads I see > this window being hit about 30% sometimes. Fascinating. So you use this to guess if the guest is still processing? I haven't thought about it hard, but is that racy? Obviously happy to apply this when you finalize it. Thanks! Rusty.
On Thu, May 06, 2010 at 12:01:34PM +0930, Rusty Russell wrote: > On Thu, 6 May 2010 06:28:14 am Michael S. Tsirkin wrote: > > Rusty, > > this is a simplified form of a patch you posted in the past. > > I have a vhost patch that, using this feature, shows external > > to host bandwidth grow from 5 to 7 GB/s, by avoiding > > an interrupt in the window after previous interrupt > > was sent and before interrupts were disabled for the vq. > > With vhost under some external to host loads I see > > this window being hit about 30% sometimes. > > Fascinating. So you use this to guess if the guest is still processing? Exactly. > I haven't thought about it hard, but is that racy? I thought about this really hard and I don't think it's necessarily racy, as long as (pseudo code): guest: start: disable interrupts read(used) write(last used) enable interrupts mb() if read(used) goto start host: write used mb() if (reached(read(last used), used)) interrupt IOW, guest does read then write then read, host does write then read. Now, the only way to miss an interrupt is if we read last used value before it is written so we think guest is still processing. But if that happens, this means that host has written used before guest updated last used, and for this reason peek will see used value and restart polling. IOW, to miss an interrupt host must read a stale value. For this host must read before guest write, then host write already happened, so second read in guest will see an updated value host has written. Now, I also added an mb() in guest between read and write so that last used index write can not get ahead of used index read. It does feel good to have it there, but I can not say why it's helpful. Works fine without it, but then these subtle races might be hard to trigger. What do you think? > Obviously happy to apply this when you finalize it. > > Thanks! > Rusty.
On 05/05/2010 11:58 PM, Michael S. Tsirkin wrote: > + /* We publish the last-seen used index at the end of the available ring. > + * It is at the end for backwards compatibility. */ > + vr->last_used_idx =&(vr)->avail->ring[num]; > + /* Verify that last used index does not spill over the used ring. */ > + BUG_ON((void *)vr->last_used_idx + > + sizeof *vr->last_used_idx> (void *)vr->used); > } > Shouldn't this be on its own cache line? One way to achieve this is to allocate it at the end.
On Thu, 6 May 2010 07:30:00 pm Avi Kivity wrote: > On 05/05/2010 11:58 PM, Michael S. Tsirkin wrote: > > + /* We publish the last-seen used index at the end of the available ring. > > + * It is at the end for backwards compatibility. */ > > + vr->last_used_idx =&(vr)->avail->ring[num]; > > + /* Verify that last used index does not spill over the used ring. */ > > + BUG_ON((void *)vr->last_used_idx + > > + sizeof *vr->last_used_idx> (void *)vr->used); > > } > > > > Shouldn't this be on its own cache line? It's next to the available ring; because that's where the guest publishes its data. That whole page is guest-write, host-read. Putting it on a cacheline by itself would be a slight pessimization; the host cpu would have to get the last_used_idx cacheline and the avail descriptor cacheline every time. This way, they are sometimes the same cacheline. Hope that clarifies, Rusty.
On Thu, 6 May 2010 03:49:46 pm Michael S. Tsirkin wrote: > Now, I also added an mb() in guest between read and write so > that last used index write can not get ahead of used index read. > It does feel good to have it there, but I can not say why > it's helpful. Works fine without it, but then these > subtle races might be hard to trigger. What do you think? I couldn't see that in the patch? I don't think it's necessary though, since the write of last_used depends on the read of used (and no platform we care about would reorder such a thing). I'm reasonably happy, but we should write some convenient test for missing interrupts. I'm thinking of a sender which does a loop: blasts 1MB of UDP packets, then prints the time and sleep(1). The receiver would print the time every 1MB of received data. The two times should almost exactly correspond. Assuming that the network doesn't overflow and lose stuff, this should identify any missing wakeup/interrupts (depending on direction used). Cheers, Rusty.
On Fri, May 07, 2010 at 01:03:28PM +0930, Rusty Russell wrote: > On Thu, 6 May 2010 03:49:46 pm Michael S. Tsirkin wrote: > > Now, I also added an mb() in guest between read and write so > > that last used index write can not get ahead of used index read. > > It does feel good to have it there, but I can not say why > > it's helpful. Works fine without it, but then these > > subtle races might be hard to trigger. What do you think? > > I couldn't see that in the patch? I don't think it's necessary > though, since the write of depends last_used depends on the read of > used (and no platform we care about would reorder such a thing). Well, there's no data dependency, is there? > I'm reasonably happy, but we should write some convenient test for > missing interrupts. > > I'm thinking of a sender which does a loop: blasts 1MB of UDP packets, > then prints the time and sleep(1). The receiver would print the time > every 1MB of received data. The two times should almost exactly correspond. > > Assuming that the network doesn't overflow and lose stuff, this should > identify any missing wakeup/interrupts (depending on direction used). > > Cheers, > Rusty.
* Michael S. Tsirkin <mst@redhat.com> [2010-05-05 16:37]: > Generally, the Host end of the virtio ring doesn't need to see where > Guest is up to in consuming the ring. However, to completely understand > what's going on from the outside, this information must be exposed. > For example, host can reduce the number of interrupts by detecting > that the guest is currently handling previous buffers. > > Fortunately, we have room to expand: the ring is always a whole number > of pages and there's hundreds of bytes of padding after the avail ring > and the used ring, whatever the number of descriptors (which must be a > power of 2). > > We add a feature bit so the guest can tell the host that it's writing > out the current value there, if it wants to use that. > > This is based on a patch by Rusty Russell, with the main difference > being that we dedicate a feature bit to guest to tell the host it is > writing the used index. This way we don't need to force host to publish > the last available index until we have a use for it. > > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > --- > > Rusty, > this is a simplified form of a patch you posted in the past. > I have a vhost patch that, using this feature, shows external > to host bandwidth grow from 5 to 7 GB/s, by avoiding > an interrupt in the window after previous interrupt > was sent and before interrupts were disabled for the vq. > With vhost under some external to host loads I see > this window being hit about 30% sometimes. > > I'm finalizing the host bits and plan to send > the final version for inclusion when all's ready, > but I'd like to hear comments meanwhile. 
> > drivers/virtio/virtio_ring.c | 28 +++++++++++++++++----------- > include/linux/virtio_ring.h | 14 +++++++++++++- > 2 files changed, 30 insertions(+), 12 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 1ca8890..7729aba 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -89,9 +89,6 @@ struct vring_virtqueue > /* Number we've added since last sync. */ > unsigned int num_added; > > - /* Last used index we've seen. */ > - u16 last_used_idx; > - > /* How to notify other side. FIXME: commonalize hcalls! */ > void (*notify)(struct virtqueue *vq); > > @@ -285,12 +282,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) > > static inline bool more_used(const struct vring_virtqueue *vq) > { > - return vq->last_used_idx != vq->vring.used->idx; > + return *vq->vring.last_used_idx != vq->vring.used->idx; > } > > void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > { > struct vring_virtqueue *vq = to_vvq(_vq); > + struct vring_used_elem *u; > void *ret; > unsigned int i; > > @@ -307,12 +305,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > return NULL; > } > > - /* Only get used array entries after they have been exposed by host. */ > - virtio_rmb(); > - > - i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; > - *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; > + /* Only get used array entries after they have been exposed by host. > + * Need mb(), not just rmb() because we write last_used_idx below. */ > + virtio_mb(); > > + u = &vq->vring.used->ring[*vq->vring.last_used_idx % vq->vring.num]; > + i = u->id; > + *len = u->len; > if (unlikely(i >= vq->vring.num)) { > BAD_RING(vq, "id %u out of range\n", i); > return NULL; > @@ -325,7 +324,8 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > /* detach_buf clears data, so grab it now. 
*/ > ret = vq->data[i]; > detach_buf(vq, i); > - vq->last_used_idx++; > + (*vq->vring.last_used_idx)++; > + > END_USE(vq); > return ret; > } > @@ -431,7 +431,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, > vq->vq.name = name; > vq->notify = notify; > vq->broken = false; > - vq->last_used_idx = 0; > + *vq->vring.last_used_idx = 0; > vq->num_added = 0; > list_add_tail(&vq->vq.list, &vdev->vqs); > #ifdef DEBUG > @@ -440,6 +440,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, > > vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); > > + /* We publish used index whether Host offers it or not: if not, it's > + * junk space anyway. But calling this acknowledges the feature. */ > + virtio_has_feature(vdev, VIRTIO_RING_F_PUBLISH_USED); > + You use VIRTIO_RING_F_PUBLISH_USED here, but VIRTIO_RING_F_PUBLISH_INDICES below... > /* No callback? Tell other side not to bother us. */ > if (!callback) > vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; > @@ -473,6 +477,8 @@ void vring_transport_features(struct virtio_device *vdev) > switch (i) { > case VIRTIO_RING_F_INDIRECT_DESC: > break; > + case VIRTIO_RING_F_PUBLISH_INDICES: > + break; Here ^^^ > default: > /* We don't understand this bit. */ > clear_bit(i, vdev->features); > diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h > index e4d144b..9d01de9 100644 > --- a/include/linux/virtio_ring.h > +++ b/include/linux/virtio_ring.h > @@ -29,6 +29,9 @@ > /* We support indirect buffer descriptors */ > #define VIRTIO_RING_F_INDIRECT_DESC 28 > > +/* The Guest publishes last-seen used index at the end of the avail ring. */ > +#define VIRTIO_RING_F_PUBLISH_USED 29 > + And here; is it PUBLISHED_USED or PUBLISHED_INDICIES ?
On 05/07/2010 06:23 AM, Rusty Russell wrote: > On Thu, 6 May 2010 07:30:00 pm Avi Kivity wrote: > >> On 05/05/2010 11:58 PM, Michael S. Tsirkin wrote: >> >>> + /* We publish the last-seen used index at the end of the available ring. >>> + * It is at the end for backwards compatibility. */ >>> + vr->last_used_idx =&(vr)->avail->ring[num]; >>> + /* Verify that last used index does not spill over the used ring. */ >>> + BUG_ON((void *)vr->last_used_idx + >>> + sizeof *vr->last_used_idx> (void *)vr->used); >>> } >>> >>> >> Shouldn't this be on its own cache line? >> > It's next to the available ring; because that's where the guest publishes > its data. That whole page is guest-write, host-read. > > Putting it on a cacheline by itself would be a slight pessimization; the host > cpu would have to get the last_used_idx cacheline and the avail descriptor > cacheline every time. This way, they are sometimes the same cacheline. > If one peer writes the tail of the available ring, while the other reads last_used_idx, it's a false bounce, no? Having things on the same cacheline is only worthwhile if they are accessed at the same time.
On Tue, May 11, 2010 at 01:46:08PM -0500, Ryan Harper wrote: > * Michael S. Tsirkin <mst@redhat.com> [2010-05-05 16:37]: > > Generally, the Host end of the virtio ring doesn't need to see where > > Guest is up to in consuming the ring. However, to completely understand > > what's going on from the outside, this information must be exposed. > > For example, host can reduce the number of interrupts by detecting > > that the guest is currently handling previous buffers. > > > > Fortunately, we have room to expand: the ring is always a whole number > > of pages and there's hundreds of bytes of padding after the avail ring > > and the used ring, whatever the number of descriptors (which must be a > > power of 2). > > > > We add a feature bit so the guest can tell the host that it's writing > > out the current value there, if it wants to use that. > > > > This is based on a patch by Rusty Russell, with the main difference > > being that we dedicate a feature bit to guest to tell the host it is > > writing the used index. This way we don't need to force host to publish > > the last available index until we have a use for it. > > > > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > > --- > > > > Rusty, > > this is a simplified form of a patch you posted in the past. > > I have a vhost patch that, using this feature, shows external > > to host bandwidth grow from 5 to 7 GB/s, by avoiding > > an interrupt in the window after previous interrupt > > was sent and before interrupts were disabled for the vq. > > With vhost under some external to host loads I see > > this window being hit about 30% sometimes. > > > > I'm finalizing the host bits and plan to send > > the final version for inclusion when all's ready, > > but I'd like to hear comments meanwhile. 
> > > > drivers/virtio/virtio_ring.c | 28 +++++++++++++++++----------- > > include/linux/virtio_ring.h | 14 +++++++++++++- > > 2 files changed, 30 insertions(+), 12 deletions(-) > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > > index 1ca8890..7729aba 100644 > > --- a/drivers/virtio/virtio_ring.c > > +++ b/drivers/virtio/virtio_ring.c > > @@ -89,9 +89,6 @@ struct vring_virtqueue > > /* Number we've added since last sync. */ > > unsigned int num_added; > > > > - /* Last used index we've seen. */ > > - u16 last_used_idx; > > - > > /* How to notify other side. FIXME: commonalize hcalls! */ > > void (*notify)(struct virtqueue *vq); > > > > @@ -285,12 +282,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) > > > > static inline bool more_used(const struct vring_virtqueue *vq) > > { > > - return vq->last_used_idx != vq->vring.used->idx; > > + return *vq->vring.last_used_idx != vq->vring.used->idx; > > } > > > > void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > > { > > struct vring_virtqueue *vq = to_vvq(_vq); > > + struct vring_used_elem *u; > > void *ret; > > unsigned int i; > > > > @@ -307,12 +305,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > > return NULL; > > } > > > > - /* Only get used array entries after they have been exposed by host. */ > > - virtio_rmb(); > > - > > - i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; > > - *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; > > + /* Only get used array entries after they have been exposed by host. > > + * Need mb(), not just rmb() because we write last_used_idx below. 
*/ > > + virtio_mb(); > > > > + u = &vq->vring.used->ring[*vq->vring.last_used_idx % vq->vring.num]; > > + i = u->id; > > + *len = u->len; > > if (unlikely(i >= vq->vring.num)) { > > BAD_RING(vq, "id %u out of range\n", i); > > return NULL; > > @@ -325,7 +324,8 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > > /* detach_buf clears data, so grab it now. */ > > ret = vq->data[i]; > > detach_buf(vq, i); > > - vq->last_used_idx++; > > + (*vq->vring.last_used_idx)++; > > + > > END_USE(vq); > > return ret; > > } > > @@ -431,7 +431,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, > > vq->vq.name = name; > > vq->notify = notify; > > vq->broken = false; > > - vq->last_used_idx = 0; > > + *vq->vring.last_used_idx = 0; > > vq->num_added = 0; > > list_add_tail(&vq->vq.list, &vdev->vqs); > > #ifdef DEBUG > > @@ -440,6 +440,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, > > > > vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); > > > > + /* We publish used index whether Host offers it or not: if not, it's > > + * junk space anyway. But calling this acknowledges the feature. */ > > + virtio_has_feature(vdev, VIRTIO_RING_F_PUBLISH_USED); > > + > > You use VIRTIO_RING_F_PUBLISH_USED here, but > VIRTIO_RING_F_PUBLISH_INDICES below... > > > > /* No callback? Tell other side not to bother us. */ > > if (!callback) > > vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; > > @@ -473,6 +477,8 @@ void vring_transport_features(struct virtio_device *vdev) > > switch (i) { > > case VIRTIO_RING_F_INDIRECT_DESC: > > break; > > + case VIRTIO_RING_F_PUBLISH_INDICES: > > + break; > > Here ^^^ > > > default: > > /* We don't understand this bit. 
*/ > > clear_bit(i, vdev->features); > > diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h > > index e4d144b..9d01de9 100644 > > --- a/include/linux/virtio_ring.h > > +++ b/include/linux/virtio_ring.h > > @@ -29,6 +29,9 @@ > > /* We support indirect buffer descriptors */ > > #define VIRTIO_RING_F_INDIRECT_DESC 28 > > > > +/* The Guest publishes last-seen used index at the end of the avail ring. */ > > +#define VIRTIO_RING_F_PUBLISH_USED 29 > > + > > And here; is it PUBLISHED_USED or PUBLISHED_INDICIES ? > > > -- > Ryan Harper > Software Engineer; Linux Technology Center > IBM Corp., Austin, Tx > ryanh@us.ibm.com Thanks, this is PUBLISHED_USED all over. This is fixed in the version I'm testing.
On Tue, May 11, 2010 at 10:27:22PM +0300, Avi Kivity wrote: > On 05/07/2010 06:23 AM, Rusty Russell wrote: >> On Thu, 6 May 2010 07:30:00 pm Avi Kivity wrote: >> >>> On 05/05/2010 11:58 PM, Michael S. Tsirkin wrote: >>> >>>> + /* We publish the last-seen used index at the end of the available ring. >>>> + * It is at the end for backwards compatibility. */ >>>> + vr->last_used_idx =&(vr)->avail->ring[num]; >>>> + /* Verify that last used index does not spill over the used ring. */ >>>> + BUG_ON((void *)vr->last_used_idx + >>>> + sizeof *vr->last_used_idx> (void *)vr->used); >>>> } >>>> >>>> >>> Shouldn't this be on its own cache line? >>> >> It's next to the available ring; because that's where the guest publishes >> its data. That whole page is guest-write, host-read. >> >> Putting it on a cacheline by itself would be a slight pessimization; the host >> cpu would have to get the last_used_idx cacheline and the avail descriptor >> cacheline every time. This way, they are sometimes the same cacheline. >> > > If one peer writes the tail of the available ring, while the other reads > last_used_idx, it's a false bounce, no? > > Having things on the same cacheline is only worthwhile if they are > accessed at the same time. Yes, this is what I was trying to say. avail flags and used index *are* accessed at the same time, so there could be an advantage to sharing a cache line there. All this should be kept in mind if we ever do VIRTIO_RING_F_NEW_LAYOUT.
On Wed, 12 May 2010 04:57:22 am Avi Kivity wrote: > On 05/07/2010 06:23 AM, Rusty Russell wrote: > > On Thu, 6 May 2010 07:30:00 pm Avi Kivity wrote: > > > >> On 05/05/2010 11:58 PM, Michael S. Tsirkin wrote: > >> > >>> + /* We publish the last-seen used index at the end of the available ring. > >>> + * It is at the end for backwards compatibility. */ > >>> + vr->last_used_idx =&(vr)->avail->ring[num]; > >>> + /* Verify that last used index does not spill over the used ring. */ > >>> + BUG_ON((void *)vr->last_used_idx + > >>> + sizeof *vr->last_used_idx> (void *)vr->used); > >>> } > >>> > >>> > >> Shouldn't this be on its own cache line? > >> > > It's next to the available ring; because that's where the guest publishes > > its data. That whole page is guest-write, host-read. > > > > Putting it on a cacheline by itself would be a slight pessimization; the host > > cpu would have to get the last_used_idx cacheline and the avail descriptor > > cacheline every time. This way, they are sometimes the same cacheline. > > If one peer writes the tail of the available ring, while the other reads > last_used_idx, it's a false bounce, no? I think we're talking about the last 2 entries of the avail ring. That means the worst case is 1 false bounce every time around the ring. I think that's why we're debating it instead of measuring it :) Note that this is a exclusive->shared->exclusive bounce only, too. Cheers, Rusty.
On 05/19/2010 10:39 AM, Rusty Russell wrote: > > I think we're talking about the last 2 entries of the avail ring. That means > the worst case is 1 false bounce every time around the ring. It's low, but why introduce an inefficiency when you can avoid doing it for the same effort? > I think that's > why we're debating it instead of measuring it :) > Measure before optimize is good for code but not for protocols. Protocols have to be robust against future changes. Virtio is warty enough already, we can't keep doing local optimizations. > Note that this is a exclusive->shared->exclusive bounce only, too. > A bounce is a bounce. Virtio is already way too bouncy due to the indirection between the avail/used rings and the descriptor pool. A device with out of order completion (like virtio-blk) will quickly randomize the unused descriptor indexes, so every descriptor fetch will require a bounce. In contrast, if the rings hold the descriptors themselves instead of pointers, we bounce (sizeof(descriptor)/cache_line_size) cache lines for every descriptor, amortized.
On Wed, May 19, 2010 at 11:06:42AM +0300, Avi Kivity wrote: > On 05/19/2010 10:39 AM, Rusty Russell wrote: >> >> I think we're talking about the last 2 entries of the avail ring. That means >> the worst case is 1 false bounce every time around the ring. > > It's low, but why introduce an inefficiency when you can avoid doing it > for the same effort? > >> I think that's >> why we're debating it instead of measuring it :) >> > > Measure before optimize is good for code but not for protocols. > Protocols have to be robust against future changes. Virtio is warty > enough already, we can't keep doing local optimizations. > >> Note that this is a exclusive->shared->exclusive bounce only, too. >> > > A bounce is a bounce. > > Virtio is already way too bouncy due to the indirection between the > avail/used rings and the descriptor pool. A device with out of order > completion (like virtio-blk) will quickly randomize the unused > descriptor indexes, so every descriptor fetch will require a bounce. > > In contrast, if the rings hold the descriptors themselves instead of > pointers, we bounce (sizeof(descriptor)/cache_line_size) cache lines for > every descriptor, amortized. On the other hand, consider that on fast path we are never using all of the ring. With a good allocator we might be able to keep reusing only small part of the ring, instead of wrapping around all of it all of the time. > -- > Do not meddle in the internals of kernels, for they are subtle and quick to panic.
On Wed, 19 May 2010 05:36:42 pm Avi Kivity wrote: > > Note that this is a exclusive->shared->exclusive bounce only, too. > > > > A bounce is a bounce. I tried to measure this to show that you were wrong, but I was only able to show that you're right. How annoying. Test code below. > Virtio is already way too bouncy due to the indirection between the > avail/used rings and the descriptor pool. I tried to do a more careful analysis below, and I think this is an overstatement. > A device with out of order > completion (like virtio-blk) will quickly randomize the unused > descriptor indexes, so every descriptor fetch will require a bounce. > > In contrast, if the rings hold the descriptors themselves instead of > pointers, we bounce (sizeof(descriptor)/cache_line_size) cache lines for > every descriptor, amortized. We already have indirect, this would be a logical next step. So let's think about it. The avail ring would contain 64 bit values, the used ring would contain indexes into the avail ring. So client writes descriptor page and adds to avail ring, then writes to index. Server reads index, avail ring, descriptor page (3). Writes used entry (1). Updates last_used (1). Client reads used (1), derefs avail (1), updates last_used (1), cleans descriptor page (1). That's 9 cacheline transfers, worst case. Best case of a half-full ring in steady state, assuming 128-byte cache lines, the avail ring costs are 1/16, the used entry is 1/64. This drops it to 6 and 9/64 transfers. (Note, the current scheme adds 2 more cacheline transfers, for the descriptor table, worst case. Assuming indirect, we get 2/8 xfer best case. Either way, it's not the main source of cacheline xfers). Can we do better? The obvious idea is to try to get rid of last_used and used, and use the ring itself. We would use an invalid entry to mark the head of the ring. Any other thoughts? Rusty.
On Thu, 20 May 2010 02:31:50 pm Rusty Russell wrote: > On Wed, 19 May 2010 05:36:42 pm Avi Kivity wrote: > > > Note that this is a exclusive->shared->exclusive bounce only, too. > > > > > > > A bounce is a bounce. > > I tried to measure this to show that you were wrong, but I was only able > to show that you're right. How annoying. Test code below. This time for sure! #define _GNU_SOURCE #include <unistd.h> #include <sched.h> #include <err.h> #include <stdbool.h> #include <stdint.h> #include <string.h> #include <stdlib.h> #include <stdio.h> #include <sys/time.h> #include <sys/mman.h> /* We share memory via an mmap. */ struct counter { unsigned int cacheline1; char pad[256]; unsigned int cacheline2; }; #define MAX_BOUNCES 100000000 enum mode { SHARE, UNSHARE, LOCKSHARE, LOCKUNSHARE, }; int main(int argc, char *argv[]) { cpu_set_t cpuset; volatile struct counter *counter; struct timeval start, stop; bool child; unsigned int count; uint64_t usec; enum mode mode; if (argc != 4) errx(1, "Usage: cachebounce share|unshare|lockshare|lockunshare <cpu0> <cpu1>"); if (strcmp(argv[1], "share") == 0) mode = SHARE; else if (strcmp(argv[1], "unshare") == 0) mode = UNSHARE; else if (strcmp(argv[1], "lockshare") == 0) mode = LOCKSHARE; else if (strcmp(argv[1], "lockunshare") == 0) mode = LOCKSHARE; else errx(1, "Usage: cachebounce share|unshare|lockshare|lockunshare <cpu0> <cpu1>"); CPU_ZERO(&cpuset); counter = mmap(NULL, getpagesize(), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0); if (counter == MAP_FAILED) err(1, "Mapping page"); /* Fault it in. */ counter->cacheline1 = counter->cacheline2 = 0; child = (fork() == 0); CPU_SET(atoi(argv[2 + child]), &cpuset); if (sched_setaffinity(getpid(), sizeof(cpu_set_t), &cpuset) != 0) err(1, "Calling sched_setaffinity()"); gettimeofday(&start, NULL); if (child) { count = 1; switch (mode) { case SHARE: while (count < MAX_BOUNCES) { /* Spin waiting for other side to change it. 
*/ while (counter->cacheline1 != count); count++; counter->cacheline1 = count; count++; } break; case UNSHARE: while (count < MAX_BOUNCES) { /* Spin waiting for other side to change it. */ while (counter->cacheline1 != count); count++; counter->cacheline2 = count; count++; } break; case LOCKSHARE: while (count < MAX_BOUNCES) { /* Spin waiting for other side to change it. */ while (__sync_val_compare_and_swap(&counter->cacheline1, count, count+1) != count); count += 2; } break; case LOCKUNSHARE: while (count < MAX_BOUNCES) { /* Spin waiting for other side to change it. */ while (counter->cacheline1 != count); __sync_val_compare_and_swap(&counter->cacheline2, count, count+1); count += 2; } break; } } else { count = 0; switch (mode) { case SHARE: while (count < MAX_BOUNCES) { /* Spin waiting for other side to change it. */ while (counter->cacheline1 != count); count++; counter->cacheline1 = count; count++; } break; case UNSHARE: while (count < MAX_BOUNCES) { /* Spin waiting for other side to change it. */ while (counter->cacheline2 != count); count++; counter->cacheline1 = count; count++; } break; case LOCKSHARE: while (count < MAX_BOUNCES) { /* Spin waiting for other side to change it. */ while (__sync_val_compare_and_swap(&counter->cacheline1, count, count+1) != count); count += 2; } break; case LOCKUNSHARE: while (count < MAX_BOUNCES) { /* Spin waiting for other side to change it. */ while (counter->cacheline2 != count); __sync_val_compare_and_swap(&counter->cacheline1, count, count+1); count += 2; } break; } } gettimeofday(&stop, NULL); usec = (stop.tv_sec * 1000000LL + stop.tv_usec) - (start.tv_sec * 1000000LL + start.tv_usec); printf("CPU %s: %s cacheline: %llu usec\n", argv[2+child], argv[1], usec); return 0; }
On 05/20/2010 01:33 AM, Michael S. Tsirkin wrote: > >> Virtio is already way too bouncy due to the indirection between the >> avail/used rings and the descriptor pool. A device with out of order >> completion (like virtio-blk) will quickly randomize the unused >> descriptor indexes, so every descriptor fetch will require a bounce. >> >> In contrast, if the rings hold the descriptors themselves instead of >> pointers, we bounce (sizeof(descriptor)/cache_line_size) cache lines for >> every descriptor, amortized. >> > On the other hand, consider that on fast path we are never using all > of the ring. With a good allocator we might be able to keep > reusing only small part of the ring, instead of wrapping around > all of it all of the time. > It's still suboptimal, we have to bounce both the avail/used rings and the descriptor pool, compared to just the descriptor ring with a direct design. Plus we don't need a fancy allocator. When amortizing cachelines, simple data structures win.
On 05/20/2010 08:01 AM, Rusty Russell wrote: > >> A device with out of order >> completion (like virtio-blk) will quickly randomize the unused >> descriptor indexes, so every descriptor fetch will require a bounce. >> >> In contrast, if the rings hold the descriptors themselves instead of >> pointers, we bounce (sizeof(descriptor)/cache_line_size) cache lines for >> every descriptor, amortized. >> > We already have indirect, this would be a logical next step. So let's > think about it. The avail ring would contain 64 bit values, the used ring > would contain indexes into the avail ring. > Have just one ring, no indexes. The producer places descriptors into the ring and updates the head, The consumer copies out descriptors to be processed and copies back in completed descriptors. Chaining is always linear. The descriptors contain a tag that allow the producer to identify the completion. Indirect only pays when there are enough descriptors in it to fill a couple of cache lines. Otherwise it's an extra bounce. We will always bounce here, that what happens when transferring data. The question is whether how many cache lines per descriptor. A pointer adds 1 bounce, linear descriptors cost 1/4 bounce, chained descriptors cost a bounce. So best is one ring of linearly chained descriptors. Indirect works when you have large requests (like block). > So client writes descriptor page and adds to avail ring, then writes to > index. > Server reads index, avail ring, descriptor page (3). Writes used > entry (1). Updates last_used (1). Client reads used (1), derefs avail (1), > updates last_used (1), cleans descriptor page (1). > > That's 9 cacheline transfers, worst case. Best case of a half-full ring > in steady state, assuming 128-byte cache lines, the avail ring costs are > 1/16, the used entry is 1/64. This drops it to 6 and 9/64 transfers. > Cache lines are 64 bytes these days. With a single ring, client writes descriptors (ceil(N/4)), updates head (1). 
Server reads head (1) copies out descriptors (ceil(N/4)), issues requests, copies back completions ((ceil(N/4)), updates tail (1). Client reads back tail and descriptors (1 + ceil(N/4)) Worst case: 4 + 4 * ceil(N/4). Best case I think this drops by half. > (Note, the current scheme adds 2 more cacheline transfers, for the descriptor > table, worst case. 2 bounces per descriptor due to random access. > Assuming indirect, we get 2/8 xfer best case. Either way, > it's not the main source of cacheline xfers). > Indirect adds a double bounce to get to the descriptor table, but any descriptors there are accessed linearly. It's only good when you have large chains. > Can we do better? The obvious idea is to try to get rid of last_used and > used, and use the ring itself. We would use an invalid entry to mark the > head of the ring. > Interesting! So a peer will read until it hits a wall. But how to update the wall atomically? Maybe we can have a flag in the descriptor indicate headness or tailness. Update looks ugly though: write descriptor with head flag, write next descriptor with head flag, remove flag from previous descriptor.
On Thu, May 20, 2010 at 02:31:50PM +0930, Rusty Russell wrote: > Can we do better? The obvious idea is to try to get rid of last_used and > used, and use the ring itself. We would use an invalid entry to mark the > head of the ring. > > Any other thoughts? > Rusty. We also need a way to avoid interrupts at least while we are processing the ring.
On Thu, 20 May 2010 04:30:56 pm Avi Kivity wrote: > On 05/20/2010 08:01 AM, Rusty Russell wrote: > > > >> A device with out of order > >> completion (like virtio-blk) will quickly randomize the unused > >> descriptor indexes, so every descriptor fetch will require a bounce. > >> > >> In contrast, if the rings hold the descriptors themselves instead of > >> pointers, we bounce (sizeof(descriptor)/cache_line_size) cache lines for > >> every descriptor, amortized. > >> > > We already have indirect, this would be a logical next step. So let's > > think about it. The avail ring would contain 64 bit values, the used ring > > would contain indexes into the avail ring. > > Have just one ring, no indexes. The producer places descriptors into > the ring and updates the head, The consumer copies out descriptors to > be processed and copies back in completed descriptors. Chaining is > always linear. The descriptors contain a tag that allow the producer to > identify the completion. This could definitely work. The original reason for the page boundaries was for untrusted inter-guest communication: with appropriate page protections they could see each other's rings and a simply inter-guest copy hypercall could verify that the other guest really exposed that data via virtio ring. But, cute as that is, we never did that. And it's not clear that it wins much over simply having the hypervisor read both rings directly. > > Can we do better? The obvious idea is to try to get rid of last_used and > > used, and use the ring itself. We would use an invalid entry to mark the > > head of the ring. > > Interesting! So a peer will read until it hits a wall. But how to > update the wall atomically? > > Maybe we can have a flag in the descriptor indicate headness or > tailness. Update looks ugly though: write descriptor with head flag, > write next descriptor with head flag, remove flag from previous descriptor. I was thinking a separate magic "invalid" entry. 
To publish a 3-descriptor chain, you would write descriptors 2 and 3, write an invalid entry at 4, barrier, write entry 1. It is a bit ugly, yes, but not terrible. I think that a simple simulator for this is worth writing, which tracks cacheline moves under various fullness scenarios... Cheers, Rusty.
On 05/20/2010 05:34 PM, Rusty Russell wrote: > >> Have just one ring, no indexes. The producer places descriptors into >> the ring and updates the head, The consumer copies out descriptors to >> be processed and copies back in completed descriptors. Chaining is >> always linear. The descriptors contain a tag that allow the producer to >> identify the completion. >> > This could definitely work. The original reason for the page boundaries > was for untrusted inter-guest communication: with appropriate page protections > they could see each other's rings and a simply inter-guest copy hypercall > could verify that the other guest really exposed that data via virtio ring. > > But, cute as that is, we never did that. And it's not clear that it wins > much over simply having the hypervisor read both rings directly. > AFAICS having separate avail_ring/used_ring/desc_pool is orthogonal to this cuteness. >>> Can we do better? The obvious idea is to try to get rid of last_used and >>> used, and use the ring itself. We would use an invalid entry to mark the >>> head of the ring. >>> >> Interesting! So a peer will read until it hits a wall. But how to >> update the wall atomically? >> >> Maybe we can have a flag in the descriptor indicate headness or >> tailness. Update looks ugly though: write descriptor with head flag, >> write next descriptor with head flag, remove flag from previous descriptor. >> > I was thinking a separate magic "invalid" entry. To publish an 3 descriptor > chain, you would write descriptors 2 and 3, write an invalid entry at 4, > barrier, write entry 1. It is a bit ugly, yes, but not terrible. > Worth exploring. This amortizes the indexes into the ring, a good thing. Another thing we can do is place the tail a half ring away from the head (and limit ring utilization to 50%), reducing bounces on short kicks. Or equivalently have an avail ring and used ring, but both containing tagged descriptors instead of pointers to descriptors. 
> I think that a simple simulator for this is worth writing, which tracks > cacheline moves under various fullness scenarios... > Yup.
On Thu, May 20, 2010 at 02:38:16PM +0930, Rusty Russell wrote: > On Thu, 20 May 2010 02:31:50 pm Rusty Russell wrote: > > On Wed, 19 May 2010 05:36:42 pm Avi Kivity wrote: > > > > Note that this is a exclusive->shared->exclusive bounce only, too. > > > > > > > > > > A bounce is a bounce. > > > > I tried to measure this to show that you were wrong, but I was only able > > to show that you're right. How annoying. Test code below. > > This time for sure! What do you see? On my laptop: [mst@tuck testring]$ ./rusty1 share 0 1 CPU 1: share cacheline: 2820410 usec CPU 0: share cacheline: 2823441 usec [mst@tuck testring]$ ./rusty1 unshare 0 1 CPU 0: unshare cacheline: 2783014 usec CPU 1: unshare cacheline: 2782951 usec [mst@tuck testring]$ ./rusty1 lockshare 0 1 CPU 1: lockshare cacheline: 1888495 usec CPU 0: lockshare cacheline: 1888544 usec [mst@tuck testring]$ ./rusty1 lockunshare 0 1 CPU 0: lockunshare cacheline: 1889854 usec CPU 1: lockunshare cacheline: 1889804 usec So locked version seems to be faster than unlocked, and share/unshare not to matter? same on a workstation: [root@qus19 ~]# ./rusty1 unshare 0 1 CPU 0: unshare cacheline: 6037002 usec CPU 1: unshare cacheline: 6036977 usec [root@qus19 ~]# ./rusty1 lockunshare 0 1 CPU 1: lockunshare cacheline: 5734362 usec CPU 0: lockunshare cacheline: 5734389 usec [root@qus19 ~]# ./rusty1 lockshare 0 1 CPU 1: lockshare cacheline: 5733537 usec CPU 0: lockshare cacheline: 5733564 usec using another pair of CPUs gives a more drastic results: [root@qus19 ~]# ./rusty1 lockshare 0 2 CPU 2: lockshare cacheline: 4226990 usec CPU 0: lockshare cacheline: 4227038 usec [root@qus19 ~]# ./rusty1 lockunshare 0 2 CPU 0: lockunshare cacheline: 4226707 usec CPU 2: lockunshare cacheline: 4226662 usec [root@qus19 ~]# ./rusty1 unshare 0 2 CPU 0: unshare cacheline: 14815048 usec CPU 2: unshare cacheline: 14815006 usec The share test seems to never finish on the workstation. I am debugging this.
On 05/23/2010 06:31 PM, Michael S. Tsirkin wrote: > On Thu, May 20, 2010 at 02:38:16PM +0930, Rusty Russell wrote: > >> On Thu, 20 May 2010 02:31:50 pm Rusty Russell wrote: >> >>> On Wed, 19 May 2010 05:36:42 pm Avi Kivity wrote: >>> >>>>> Note that this is a exclusive->shared->exclusive bounce only, too. >>>>> >>>>> >>>> A bounce is a bounce. >>>> >>> I tried to measure this to show that you were wrong, but I was only able >>> to show that you're right. How annoying. Test code below. >>> >> This time for sure! >> > > What do you see? > On my laptop: > [mst@tuck testring]$ ./rusty1 share 0 1 > CPU 1: share cacheline: 2820410 usec > CPU 0: share cacheline: 2823441 usec > [mst@tuck testring]$ ./rusty1 unshare 0 1 > CPU 0: unshare cacheline: 2783014 usec > CPU 1: unshare cacheline: 2782951 usec > [mst@tuck testring]$ ./rusty1 lockshare 0 1 > CPU 1: lockshare cacheline: 1888495 usec > CPU 0: lockshare cacheline: 1888544 usec > [mst@tuck testring]$ ./rusty1 lockunshare 0 1 > CPU 0: lockunshare cacheline: 1889854 usec > CPU 1: lockunshare cacheline: 1889804 usec > Ugh, can the timing be normalized per operation? This is unreadable. > So locked version seems to be faster than unlocked, > and share/unshare not to matter? > May be due to the processor using the LOCK operation as a hint to reserve the cacheline for a bit. 
> same on a workstation: > [root@qus19 ~]# ./rusty1 unshare 0 1 > CPU 0: unshare cacheline: 6037002 usec > CPU 1: unshare cacheline: 6036977 usec > [root@qus19 ~]# ./rusty1 lockunshare 0 1 > CPU 1: lockunshare cacheline: 5734362 usec > CPU 0: lockunshare cacheline: 5734389 usec > [root@qus19 ~]# ./rusty1 lockshare 0 1 > CPU 1: lockshare cacheline: 5733537 usec > CPU 0: lockshare cacheline: 5733564 usec > > using another pair of CPUs gives a more drastic > results: > > [root@qus19 ~]# ./rusty1 lockshare 0 2 > CPU 2: lockshare cacheline: 4226990 usec > CPU 0: lockshare cacheline: 4227038 usec > [root@qus19 ~]# ./rusty1 lockunshare 0 2 > CPU 0: lockunshare cacheline: 4226707 usec > CPU 2: lockunshare cacheline: 4226662 usec > [root@qus19 ~]# ./rusty1 unshare 0 2 > CPU 0: unshare cacheline: 14815048 usec > CPU 2: unshare cacheline: 14815006 usec > > That's expected. Hyperthread will be fastest (shared L1), shared L2/L3 will be slower, cross-socket will suck.
On Sun, May 23, 2010 at 06:41:33PM +0300, Avi Kivity wrote: > On 05/23/2010 06:31 PM, Michael S. Tsirkin wrote: >> On Thu, May 20, 2010 at 02:38:16PM +0930, Rusty Russell wrote: >> >>> On Thu, 20 May 2010 02:31:50 pm Rusty Russell wrote: >>> >>>> On Wed, 19 May 2010 05:36:42 pm Avi Kivity wrote: >>>> >>>>>> Note that this is a exclusive->shared->exclusive bounce only, too. >>>>>> >>>>>> >>>>> A bounce is a bounce. >>>>> >>>> I tried to measure this to show that you were wrong, but I was only able >>>> to show that you're right. How annoying. Test code below. >>>> >>> This time for sure! >>> >> >> What do you see? >> On my laptop: >> [mst@tuck testring]$ ./rusty1 share 0 1 >> CPU 1: share cacheline: 2820410 usec >> CPU 0: share cacheline: 2823441 usec >> [mst@tuck testring]$ ./rusty1 unshare 0 1 >> CPU 0: unshare cacheline: 2783014 usec >> CPU 1: unshare cacheline: 2782951 usec >> [mst@tuck testring]$ ./rusty1 lockshare 0 1 >> CPU 1: lockshare cacheline: 1888495 usec >> CPU 0: lockshare cacheline: 1888544 usec >> [mst@tuck testring]$ ./rusty1 lockunshare 0 1 >> CPU 0: lockunshare cacheline: 1889854 usec >> CPU 1: lockunshare cacheline: 1889804 usec >> > > Ugh, can the timing be normalized per operation? This is unreadable. > >> So locked version seems to be faster than unlocked, >> and share/unshare not to matter? >> > > May be due to the processor using the LOCK operation as a hint to > reserve the cacheline for a bit. Maybe we should use atomics on index then? 
>> same on a workstation: >> [root@qus19 ~]# ./rusty1 unshare 0 1 >> CPU 0: unshare cacheline: 6037002 usec >> CPU 1: unshare cacheline: 6036977 usec >> [root@qus19 ~]# ./rusty1 lockunshare 0 1 >> CPU 1: lockunshare cacheline: 5734362 usec >> CPU 0: lockunshare cacheline: 5734389 usec >> [root@qus19 ~]# ./rusty1 lockshare 0 1 >> CPU 1: lockshare cacheline: 5733537 usec >> CPU 0: lockshare cacheline: 5733564 usec >> >> using another pair of CPUs gives a more drastic >> results: >> >> [root@qus19 ~]# ./rusty1 lockshare 0 2 >> CPU 2: lockshare cacheline: 4226990 usec >> CPU 0: lockshare cacheline: 4227038 usec >> [root@qus19 ~]# ./rusty1 lockunshare 0 2 >> CPU 0: lockunshare cacheline: 4226707 usec >> CPU 2: lockunshare cacheline: 4226662 usec >> [root@qus19 ~]# ./rusty1 unshare 0 2 >> CPU 0: unshare cacheline: 14815048 usec >> CPU 2: unshare cacheline: 14815006 usec >> >> > > That's expected. Hyperthread will be fastest (shared L1), shared L2/L3 > will be slower, cross-socket will suck. OK, after adding mb in code patch will be sent separately, the test works for my workstation. locked is still fastest, unshared sometimes shows wins and sometimes loses over shared. 
[root@qus19 ~]# ./cachebounce share 0 1 CPU 0: share cacheline: 6638521 usec CPU 1: share cacheline: 6638478 usec [root@qus19 ~]# ./cachebounce unshare 0 1 CPU 0: unshare cacheline: 6037415 usec CPU 1: unshare cacheline: 6037374 usec [root@qus19 ~]# ./cachebounce lockshare 0 1 CPU 0: lockshare cacheline: 5734017 usec CPU 1: lockshare cacheline: 5733978 usec [root@qus19 ~]# ./cachebounce lockunshare 0 1 CPU 1: lockunshare cacheline: 5733260 usec CPU 0: lockunshare cacheline: 5733307 usec [root@qus19 ~]# ./cachebounce share 0 2 CPU 0: share cacheline: 14529198 usec CPU 2: share cacheline: 14529156 usec [root@qus19 ~]# ./cachebounce unshare 0 2 CPU 2: unshare cacheline: 14815328 usec CPU 0: unshare cacheline: 14815374 usec [root@qus19 ~]# ./cachebounce lockshare 0 2 CPU 0: lockshare cacheline: 4226878 usec CPU 2: lockshare cacheline: 4226842 usec [root@qus19 ~]# ./cachebounce locknushare 0 2 cachebounce: Usage: cachebounce share|unshare|lockshare|lockunshare <cpu0> <cpu1> [root@qus19 ~]# ./cachebounce lockunshare 0 2 CPU 0: lockunshare cacheline: 4227432 usec CPU 2: lockunshare cacheline: 4227375 usec
On 05/23/2010 06:51 PM, Michael S. Tsirkin wrote: >> >>> So locked version seems to be faster than unlocked, >>> and share/unshare not to matter? >>> >>> >> May be due to the processor using the LOCK operation as a hint to >> reserve the cacheline for a bit. >> > Maybe we should use atomics on index then? > This should only be helpful if you access the cacheline several times in a row. That's not the case in virtio (or here). I think the problem is that LOCKSHARE and SHARE are not symmetric, so they can't be directly compared. > OK, after adding mb in code patch will be sent separately, > the test works for my workstation. locked is still fastest, > unshared sometimes shows wins and sometimes loses over shared. > > [root@qus19 ~]# ./cachebounce share 0 1 > CPU 0: share cacheline: 6638521 usec > CPU 1: share cacheline: 6638478 usec > 66 ns? nice. > [root@qus19 ~]# ./cachebounce share 0 2 > CPU 0: share cacheline: 14529198 usec > CPU 2: share cacheline: 14529156 usec > 140 ns, not too bad. I hope I'm not misinterpreting the results.
On Sun, May 23, 2010 at 07:03:10PM +0300, Avi Kivity wrote: > On 05/23/2010 06:51 PM, Michael S. Tsirkin wrote: >>> >>>> So locked version seems to be faster than unlocked, >>>> and share/unshare not to matter? >>>> >>>> >>> May be due to the processor using the LOCK operation as a hint to >>> reserve the cacheline for a bit. >>> >> Maybe we should use atomics on index then? >> > > This should only be helpful if you access the cacheline several times in > a row. That's not the case in virtio (or here). So why does it help? > I think the problem is that LOCKSHARE and SHARE are not symmetric, so > they can't be directly compared. In what sense are they not symmetric? >> OK, after adding mb in code patch will be sent separately, >> the test works for my workstation. locked is still fastest, >> unshared sometimes shows wins and sometimes loses over shared. >> >> [root@qus19 ~]# ./cachebounce share 0 1 >> CPU 0: share cacheline: 6638521 usec >> CPU 1: share cacheline: 6638478 usec >> > > 66 ns? nice. > >> [root@qus19 ~]# ./cachebounce share 0 2 >> CPU 0: share cacheline: 14529198 usec >> CPU 2: share cacheline: 14529156 usec >> > > 140 ns, not too bad. I hope I'm not misinterpreting the results. > > -- > error compiling committee.c: too many arguments to function
On Sun, May 23, 2010 at 07:03:10PM +0300, Avi Kivity wrote: > On 05/23/2010 06:51 PM, Michael S. Tsirkin wrote: >>> >>>> So locked version seems to be faster than unlocked, >>>> and share/unshare not to matter? >>>> >>>> >>> May be due to the processor using the LOCK operation as a hint to >>> reserve the cacheline for a bit. >>> >> Maybe we should use atomics on index then? >> > > This should only be helpful if you access the cacheline several times in > a row. That's not the case in virtio (or here). > > I think the problem is that LOCKSHARE and SHARE are not symmetric, so > they can't be directly compared. > >> OK, after adding mb in code patch will be sent separately, >> the test works for my workstation. locked is still fastest, >> unshared sometimes shows wins and sometimes loses over shared. >> >> [root@qus19 ~]# ./cachebounce share 0 1 >> CPU 0: share cacheline: 6638521 usec >> CPU 1: share cacheline: 6638478 usec >> > > 66 ns? nice. > >> [root@qus19 ~]# ./cachebounce share 0 2 >> CPU 0: share cacheline: 14529198 usec >> CPU 2: share cacheline: 14529156 usec >> > > 140 ns, not too bad. I hope I'm not misinterpreting the results. > > -- > error compiling committee.c: too many arguments to function Here's another box: here the fastest option is shared, slowest unshared, lock is in the middle. 
[root@virtlab16 testring]# sh run 0 2 CPU 2: share cacheline: 3304728 usec CPU 0: share cacheline: 3304784 usec CPU 0: unshare cacheline: 6283248 usec CPU 2: unshare cacheline: 6283224 usec CPU 2: lockshare cacheline: 4018567 usec CPU 0: lockshare cacheline: 4018609 usec CPU 2: lockunshare cacheline: 4041791 usec CPU 0: lockunshare cacheline: 4041832 usec [root@virtlab16 testring]# [root@virtlab16 testring]# [root@virtlab16 testring]# [root@virtlab16 testring]# sh run 0 1 CPU 1: share cacheline: 8306326 usec CPU 0: share cacheline: 8306324 usec CPU 0: unshare cacheline: 19571697 usec CPU 1: unshare cacheline: 19571578 usec CPU 0: lockshare cacheline: 11281566 usec CPU 1: lockshare cacheline: 11281424 usec CPU 0: lockunshare cacheline: 11276093 usec CPU 1: lockunshare cacheline: 11275957 usec [root@virtlab16 testring]# sh run 0 3 CPU 0: share cacheline: 8288335 usec CPU 3: share cacheline: 8288334 usec CPU 0: unshare cacheline: 19107202 usec CPU 3: unshare cacheline: 19107139 usec CPU 0: lockshare cacheline: 11238915 usec CPU 3: lockshare cacheline: 11238848 usec CPU 3: lockunshare cacheline: 11132134 usec CPU 0: lockunshare cacheline: 11132249 usec
On 05/23/2010 07:30 PM, Michael S. Tsirkin wrote: > > >>> Maybe we should use atomics on index then? >>> >>> >> This should only be helpful if you access the cacheline several times in >> a row. That's not the case in virtio (or here). >> > So why does it help? > We actually do access the cacheline several times in a row here (but not in virtio?): > case SHARE: > while (count< MAX_BOUNCES) { > /* Spin waiting for other side to change it. */ > while (counter->cacheline1 != count); > Broadcast a read request. > count++; > counter->cacheline1 = count; > Broadcast an invalidate request. > count++; > } > break; > > case LOCKSHARE: > while (count< MAX_BOUNCES) { > /* Spin waiting for other side to change it. */ > while (__sync_val_compare_and_swap(&counter->cacheline1, count, count+1) > != count); > Broadcast a 'read for ownership' request. > count += 2; > } > break; > So RMW should certainly by faster using single-instruction RMW operations (or using prefetchw).
On Mon, May 24, 2010 at 09:37:05AM +0300, Avi Kivity wrote: > On 05/23/2010 07:30 PM, Michael S. Tsirkin wrote: >> >> >>>> Maybe we should use atomics on index then? >>>> >>>> >>> This should only be helpful if you access the cacheline several times in >>> a row. That's not the case in virtio (or here). >>> >> So why does it help? >> > > We actually do access the cacheline several times in a row here (but not > in virtio?): > >> case SHARE: >> while (count< MAX_BOUNCES) { >> /* Spin waiting for other side to change it. */ >> while (counter->cacheline1 != count); >> > > Broadcast a read request. > >> count++; >> counter->cacheline1 = count; >> > > Broadcast an invalidate request. > >> count++; >> } >> break; >> >> case LOCKSHARE: >> while (count< MAX_BOUNCES) { >> /* Spin waiting for other side to change it. */ >> while (__sync_val_compare_and_swap(&counter->cacheline1, count, count+1) >> != count); >> > > Broadcast a 'read for ownership' request. > >> count += 2; >> } >> break; >> > > So RMW should certainly by faster using single-instruction RMW > operations (or using prefetchw). Okay, but why is lockunshare faster than unshare? > -- > Do not meddle in the internals of kernels, for they are subtle and quick to panic.
On 05/24/2010 11:05 AM, Michael S. Tsirkin wrote: > > Okay, but why is lockunshare faster than unshare? > > No idea.
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 1ca8890..7729aba 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -89,9 +89,6 @@ struct vring_virtqueue /* Number we've added since last sync. */ unsigned int num_added; - /* Last used index we've seen. */ - u16 last_used_idx; - /* How to notify other side. FIXME: commonalize hcalls! */ void (*notify)(struct virtqueue *vq); @@ -285,12 +282,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) static inline bool more_used(const struct vring_virtqueue *vq) { - return vq->last_used_idx != vq->vring.used->idx; + return *vq->vring.last_used_idx != vq->vring.used->idx; } void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) { struct vring_virtqueue *vq = to_vvq(_vq); + struct vring_used_elem *u; void *ret; unsigned int i; @@ -307,12 +305,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) return NULL; } - /* Only get used array entries after they have been exposed by host. */ - virtio_rmb(); - - i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; - *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; + /* Only get used array entries after they have been exposed by host. + * Need mb(), not just rmb() because we write last_used_idx below. */ + virtio_mb(); + u = &vq->vring.used->ring[*vq->vring.last_used_idx % vq->vring.num]; + i = u->id; + *len = u->len; if (unlikely(i >= vq->vring.num)) { BAD_RING(vq, "id %u out of range\n", i); return NULL; @@ -325,7 +324,8 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) /* detach_buf clears data, so grab it now. 
*/ ret = vq->data[i]; detach_buf(vq, i); - vq->last_used_idx++; + (*vq->vring.last_used_idx)++; + END_USE(vq); return ret; } @@ -431,7 +431,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, vq->vq.name = name; vq->notify = notify; vq->broken = false; - vq->last_used_idx = 0; + *vq->vring.last_used_idx = 0; vq->num_added = 0; list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG @@ -440,6 +440,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); + /* We publish used index whether Host offers it or not: if not, it's + * junk space anyway. But calling this acknowledges the feature. */ + virtio_has_feature(vdev, VIRTIO_RING_F_PUBLISH_USED); + /* No callback? Tell other side not to bother us. */ if (!callback) vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; @@ -473,6 +477,8 @@ void vring_transport_features(struct virtio_device *vdev) switch (i) { case VIRTIO_RING_F_INDIRECT_DESC: break; + case VIRTIO_RING_F_PUBLISH_USED: + break; default: /* We don't understand this bit. */ clear_bit(i, vdev->features); diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index e4d144b..9d01de9 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -29,6 +29,9 @@ /* We support indirect buffer descriptors */ #define VIRTIO_RING_F_INDIRECT_DESC 28 +/* The Guest publishes last-seen used index at the end of the avail ring. */ +#define VIRTIO_RING_F_PUBLISH_USED 29 + /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ struct vring_desc { /* Address (guest-physical). */ @@ -69,6 +72,8 @@ struct vring { struct vring_avail *avail; struct vring_used *used; + /* Last used index seen by the Guest. 
*/ + __u16 *last_used_idx; }; /* The standard layout for the ring is a continuous chunk of memory which looks @@ -83,6 +88,7 @@ struct vring { * __u16 avail_flags; * __u16 avail_idx; * __u16 available[num]; + * __u16 last_used_idx; * * // Padding to the next align boundary. * char pad[]; @@ -101,11 +107,17 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, vr->avail = p + num*sizeof(struct vring_desc); vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + align-1) & ~(align - 1)); + /* We publish the last-seen used index at the end of the available ring. + * It is at the end for backwards compatibility. */ + vr->last_used_idx = &(vr)->avail->ring[num]; + /* Verify that last used index does not spill over the used ring. */ + BUG_ON((void *)vr->last_used_idx + + sizeof *vr->last_used_idx > (void *)vr->used); } static inline unsigned vring_size(unsigned int num, unsigned long align) { - return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) + return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num) + align - 1) & ~(align - 1)) + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num; }