
[net-next,v7,9/9] xen-netback: Aggregate TX unmap operations

Message ID 1394142511-14827-10-git-send-email-zoltan.kiss@citrix.com
State Accepted, archived
Delegated to: David Miller

Commit Message

Zoltan Kiss March 6, 2014, 9:48 p.m. UTC
Unmapping causes TLB flushing, therefore we should do it in the largest
possible batches. However, we shouldn't starve the guest for too long. So if
the guest has space for at least two big packets and we don't have at least a
quarter of the ring to unmap, delay it for at most 1 millisecond.
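
In concrete numbers, that condition boils down to the following standalone
sketch (not the patch's literal code; the constant values are assumptions
matching the driver of this era, where MAX_PENDING_REQS is 256, so a quarter
of the dealloc ring is 64 entries, and XEN_NETBK_LEGACY_SLOTS_MAX is 18, so
"two big packets" means more than 36 free TX ring slots):

#include <stdbool.h>

#define MAX_PENDING_REQS		256	/* assumed driver value */
#define XEN_NETBK_LEGACY_SLOTS_MAX	18	/* assumed driver value */

/* Return true if the unmap batch should be allowed to keep growing. */
static bool delay_unmap(unsigned int free_tx_slots,
			unsigned int pending_unmaps,
			bool grace_timer_fired)
{
	return free_tx_slots > 2 * XEN_NETBK_LEGACY_SLOTS_MAX && /* guest not near stalling */
	       pending_unmaps < MAX_PENDING_REQS / 4 &&          /* batch still small */
	       !grace_timer_fired;                               /* deferred < 1 ms so far */
}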

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
v4:
- use bool for tx_dealloc_work_todo

v6:
- rebase tx_dealloc_work_todo due to missing ;


Comments

Zoltan Kiss March 19, 2014, 9:16 p.m. UTC | #1
Hi,

I'm thinking about revoking this patch: its value is pretty small, but
it causes a performance regression on Win7 guests, and it is probably not
the best solution for this problem. It might be that the delay it takes
for the dealloc thread to be scheduled is enough.
What do you think?

Zoli

On 06/03/14 21:48, Zoltan Kiss wrote:
> Unmapping causes TLB flushing, therefore we should do it in the largest
> possible batches. However, we shouldn't starve the guest for too long. So if
> the guest has space for at least two big packets and we don't have at least a
> quarter of the ring to unmap, delay it for at most 1 millisecond.
>
> Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
> ---
> v4:
> - use bool for tx_dealloc_work_todo
>
> v6:
> - rebase tx_dealloc_work_todo due to missing ;
>
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
> index d1cd8ce..95498c8 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -118,6 +118,8 @@ struct xenvif {
>   	u16 dealloc_ring[MAX_PENDING_REQS];
>   	struct task_struct *dealloc_task;
>   	wait_queue_head_t dealloc_wq;
> +	struct timer_list dealloc_delay;
> +	bool dealloc_delay_timed_out;
>
>   	/* Use kthread for guest RX */
>   	struct task_struct *task;
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index 40aa500..f925af5 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -407,6 +407,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>   			  .desc = i };
>   		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
>   	}
> +	init_timer(&vif->dealloc_delay);
>
>   	/*
>   	 * Initialise a dummy MAC address. We choose the numerically
> @@ -557,6 +558,7 @@ void xenvif_disconnect(struct xenvif *vif)
>   	}
>
>   	if (vif->dealloc_task) {
> +		del_timer_sync(&vif->dealloc_delay);
>   		kthread_stop(vif->dealloc_task);
>   		vif->dealloc_task = NULL;
>   	}
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index bb65c7c..c098276 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -135,6 +135,11 @@ static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
>   		vif->pending_prod + vif->pending_cons;
>   }
>
> +static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
> +{
> +	return ring->nr_ents -	(ring->sring->req_prod - ring->rsp_prod_pvt);
> +}
> +
>   bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
>   {
>   	RING_IDX prod, cons;
> @@ -1932,9 +1937,36 @@ static inline int tx_work_todo(struct xenvif *vif)
>   	return 0;
>   }
>
> +static void xenvif_dealloc_delay(unsigned long data)
> +{
> +	struct xenvif *vif = (struct xenvif *)data;
> +
> +	vif->dealloc_delay_timed_out = true;
> +	wake_up(&vif->dealloc_wq);
> +}
> +
>   static inline bool tx_dealloc_work_todo(struct xenvif *vif)
>   {
> -	return vif->dealloc_cons != vif->dealloc_prod;
> +	if (vif->dealloc_cons != vif->dealloc_prod) {
> +		if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
> +		    (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
> +		    !vif->dealloc_delay_timed_out) {
> +			if (!timer_pending(&vif->dealloc_delay)) {
> +				vif->dealloc_delay.function =
> +					xenvif_dealloc_delay;
> +				vif->dealloc_delay.data = (unsigned long)vif;
> +				mod_timer(&vif->dealloc_delay,
> +					  jiffies + msecs_to_jiffies(1));
> +
> +			}
> +			return false;
> +		}
> +		del_timer_sync(&vif->dealloc_delay);
> +		vif->dealloc_delay_timed_out = false;
> +		return true;
> +	}
> +
> +	return false;
>   }
>
>   void xenvif_unmap_frontend_rings(struct xenvif *vif)
>

Paul Durrant March 20, 2014, 9:53 a.m. UTC | #2
> -----Original Message-----
> From: Zoltan Kiss
> Sent: 19 March 2014 21:16
> To: Ian Campbell; Wei Liu; xen-devel@lists.xenproject.org
> Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Jonathan Davies;
> Paul Durrant
> Subject: Re: [PATCH net-next v7 9/9] xen-netback: Aggregate TX unmap
> operations
> 
> Hi,
> 
> I'm thinking about revoking this patch: its value is pretty small, but
> it causes a performance regression on Win7 guests, and it is probably not
> the best solution for this problem. It might be that the delay it takes
> for the dealloc thread to be scheduled is enough.
> What do you think?
> 

Yes, I think we need a revert to fix the performance regression. As I understand things, it's sufficiently bad that we would not want to take the grant mapping series into XenServer without the reversion.

  Paul

> Zoli
> 
> On 06/03/14 21:48, Zoltan Kiss wrote:
> > Unmapping causes TLB flushing, therefore we should do it in the largest
> > possible batches. However, we shouldn't starve the guest for too long. So if
> > the guest has space for at least two big packets and we don't have at least a
> > quarter of the ring to unmap, delay it for at most 1 millisecond.
> >
> > Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
> > ---
> > v4:
> > - use bool for tx_dealloc_work_todo
> >
> > v6:
> > - rebase tx_dealloc_work_todo due to missing ;
> >
> > diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
> > index d1cd8ce..95498c8 100644
> > --- a/drivers/net/xen-netback/common.h
> > +++ b/drivers/net/xen-netback/common.h
> > @@ -118,6 +118,8 @@ struct xenvif {
> >   	u16 dealloc_ring[MAX_PENDING_REQS];
> >   	struct task_struct *dealloc_task;
> >   	wait_queue_head_t dealloc_wq;
> > +	struct timer_list dealloc_delay;
> > +	bool dealloc_delay_timed_out;
> >
> >   	/* Use kthread for guest RX */
> >   	struct task_struct *task;
> > diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> > index 40aa500..f925af5 100644
> > --- a/drivers/net/xen-netback/interface.c
> > +++ b/drivers/net/xen-netback/interface.c
> > @@ -407,6 +407,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
> >   			  .desc = i };
> >   		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
> >   	}
> > +	init_timer(&vif->dealloc_delay);
> >
> >   	/*
> >   	 * Initialise a dummy MAC address. We choose the numerically
> > @@ -557,6 +558,7 @@ void xenvif_disconnect(struct xenvif *vif)
> >   	}
> >
> >   	if (vif->dealloc_task) {
> > +		del_timer_sync(&vif->dealloc_delay);
> >   		kthread_stop(vif->dealloc_task);
> >   		vif->dealloc_task = NULL;
> >   	}
> > diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> > index bb65c7c..c098276 100644
> > --- a/drivers/net/xen-netback/netback.c
> > +++ b/drivers/net/xen-netback/netback.c
> > @@ -135,6 +135,11 @@ static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
> >   		vif->pending_prod + vif->pending_cons;
> >   }
> >
> > +static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
> > +{
> > +	return ring->nr_ents -	(ring->sring->req_prod - ring->rsp_prod_pvt);
> > +}
> > +
> >   bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
> >   {
> >   	RING_IDX prod, cons;
> > @@ -1932,9 +1937,36 @@ static inline int tx_work_todo(struct xenvif *vif)
> >   	return 0;
> >   }
> >
> > +static void xenvif_dealloc_delay(unsigned long data)
> > +{
> > +	struct xenvif *vif = (struct xenvif *)data;
> > +
> > +	vif->dealloc_delay_timed_out = true;
> > +	wake_up(&vif->dealloc_wq);
> > +}
> > +
> >   static inline bool tx_dealloc_work_todo(struct xenvif *vif)
> >   {
> > -	return vif->dealloc_cons != vif->dealloc_prod;
> > +	if (vif->dealloc_cons != vif->dealloc_prod) {
> > +		if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
> > +		    (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
> > +		    !vif->dealloc_delay_timed_out) {
> > +			if (!timer_pending(&vif->dealloc_delay)) {
> > +				vif->dealloc_delay.function =
> > +					xenvif_dealloc_delay;
> > +				vif->dealloc_delay.data = (unsigned long)vif;
> > +				mod_timer(&vif->dealloc_delay,
> > +					  jiffies + msecs_to_jiffies(1));
> > +
> > +			}
> > +			return false;
> > +		}
> > +		del_timer_sync(&vif->dealloc_delay);
> > +		vif->dealloc_delay_timed_out = false;
> > +		return true;
> > +	}
> > +
> > +	return false;
> >   }
> >
> >   void xenvif_unmap_frontend_rings(struct xenvif *vif)
> >

Wei Liu March 20, 2014, 10:48 a.m. UTC | #3
On Wed, Mar 19, 2014 at 09:16:05PM +0000, Zoltan Kiss wrote:
> Hi,
> 
> I'm thinking about revoking this patch: its value is pretty small,
> but it causes a performance regression on Win7 guests, and it is
> probably not the best solution for this problem. It might be that the
> delay it takes for the dealloc thread to be scheduled is enough.
> What do you think?
> 

Can you elaborate? What makes Win7 so special? What's the performance
impact on other guests?

Wei.
Paul Durrant March 20, 2014, 11:14 a.m. UTC | #4
> -----Original Message-----
> From: Wei Liu [mailto:wei.liu2@citrix.com]
> Sent: 20 March 2014 10:49
> To: Zoltan Kiss
> Cc: Ian Campbell; Wei Liu; xen-devel@lists.xenproject.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Jonathan Davies;
> Paul Durrant
> Subject: Re: [PATCH net-next v7 9/9] xen-netback: Aggregate TX unmap
> operations
> 
> On Wed, Mar 19, 2014 at 09:16:05PM +0000, Zoltan Kiss wrote:
> > Hi,
> >
> > I'm thinking about revoking this patch: its value is pretty small,
> > but it causes a performance regression on Win7 guests, and it is
> > probably not the best solution for this problem. It might be that the
> > delay it takes for the dealloc thread to be scheduled is enough.
> > What do you think?
> >
> 
> Can you elaborate? What makes Win7 so special? What's the performance
> impact on other guests?
> 

It won't be Win7 specifically, I expect. It will likely be any version of Windows, or any other OS that limits the TXs-in-flight so aggressively. Basically you need to TX-complete reasonably frequently, otherwise your throughput drops off a lot. IIRC at Solarflare we found every ~500us to be just about frequent enough for hitting 10G.
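
For scale, a back-of-envelope bound (illustrative numbers, not measurements): a guest that caps in-flight TX at 64 packets of 1500 bytes and only sees completions once per millisecond is limited to 64 * 1500 * 8 / 0.001 s = 768 Mbit/s, while the same window completed every 100us would allow ~7.7 Gbit/s. A fixed 1ms completion deferral can therefore dominate throughput for any guest with a small TX window.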

  Paul
Wei Liu March 20, 2014, 12:38 p.m. UTC | #5
On Thu, Mar 20, 2014 at 11:14:51AM +0000, Paul Durrant wrote:
> > -----Original Message-----
> > From: Wei Liu [mailto:wei.liu2@citrix.com]
> > Sent: 20 March 2014 10:49
> > To: Zoltan Kiss
> > Cc: Ian Campbell; Wei Liu; xen-devel@lists.xenproject.org;
> > netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Jonathan Davies;
> > Paul Durrant
> > Subject: Re: [PATCH net-next v7 9/9] xen-netback: Aggregate TX unmap
> > operations
> > 
> > On Wed, Mar 19, 2014 at 09:16:05PM +0000, Zoltan Kiss wrote:
> > > Hi,
> > >
> > > I'm thinking about revoking this patch: its value is pretty small,
> > > but it causes a performance regression on Win7 guests, and it is
> > > probably not the best solution for this problem. It might be that the
> > > delay it takes for the dealloc thread to be scheduled is enough.
> > > What do you think?
> > >
> > 
> > Can you elaborate? What makes Win7 so special? What's the performance
> > impact on other guests?
> > 
> 
> It won't be Win7 specifically, I expect. It will likely be any version
> of Windows, or any other OS that limits the TXs-in-flight so
> aggressively. Basically you need to TX-complete reasonably frequently,
> otherwise your throughput drops off a lot. IIRC at Solarflare we found
> every ~500us to be just about frequent enough for hitting 10G.

Thanks for the explanation.

Reverting this change basically means that when to flush the TLB is at the
sole discretion of the Linux kernel scheduler. I don't oppose that, but it
would be better to provide some numbers.

Wei.

> 
>   Paul
Zoltan Kiss March 20, 2014, 4:11 p.m. UTC | #6
On 20/03/14 12:38, Wei Liu wrote:
> On Thu, Mar 20, 2014 at 11:14:51AM +0000, Paul Durrant wrote:
>>> -----Original Message-----
>>> From: Wei Liu [mailto:wei.liu2@citrix.com]
>>> Sent: 20 March 2014 10:49
>>> To: Zoltan Kiss
>>> Cc: Ian Campbell; Wei Liu; xen-devel@lists.xenproject.org;
>>> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Jonathan Davies;
>>> Paul Durrant
>>> Subject: Re: [PATCH net-next v7 9/9] xen-netback: Aggregate TX unmap
>>> operations
>>>
>>> On Wed, Mar 19, 2014 at 09:16:05PM +0000, Zoltan Kiss wrote:
>>>> Hi,
>>>>
>>>> I'm thinking about revoking this patch: its value is pretty small,
>>>> but it causes a performance regression on Win7 guests, and it is
>>>> probably not the best solution for this problem. It might be that the
>>>> delay it takes for the dealloc thread to be scheduled is enough.
>>>> What do you think?
>>>>
>>>
>>> Can you elaborate? What makes Win7 so special? What's the performance
>>> impact on other guests?
>>>
>>
>> It won't be Win7 specifically, I expect. It will likely be any version
>> of Windows, or any other OS that limits the TXs-in-flight so
>> aggressively. Basically you need to TX-complete reasonably frequently,
>> otherwise your throughput drops off a lot. IIRC at Solarflare we found
>> every ~500us to be just about frequent enough for hitting 10G.
>
> Thanks for the explanation.
>
> Reverting this change basically means that when to flush the TLB is at
> the sole discretion of the Linux kernel scheduler. I don't oppose that,
> but it would be better to provide some numbers.

My comparisons with iperf haven't shown any significant difference; I've
measured both Win7 and upstream Linux guests.
There was a misunderstanding that reverting this patch would stop the
batching of unmaps. There would still be batching: when the callback
wakes the dealloc thread, other callbacks can still place work on the
dealloc ring by the time it actually runs, and even while the thread is
processing it. And that can happen independently of the TX operations
in the NAPI instance, which is another good feature of having a
dealloc thread.
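
Condensed, the loop in question looks roughly like this (a simplified
sketch of the dealloc kthread from this series; the real wait condition
is tx_dealloc_work_todo() and error handling is omitted):

	while (!kthread_should_stop()) {
		wait_event_interruptible(vif->dealloc_wq,
					 vif->dealloc_cons != vif->dealloc_prod ||
					 kthread_should_stop());
		if (kthread_should_stop())
			break;
		/* Everything queued on the dealloc ring since the last
		 * pass is unmapped in one batch, i.e. one TLB flush. */
		xenvif_tx_dealloc_action(vif);
		cond_resched();
	}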

I've discussed this in person with Paul and Ian as well, and they are happy
with the revert. So David, can you please revert e9275f5e2d
("xen-netback: Aggregate TX unmap operations")?

Regards,

Zoli

Patch

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index d1cd8ce..95498c8 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -118,6 +118,8 @@  struct xenvif {
 	u16 dealloc_ring[MAX_PENDING_REQS];
 	struct task_struct *dealloc_task;
 	wait_queue_head_t dealloc_wq;
+	struct timer_list dealloc_delay;
+	bool dealloc_delay_timed_out;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 40aa500..f925af5 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -407,6 +407,7 @@  struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 			  .desc = i };
 		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
 	}
+	init_timer(&vif->dealloc_delay);
 
 	/*
 	 * Initialise a dummy MAC address. We choose the numerically
@@ -557,6 +558,7 @@  void xenvif_disconnect(struct xenvif *vif)
 	}
 
 	if (vif->dealloc_task) {
+		del_timer_sync(&vif->dealloc_delay);
 		kthread_stop(vif->dealloc_task);
 		vif->dealloc_task = NULL;
 	}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index bb65c7c..c098276 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -135,6 +135,11 @@  static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
 		vif->pending_prod + vif->pending_cons;
 }
 
+static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
+{
+	return ring->nr_ents -	(ring->sring->req_prod - ring->rsp_prod_pvt);
+}
+
 bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
 {
 	RING_IDX prod, cons;
@@ -1932,9 +1937,36 @@  static inline int tx_work_todo(struct xenvif *vif)
 	return 0;
 }
 
+static void xenvif_dealloc_delay(unsigned long data)
+{
+	struct xenvif *vif = (struct xenvif *)data;
+
+	vif->dealloc_delay_timed_out = true;
+	wake_up(&vif->dealloc_wq);
+}
+
 static inline bool tx_dealloc_work_todo(struct xenvif *vif)
 {
-	return vif->dealloc_cons != vif->dealloc_prod;
+	if (vif->dealloc_cons != vif->dealloc_prod) {
+		if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
+		    (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
+		    !vif->dealloc_delay_timed_out) {
+			if (!timer_pending(&vif->dealloc_delay)) {
+				vif->dealloc_delay.function =
+					xenvif_dealloc_delay;
+				vif->dealloc_delay.data = (unsigned long)vif;
+				mod_timer(&vif->dealloc_delay,
+					  jiffies + msecs_to_jiffies(1));
+
+			}
+			return false;
+		}
+		del_timer_sync(&vif->dealloc_delay);
+		vif->dealloc_delay_timed_out = false;
+		return true;
+	}
+
+	return false;
 }
 
 void xenvif_unmap_frontend_rings(struct xenvif *vif)
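
A historical footnote on the timer API: the explicit .function/.data
assignment and the unsigned-long callback argument are the timer interface
of this era. On kernels from 4.15 onward the equivalent wiring would look
roughly like the sketch below (an illustration only; the patch itself
predates timer_setup()):

	/* Sketch: the same callback under the post-4.15 timer API. */
	static void xenvif_dealloc_delay(struct timer_list *t)
	{
		struct xenvif *vif = from_timer(vif, t, dealloc_delay);

		vif->dealloc_delay_timed_out = true;
		wake_up(&vif->dealloc_wq);
	}

	/* and in xenvif_alloc(), replacing init_timer(): */
	timer_setup(&vif->dealloc_delay, xenvif_dealloc_delay, 0);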