diff mbox

net: increase skb->users instead of skb_clone()

Message ID 1292479045-3136-1-git-send-email-xiaosuo@gmail.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Changli Gao Dec. 16, 2010, 5:57 a.m. UTC
In dev_queue_xmit_nit(), we have to clone skbs as we need to mangle skbs,
however, we don't need to clone skbs for all the packet_types.

Except for the first packet_type, we increase skb->users instead of
skb_clone().

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 net/core/dev.c |   30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric Dumazet Dec. 16, 2010, 7:18 a.m. UTC | #1
Le jeudi 16 décembre 2010 à 13:57 +0800, Changli Gao a écrit :
> In dev_queue_xmit_nit(), we have to clone skbs as we need to mangle skbs,
> however, we don't need to clone skbs for all the packet_types.
> 
> Except for the first packet_type, we increase skb->users instead of
> skb_clone().
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ---
>  net/core/dev.c |   30 ++++++++++++++++++++----------
>  1 file changed, 20 insertions(+), 10 deletions(-)
> diff --git a/net/core/dev.c b/net/core/dev.c
> index bf5ced5..888cb74 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1496,6 +1496,14 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
>  }
>  EXPORT_SYMBOL_GPL(dev_forward_skb);
>  
> +static inline int deliver_skb(struct sk_buff *skb,
> +			      struct packet_type *pt_prev,
> +			      struct net_device *orig_dev)
> +{
> +	atomic_inc(&skb->users);
> +	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> +}
> +
>  /*
>   *	Support routine. Sends outgoing frames to any network
>   *	taps currently in use.
> @@ -1504,6 +1512,8 @@ EXPORT_SYMBOL_GPL(dev_forward_skb);
>  static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct packet_type *ptype;
> +	struct sk_buff *skb2 = NULL;
> +	struct packet_type *pt_prev = NULL;
>  
>  #ifdef CONFIG_NET_CLS_ACT
>  	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
> @@ -1520,7 +1530,13 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
>  		if ((ptype->dev == dev || !ptype->dev) &&
>  		    (ptype->af_packet_priv == NULL ||
>  		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
> -			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
> +			if (pt_prev) {
> +				deliver_skb(skb2, pt_prev, skb->dev);
> +				pt_prev = ptype;
> +				continue;
> +			}
> +
> +			skb2 = skb_clone(skb, GFP_ATOMIC);
>  			if (!skb2)
>  				break;
>  
> @@ -1542,9 +1558,11 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
>  
>  			skb2->transport_header = skb2->network_header;
>  			skb2->pkt_type = PACKET_OUTGOING;
> -			ptype->func(skb2, skb->dev, ptype, skb->dev);
> +			pt_prev = ptype;
>  		}
>  	}
> +	if (pt_prev)
> +		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
>  	rcu_read_unlock();
>  }
>  
> @@ -2788,14 +2806,6 @@ static void net_tx_action(struct softirq_action *h)
>  	}
>  }
>  
> -static inline int deliver_skb(struct sk_buff *skb,
> -			      struct packet_type *pt_prev,
> -			      struct net_device *orig_dev)
> -{
> -	atomic_inc(&skb->users);
> -	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> -}
> -
>  #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
>      (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
>  /* This hook is defined here for ATM LANE */

You beat me, but I was thinking of a different way, adding a new
pt_prev->xmit_func(), handling all the details (no need for atomic ops
on skb users if packet is not delivered at all).

By the way, your patch is not 100% safe/OK, because af_packet rcv()
handler writes on skb (skb_pull() and all)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Dec. 16, 2010, 7:23 a.m. UTC | #2
On Thu, Dec 16, 2010 at 3:18 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> You beat me, but I was thinking of a different way, adding a new
> pt_prev->xmit_func(), handling all the details (no need for atomic ops
> on skb users if packet is not delivered at all).
>
> By the way, your patch is not 100% safe/OK, because af_packet rcv()
> handler writes on skb (skb_pull() and all)
>

But af_packet_rcv() restores skbs at last.

        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
Eric Dumazet Dec. 16, 2010, 7:50 a.m. UTC | #3
Le jeudi 16 décembre 2010 à 15:23 +0800, Changli Gao a écrit :
> On Thu, Dec 16, 2010 at 3:18 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > You beat me, but I was thinking of a different way, adding a new
> > pt_prev->xmit_func(), handling all the details (no need for atomic ops
> > on skb users if packet is not delivered at all).
> >
> > By the way, your patch is not 100% safe/OK, because af_packet rcv()
> > handler writes on skb (skb_pull() and all)
> >
> 
> But af_packet_rcv() restores skbs at last.
> 
>         if (skb_head != skb->data && skb_shared(skb)) {
>                 skb->data = skb_head;
>                 skb->len = skb_len;
>         }
> 

Thats right, your patch seems fine, thanks !

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Junchang Wang Dec. 16, 2010, 2:05 p.m. UTC | #4
On Thu, Dec 16, 2010 at 1:57 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> In dev_queue_xmit_nit(), we have to clone skbs as we need to mangle skbs,
> however, we don't need to clone skbs for all the packet_types.
>
> Except for the first packet_type, we increase skb->users instead of
> skb_clone().

Hi Changli,
Take af_packet for example, I can't see benefit from this patch.

> +static inline int deliver_skb(struct sk_buff *skb,
> +                             struct packet_type *pt_prev,
> +                             struct net_device *orig_dev)
> +{
> +       atomic_inc(&skb->users);
> +       return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> +}
The increment call will incur skb_shared() failure in packet_rcv.
In reality, packet_rcv has to clone this packet by itself.


Thanks
Changli Gao Dec. 16, 2010, 2:12 p.m. UTC | #5
On Thu, Dec 16, 2010 at 10:05 PM, Junchang Wang <junchangwang@gmail.com> wrote:
>
> Hi Changli,
> Take af_packet for example, I can't see benefit from this patch.
>
>> +static inline int deliver_skb(struct sk_buff *skb,
>> +                             struct packet_type *pt_prev,
>> +                             struct net_device *orig_dev)
>> +{
>> +       atomic_inc(&skb->users);
>> +       return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>> +}
> The increment call will incur skb_shared() failure in packet_rcv.
> In reality, packet_rcv has to clone this packet by itself.
>
>

This happens when run_filter returns non zero. For your case, only
small parts of packets match bpf filter.
Eric Dumazet Dec. 16, 2010, 2:18 p.m. UTC | #6
Le jeudi 16 décembre 2010 à 22:05 +0800, Junchang Wang a écrit :
> On Thu, Dec 16, 2010 at 1:57 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> > In dev_queue_xmit_nit(), we have to clone skbs as we need to mangle skbs,
> > however, we don't need to clone skbs for all the packet_types.
> >
> > Except for the first packet_type, we increase skb->users instead of
> > skb_clone().
> 
> Hi Changli,
> Take af_packet for example, I can't see benefit from this patch.
> 
> > +static inline int deliver_skb(struct sk_buff *skb,
> > +                             struct packet_type *pt_prev,
> > +                             struct net_device *orig_dev)
> > +{
> > +       atomic_inc(&skb->users);
> > +       return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> > +}
> The increment call will incur skb_shared() failure in packet_rcv.
> In reality, packet_rcv has to clone this packet by itself.
> 

Yes, and no.

Consider the case you have one receiver.

Packet given after Changli patch wont be shared, so packet_rcv wont
clone it : Thats a win. Only one skb_clone() done instead of two.

Consider case with 2 receivers :

First time we call packet_rcv, packet is shared (because we call
deliver_skb(), so packet_rcv clones it. Normal situation, we really need
to clone it.

Second time, we give a non shared packet : Thats a win over previous
situation.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Junchang Wang Dec. 16, 2010, 2:20 p.m. UTC | #7
On Thu, Dec 16, 2010 at 3:23 PM, Changli Gao <xiaosuo@gmail.com> wrote:

>> You beat me, but I was thinking of a different way, adding a new
>> pt_prev->xmit_func(), handling all the details (no need for atomic ops
>> on skb users if packet is not delivered at all).
>>
>> By the way, your patch is not 100% safe/OK, because af_packet rcv()
>> handler writes on skb (skb_pull() and all)
>>
>
> But af_packet_rcv() restores skbs at last.
>
>        if (skb_head != skb->data && skb_shared(skb)) {
>                skb->data = skb_head;
>                skb->len = skb_len;
>        }
>
If af packet_rcv invokes skb_clone, this skb is differ from the original one.
Eric's warning is right.
Eric Dumazet Dec. 16, 2010, 2:30 p.m. UTC | #8
Le jeudi 16 décembre 2010 à 22:20 +0800, Junchang Wang a écrit :
> On Thu, Dec 16, 2010 at 3:23 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> 
> >> You beat me, but I was thinking of a different way, adding a new
> >> pt_prev->xmit_func(), handling all the details (no need for atomic ops
> >> on skb users if packet is not delivered at all).
> >>
> >> By the way, your patch is not 100% safe/OK, because af_packet rcv()
> >> handler writes on skb (skb_pull() and all)
> >>
> >
> > But af_packet_rcv() restores skbs at last.
> >
> >        if (skb_head != skb->data && skb_shared(skb)) {
> >                skb->data = skb_head;
> >                skb->len = skb_len;
> >        }
> >
> If af packet_rcv invokes skb_clone, this skb is differ from the original one.
> Eric's warning is right.

It was a false alarm.

If packet_rcv() invokes skb_clone(), skb still points to original skb.
No worry.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Junchang Wang Dec. 16, 2010, 2:31 p.m. UTC | #9
On Thu, Dec 16, 2010 at 10:18 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Yes, and no.
>
> Consider the case you have one receiver.
>
> Packet given after Changli patch wont be shared, so packet_rcv wont
> clone it : Thats a win. Only one skb_clone() done instead of two.
>
> Consider case with 2 receivers :
>
> First time we call packet_rcv, packet is shared (because we call
> deliver_skb(), so packet_rcv clones it. Normal situation, we really need
> to clone it.

Got it. Thanks.

>
> Second time, we give a non shared packet : Thats a win over previous
> situation.
>
But, if we have N receivers, we get only the last one win - the first N-1 will
call deliver_skb().


Thanks.
Junchang Wang Dec. 16, 2010, 2:36 p.m. UTC | #10
On Thu, Dec 16, 2010 at 10:12 PM, Changli Gao <xiaosuo@gmail.com> wrote:

>
> This happens when run_filter returns non zero. For your case, only
> small parts of packets match bpf filter.
Hi Changli,
In most cases, I want user-space applications see everything. :)

Thanks.
Eric Dumazet Dec. 16, 2010, 2:41 p.m. UTC | #11
Le jeudi 16 décembre 2010 à 22:31 +0800, Junchang Wang a écrit :
> On Thu, Dec 16, 2010 at 10:18 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > Yes, and no.
> >
> > Consider the case you have one receiver.
> >
> > Packet given after Changli patch wont be shared, so packet_rcv wont
> > clone it : Thats a win. Only one skb_clone() done instead of two.
> >
> > Consider case with 2 receivers :
> >
> > First time we call packet_rcv, packet is shared (because we call
> > deliver_skb(), so packet_rcv clones it. Normal situation, we really need
> > to clone it.
> 
> Got it. Thanks.
> 
> >
> > Second time, we give a non shared packet : Thats a win over previous
> > situation.
> >
> But, if we have N receivers, we get only the last one win - the first N-1 will
> call deliver_skb().
> 

Yes, but you want to, because each receiver has to make a private copy
of the skb.

The big win is that if packet if filtered out (not accepted by the
socket filter), you end with no extra skb_clone() at all.

Say you have 8 receivers, with a filter matching some hash/cpu, and only
one af_packet socket will take the message.

Before patch : 8 skb_clones()

After patch : one skb_clone()

If I undertood patch intent ;)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Dec. 16, 2010, 2:43 p.m. UTC | #12
On Thu, Dec 16, 2010 at 10:41 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Yes, but you want to, because each receiver has to make a private copy
> of the skb.
>
> The big win is that if packet if filtered out (not accepted by the
> socket filter), you end with no extra skb_clone() at all.
>
> Say you have 8 receivers, with a filter matching some hash/cpu, and only
> one af_packet socket will take the message.
>
> Before patch : 8 skb_clones()
>
> After patch : one skb_clone()
>
> If I undertood patch intent ;)
>

Yes, you got it. :)
Junchang Wang Dec. 17, 2010, 12:24 a.m. UTC | #13
On Thu, Dec 16, 2010 at 10:41 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> But, if we have N receivers, we get only the last one win - the first N-1 will
>> call deliver_skb().
>>
>
> Yes, but you want to, because each receiver has to make a private copy
> of the skb.
>
> The big win is that if packet if filtered out (not accepted by the
> socket filter), you end with no extra skb_clone() at all.
>
> Say you have 8 receivers, with a filter matching some hash/cpu, and only
> one af_packet socket will take the message.
>
> Before patch : 8 skb_clones()
>
> After patch : one skb_clone()
>
Now I understand. :)
The patch is fine. Thanks Changli and Eric.
David Miller Dec. 20, 2010, 5:50 a.m. UTC | #14
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 16 Dec 2010 13:57:25 +0800

> In dev_queue_xmit_nit(), we have to clone skbs as we need to mangle skbs,
> however, we don't need to clone skbs for all the packet_types.
> 
> Except for the first packet_type, we increase skb->users instead of
> skb_clone().
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/core/dev.c b/net/core/dev.c
index bf5ced5..888cb74 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1496,6 +1496,14 @@  int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(dev_forward_skb);
 
+static inline int deliver_skb(struct sk_buff *skb,
+			      struct packet_type *pt_prev,
+			      struct net_device *orig_dev)
+{
+	atomic_inc(&skb->users);
+	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
 /*
  *	Support routine. Sends outgoing frames to any network
  *	taps currently in use.
@@ -1504,6 +1512,8 @@  EXPORT_SYMBOL_GPL(dev_forward_skb);
 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct packet_type *ptype;
+	struct sk_buff *skb2 = NULL;
+	struct packet_type *pt_prev = NULL;
 
 #ifdef CONFIG_NET_CLS_ACT
 	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
@@ -1520,7 +1530,13 @@  static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 		if ((ptype->dev == dev || !ptype->dev) &&
 		    (ptype->af_packet_priv == NULL ||
 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
-			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+			if (pt_prev) {
+				deliver_skb(skb2, pt_prev, skb->dev);
+				pt_prev = ptype;
+				continue;
+			}
+
+			skb2 = skb_clone(skb, GFP_ATOMIC);
 			if (!skb2)
 				break;
 
@@ -1542,9 +1558,11 @@  static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 
 			skb2->transport_header = skb2->network_header;
 			skb2->pkt_type = PACKET_OUTGOING;
-			ptype->func(skb2, skb->dev, ptype, skb->dev);
+			pt_prev = ptype;
 		}
 	}
+	if (pt_prev)
+		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 	rcu_read_unlock();
 }
 
@@ -2788,14 +2806,6 @@  static void net_tx_action(struct softirq_action *h)
 	}
 }
 
-static inline int deliver_skb(struct sk_buff *skb,
-			      struct packet_type *pt_prev,
-			      struct net_device *orig_dev)
-{
-	atomic_inc(&skb->users);
-	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
-}
-
 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
 /* This hook is defined here for ATM LANE */