diff mbox

[net-next-2.6,V3] net: convert bonding to use rx_handler

Message ID 4D6027B9.6050108@gmail.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Nicolas de Pesloüan Feb. 19, 2011, 8:27 p.m. UTC
Le 19/02/2011 14:46, Jiri Pirko a écrit :
> Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>> Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>> Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>> Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>> Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>> This patch converts bonding to use rx_handler. Results in cleaner
>>>>>> __netif_receive_skb() with much less exceptions needed. Also
>>>>>> bond-specific work is moved into bond code.
>>>>>>
>>>>>> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>>
>>>>>> v1->v2:
>>>>>>          using skb_iif instead of new input_dev to remember original
>>>>>> 	device
>>>>>> v2->v3:
>>>>>> 	set orig_dev = skb->dev if skb_iif is set
>>>>>>
>>>>>
>>>>> Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>>
>>>>> Bonding used to be handled with very few overhead, simply replacing
>>>>> skb->dev with skb->dev->master. Time has passed and we eventually
>>>>> added many special processing for bonding into __netif_receive_skb(),
>>>>> but the overhead remained very light.
>>>>>
>>>>> Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>>
>>>>> Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>> whatever need to be delivered, to whoever need, inside the loop ?
>>>>>
>>>>> rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>> while (rx_handler) {
>>>>> 	/* ...  */
>>>>> 	orig_dev = skb->dev;
>>>>> 	skb = rx_handler(skb);
>>>>> 	/* ... */
>>>>> 	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>> }
>>>>>
>>>>> This would reduce the overhead, while still allowing nesting: vlan on
>>>>> top on bonding, bridge on top on bonding, ...
>>>>
>>>> I see your point. Makes sense to me. But the loop would have to include
>>>> at least processing of ptype_all too. I'm going to cook a follow-up
>>>> patch.
>>>>
>>>
>>> DRAFT (doesn't modify rx_handlers):
>>>
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index 4ebf7fe..e5dba47 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>   {
>>>   	struct packet_type *ptype, *pt_prev;
>>>   	rx_handler_func_t *rx_handler;
>>> +	struct net_device *dev;
>>>   	struct net_device *orig_dev;
>>>   	struct net_device *null_or_dev;
>>>   	int ret = NET_RX_DROP;
>>> @@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>   	if (netpoll_receive_skb(skb))
>>>   		return NET_RX_DROP;
>>>
>>> -	__this_cpu_inc(softnet_data.processed);
>>> +	skb->skb_iif = skb->dev->ifindex;
>>> +	orig_dev = skb->dev;
>>
>> orig_dev should be set inside the loop, to reflect "previously
>> crossed device", while following the path:
>>
>> eth0 ->  bond0 ->  br0.
>>
>> First step inside loop:
>>
>> orig_dev = eth0
>> skb->dev = bond0 (at the end of the loop).
>>
>> Second step inside loop:
>>
>> orig_dev = bond0
>> skb->dev = br0 (et the end of the loop).
>>
>> This would allow for exact match delivery to bond0 if someone bind there.
>>
>>> +
>>>   	skb_reset_network_header(skb);
>>>   	skb_reset_transport_header(skb);
>>>   	skb->mac_len = skb->network_header - skb->mac_header;
>>> @@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>
>>>   	rcu_read_lock();
>>>
>>> -	if (!skb->skb_iif) {
>>> -		skb->skb_iif = skb->dev->ifindex;
>>> -		orig_dev = skb->dev;
>>> -	} else {
>>> -		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>> -	}
>>
>> I like the fact that it removes the above part.
>>
>>> +another_round:
>>> +	__this_cpu_inc(softnet_data.processed);
>>> +	dev = skb->dev;
>>>
>>>   #ifdef CONFIG_NET_CLS_ACT
>>>   	if (skb->tc_verd&   TC_NCLS) {
>>> @@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>   #endif
>>>
>>>   	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>> -		if (!ptype->dev || ptype->dev == skb->dev) {
>>> +		if (!ptype->dev || ptype->dev == dev) {
>>>   			if (pt_prev)
>>>   				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>   			pt_prev = ptype;
>>
>> Inside the loop, we should only do exact match delivery, for
>> &ptype_all and for&ptype_base[ntohs(type)&  PTYPE_HASH_MASK]:
>>
>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> -               if (!ptype->dev || ptype->dev == dev) {
>> +               if (ptype->dev == dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>>
>>         list_for_each_entry_rcu(ptype,
>>                         &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>                 if (ptype->type == type&&
>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> +                   (ptype->dev == skb->dev)) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>> After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>
>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> -               if (!ptype->dev || ptype->dev == dev) {
>> +               if (!ptype->dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                }
>>         }
>>
>>
>>         list_for_each_entry_rcu(ptype,
>>                         &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>> -               if (ptype->type == type&&
>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> +		if (ptype->type == type&&  !ptype->dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>> This would reduce the number of tests inside the
>> list_for_each_entry_rcu() loops. And because we match only ptype->dev
>> == dev inside the loop and !ptype->dev outside the loop, this should
>> avoid duplicate delivery.
>
> Would you care to put this into patch so I can see the whole picture?
> Thanks.

Here is what I have in mind. It is based on your previous DRAFT patch, and don't modify rx_handlers yet.

Only compile tested !!

I don't know if every pieces are at the right place. I wonder what to do with CONFIG_NET_CLS_ACT 
part, that currently is between ptype_all and ptype_base processing.

Anyway, the general idea is there.

	Nicolas.

  net/core/dev.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
  1 files changed, 60 insertions(+), 10 deletions(-)

Comments

Jiri Pirko Feb. 20, 2011, 10:36 a.m. UTC | #1
Sat, Feb 19, 2011 at 09:27:37PM CET, nicolas.2p.debian@gmail.com wrote:
>Le 19/02/2011 14:46, Jiri Pirko a écrit :
>>Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>>>Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>>>Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>>>Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>>>Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>>>This patch converts bonding to use rx_handler. Results in cleaner
>>>>>>>__netif_receive_skb() with much less exceptions needed. Also
>>>>>>>bond-specific work is moved into bond code.
>>>>>>>
>>>>>>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>>>
>>>>>>>v1->v2:
>>>>>>>         using skb_iif instead of new input_dev to remember original
>>>>>>>	device
>>>>>>>v2->v3:
>>>>>>>	set orig_dev = skb->dev if skb_iif is set
>>>>>>>
>>>>>>
>>>>>>Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>>>
>>>>>>Bonding used to be handled with very few overhead, simply replacing
>>>>>>skb->dev with skb->dev->master. Time has passed and we eventually
>>>>>>added many special processing for bonding into __netif_receive_skb(),
>>>>>>but the overhead remained very light.
>>>>>>
>>>>>>Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>>>
>>>>>>Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>>>whatever need to be delivered, to whoever need, inside the loop ?
>>>>>>
>>>>>>rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>>>while (rx_handler) {
>>>>>>	/* ...  */
>>>>>>	orig_dev = skb->dev;
>>>>>>	skb = rx_handler(skb);
>>>>>>	/* ... */
>>>>>>	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>>>}
>>>>>>
>>>>>>This would reduce the overhead, while still allowing nesting: vlan on
>>>>>>top on bonding, bridge on top on bonding, ...
>>>>>
>>>>>I see your point. Makes sense to me. But the loop would have to include
>>>>>at least processing of ptype_all too. I'm going to cook a follow-up
>>>>>patch.
>>>>>
>>>>
>>>>DRAFT (doesn't modify rx_handlers):
>>>>
>>>>diff --git a/net/core/dev.c b/net/core/dev.c
>>>>index 4ebf7fe..e5dba47 100644
>>>>--- a/net/core/dev.c
>>>>+++ b/net/core/dev.c
>>>>@@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>  {
>>>>  	struct packet_type *ptype, *pt_prev;
>>>>  	rx_handler_func_t *rx_handler;
>>>>+	struct net_device *dev;
>>>>  	struct net_device *orig_dev;
>>>>  	struct net_device *null_or_dev;
>>>>  	int ret = NET_RX_DROP;
>>>>@@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>  	if (netpoll_receive_skb(skb))
>>>>  		return NET_RX_DROP;
>>>>
>>>>-	__this_cpu_inc(softnet_data.processed);
>>>>+	skb->skb_iif = skb->dev->ifindex;
>>>>+	orig_dev = skb->dev;
>>>
>>>orig_dev should be set inside the loop, to reflect "previously
>>>crossed device", while following the path:
>>>
>>>eth0 ->  bond0 ->  br0.
>>>
>>>First step inside loop:
>>>
>>>orig_dev = eth0
>>>skb->dev = bond0 (at the end of the loop).
>>>
>>>Second step inside loop:
>>>
>>>orig_dev = bond0
>>>skb->dev = br0 (et the end of the loop).
>>>
>>>This would allow for exact match delivery to bond0 if someone bind there.
>>>
>>>>+
>>>>  	skb_reset_network_header(skb);
>>>>  	skb_reset_transport_header(skb);
>>>>  	skb->mac_len = skb->network_header - skb->mac_header;
>>>>@@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>
>>>>  	rcu_read_lock();
>>>>
>>>>-	if (!skb->skb_iif) {
>>>>-		skb->skb_iif = skb->dev->ifindex;
>>>>-		orig_dev = skb->dev;
>>>>-	} else {
>>>>-		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>>>-	}
>>>
>>>I like the fact that it removes the above part.
>>>
>>>>+another_round:
>>>>+	__this_cpu_inc(softnet_data.processed);
>>>>+	dev = skb->dev;
>>>>
>>>>  #ifdef CONFIG_NET_CLS_ACT
>>>>  	if (skb->tc_verd&   TC_NCLS) {
>>>>@@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>  #endif
>>>>
>>>>  	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>-		if (!ptype->dev || ptype->dev == skb->dev) {
>>>>+		if (!ptype->dev || ptype->dev == dev) {
>>>>  			if (pt_prev)
>>>>  				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>  			pt_prev = ptype;
>>>
>>>Inside the loop, we should only do exact match delivery, for
>>>&ptype_all and for&ptype_base[ntohs(type)&  PTYPE_HASH_MASK]:
>>>
>>>        list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>-               if (!ptype->dev || ptype->dev == dev) {
>>>+               if (ptype->dev == dev) {
>>>                        if (pt_prev)
>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>                        pt_prev = ptype;
>>>                }
>>>        }
>>>
>>>
>>>        list_for_each_entry_rcu(ptype,
>>>                        &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>>                if (ptype->type == type&&
>>>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>+                   (ptype->dev == skb->dev)) {
>>>                        if (pt_prev)
>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>                        pt_prev = ptype;
>>>                }
>>>        }
>>>
>>>After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>>
>>>        list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>-               if (!ptype->dev || ptype->dev == dev) {
>>>+               if (!ptype->dev) {
>>>                        if (pt_prev)
>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>                        pt_prev = ptype;
>>>               }
>>>        }
>>>
>>>
>>>        list_for_each_entry_rcu(ptype,
>>>                        &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>>-               if (ptype->type == type&&
>>>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>+		if (ptype->type == type&&  !ptype->dev) {
>>>                        if (pt_prev)
>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>                        pt_prev = ptype;
>>>                }
>>>        }
>>>
>>>This would reduce the number of tests inside the
>>>list_for_each_entry_rcu() loops. And because we match only ptype->dev
>>>== dev inside the loop and !ptype->dev outside the loop, this should
>>>avoid duplicate delivery.
>>
>>Would you care to put this into patch so I can see the whole picture?
>>Thanks.
>
>Here is what I have in mind. It is based on your previous DRAFT patch, and don't modify rx_handlers yet.
>
>Only compile tested !!
>
>I don't know if every pieces are at the right place. I wonder what to
>do with CONFIG_NET_CLS_ACT part, that currently is between ptype_all
>and ptype_base processing.
>
>Anyway, the general idea is there.
>
>	Nicolas.
>
> net/core/dev.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
> 1 files changed, 60 insertions(+), 10 deletions(-)
>
>diff --git a/net/core/dev.c b/net/core/dev.c
>index e5dba47..7e007a9 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -3117,7 +3117,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
> 	rx_handler_func_t *rx_handler;
> 	struct net_device *dev;
> 	struct net_device *orig_dev;
>-	struct net_device *null_or_dev;
> 	int ret = NET_RX_DROP;
> 	__be16 type;
>
>@@ -3130,9 +3129,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
> 	if (netpoll_receive_skb(skb))
> 		return NET_RX_DROP;
>
>-	skb->skb_iif = skb->dev->ifindex;
>-	orig_dev = skb->dev;
>-
> 	skb_reset_network_header(skb);
> 	skb_reset_transport_header(skb);
> 	skb->mac_len = skb->network_header - skb->mac_header;
>@@ -3143,6 +3139,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>
> another_round:
> 	__this_cpu_inc(softnet_data.processed);
>+	skb->skb_iif = skb->dev->ifindex;
>+	orig_dev = skb->dev;
orig_dev should be set at the end of the loop. Now you are going to have
it always the same as dev and skb->dev.

> 	dev = skb->dev;
>
> #ifdef CONFIG_NET_CLS_ACT
>@@ -3152,8 +3150,13 @@ another_round:
> 	}
> #endif
>
>+	/*
>+	 * Deliver to ptype_all protocol handlers that match current dev.
>+	 * This happens before rx_handler is given a chance to change skb->dev.
>+	 */
>+
> 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>-		if (!ptype->dev || ptype->dev == dev) {
>+		if (ptype->dev == dev) {
> 			if (pt_prev)
> 				ret = deliver_skb(skb, pt_prev, orig_dev);
> 			pt_prev = ptype;
>@@ -3167,6 +3170,31 @@ another_round:
> ncls:
> #endif
>
>+	/*
>+	 * Deliver to ptype_base protocol handlers that match current dev.
>+	 * This happens before rx_handler is given a chance to change skb->dev.
>+	 */
>+
>+	type = skb->protocol;
>+	list_for_each_entry_rcu(ptype,
>+			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
>+		if (ptype->type == type && ptype->dev == skb->dev) {
>+			if (pt_prev)
>+				ret = deliver_skb(skb, pt_prev, orig_dev);
>+			pt_prev = ptype;
>+		}
>+	}

I'm not sure it is ok to deliver ptype_base here. See comment above
ptype_head() (I'm not sure I understand that correctly)

>+
>+	/*
>+	 * Call rx_handler for current device.
>+	 * If rx_handler return NULL, skip wilcard protocol handler delivery.
>+	 * Else, if skb->dev changed, restart the whole delivery process, to
>+	 * allow for device nesting.
>+	 *
>+	 * Warning:
>+	 * rx_handlers must kfree_skb(skb) if they return NULL.
Well this is not true. They can return NULL and call netif_rx as they
have before. No changes necessary I believe.

>+	 */
>+
> 	rx_handler = rcu_dereference(dev->rx_handler);
> 	if (rx_handler) {
> 		if (pt_prev) {
>@@ -3176,10 +3204,15 @@ ncls:
> 		skb = rx_handler(skb);
> 		if (!skb)
> 			goto out;
>-		if (dev != skb->dev)
>+		if (skb->dev != dev)
> 			goto another_round;
> 	}
>
>+	/*
>+	 * FIXME: The part below should use rx_handler instead of being hard
>+	 * coded here.
I'm not sure it is doable atm. For bridge and bond it should not be a
problem, but for macvlan, there is possible to have macvlans and vlans
on the same dev. This possibility should persist.
/me scratches head on the idea to have multiple rx_handlers although it
was his original idea....


>+	 */
>+
> 	if (vlan_tx_tag_present(skb)) {
> 		if (pt_prev) {
> 			ret = deliver_skb(skb, pt_prev, orig_dev);
>@@ -3192,16 +3225,33 @@ ncls:
> 			goto out;
> 	}
>
>+	/*
>+	 * FIXME: Can't this be moved into the rx_handler for bonding,
>+	 * or into a futur rx_handler for vlan?
This hook is something I do not like at all :/ But anyway if should be in vlan
part I think.

>+	 */
>+
> 	vlan_on_bond_hook(skb);
>
>-	/* deliver only exact match when indicated */
>-	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
>+	/*
>+	 * Deliver to wildcard ptype_all protocol handlers.
>+	 */
>+
>+	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>+		if (!ptype->dev) {
>+			if (pt_prev)
>+				ret = deliver_skb(skb, pt_prev, orig_dev);
>+			pt_prev = ptype;
>+		}
>+	}
>+
>+	/*
>+	 * Deliver to wildcard ptype_all protocol handlers.
>+	 */
>
> 	type = skb->protocol;
> 	list_for_each_entry_rcu(ptype,
> 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
>-		if (ptype->type == type &&
>-		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>+		if (ptype->type == type && !ptype->dev) {
> 			if (pt_prev)
> 				ret = deliver_skb(skb, pt_prev, orig_dev);
> 			pt_prev = ptype;
>-- 
>1.7.2.3
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas de Pesloüan Feb. 20, 2011, 12:12 p.m. UTC | #2
Le 20/02/2011 11:36, Jiri Pirko a écrit :
> Sat, Feb 19, 2011 at 09:27:37PM CET, nicolas.2p.debian@gmail.com wrote:
>> Le 19/02/2011 14:46, Jiri Pirko a écrit :
>>> Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>>>> Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>>>> Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>>>> Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>>>> Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>>>> This patch converts bonding to use rx_handler. Results in cleaner
>>>>>>>> __netif_receive_skb() with much less exceptions needed. Also
>>>>>>>> bond-specific work is moved into bond code.
>>>>>>>>
>>>>>>>> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>>>>
>>>>>>>> v1->v2:
>>>>>>>>          using skb_iif instead of new input_dev to remember original
>>>>>>>> 	device
>>>>>>>> v2->v3:
>>>>>>>> 	set orig_dev = skb->dev if skb_iif is set
>>>>>>>>
>>>>>>>
>>>>>>> Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>>>>
>>>>>>> Bonding used to be handled with very few overhead, simply replacing
>>>>>>> skb->dev with skb->dev->master. Time has passed and we eventually
>>>>>>> added many special processing for bonding into __netif_receive_skb(),
>>>>>>> but the overhead remained very light.
>>>>>>>
>>>>>>> Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>>>>
>>>>>>> Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>>>> whatever need to be delivered, to whoever need, inside the loop ?
>>>>>>>
>>>>>>> rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>>>> while (rx_handler) {
>>>>>>> 	/* ...  */
>>>>>>> 	orig_dev = skb->dev;
>>>>>>> 	skb = rx_handler(skb);
>>>>>>> 	/* ... */
>>>>>>> 	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>>>> }
>>>>>>>
>>>>>>> This would reduce the overhead, while still allowing nesting: vlan on
>>>>>>> top on bonding, bridge on top on bonding, ...
>>>>>>
>>>>>> I see your point. Makes sense to me. But the loop would have to include
>>>>>> at least processing of ptype_all too. I'm going to cook a follow-up
>>>>>> patch.
>>>>>>
>>>>>
>>>>> DRAFT (doesn't modify rx_handlers):
>>>>>
>>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>>> index 4ebf7fe..e5dba47 100644
>>>>> --- a/net/core/dev.c
>>>>> +++ b/net/core/dev.c
>>>>> @@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>   {
>>>>>   	struct packet_type *ptype, *pt_prev;
>>>>>   	rx_handler_func_t *rx_handler;
>>>>> +	struct net_device *dev;
>>>>>   	struct net_device *orig_dev;
>>>>>   	struct net_device *null_or_dev;
>>>>>   	int ret = NET_RX_DROP;
>>>>> @@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>   	if (netpoll_receive_skb(skb))
>>>>>   		return NET_RX_DROP;
>>>>>
>>>>> -	__this_cpu_inc(softnet_data.processed);
>>>>> +	skb->skb_iif = skb->dev->ifindex;
>>>>> +	orig_dev = skb->dev;
>>>>
>>>> orig_dev should be set inside the loop, to reflect "previously
>>>> crossed device", while following the path:
>>>>
>>>> eth0 ->   bond0 ->   br0.
>>>>
>>>> First step inside loop:
>>>>
>>>> orig_dev = eth0
>>>> skb->dev = bond0 (at the end of the loop).
>>>>
>>>> Second step inside loop:
>>>>
>>>> orig_dev = bond0
>>>> skb->dev = br0 (et the end of the loop).
>>>>
>>>> This would allow for exact match delivery to bond0 if someone bind there.
>>>>
>>>>> +
>>>>>   	skb_reset_network_header(skb);
>>>>>   	skb_reset_transport_header(skb);
>>>>>   	skb->mac_len = skb->network_header - skb->mac_header;
>>>>> @@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>
>>>>>   	rcu_read_lock();
>>>>>
>>>>> -	if (!skb->skb_iif) {
>>>>> -		skb->skb_iif = skb->dev->ifindex;
>>>>> -		orig_dev = skb->dev;
>>>>> -	} else {
>>>>> -		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>>>> -	}
>>>>
>>>> I like the fact that it removes the above part.
>>>>
>>>>> +another_round:
>>>>> +	__this_cpu_inc(softnet_data.processed);
>>>>> +	dev = skb->dev;
>>>>>
>>>>>   #ifdef CONFIG_NET_CLS_ACT
>>>>>   	if (skb->tc_verd&    TC_NCLS) {
>>>>> @@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>   #endif
>>>>>
>>>>>   	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>> -		if (!ptype->dev || ptype->dev == skb->dev) {
>>>>> +		if (!ptype->dev || ptype->dev == dev) {
>>>>>   			if (pt_prev)
>>>>>   				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>   			pt_prev = ptype;
>>>>
>>>> Inside the loop, we should only do exact match delivery, for
>>>> &ptype_all and for&ptype_base[ntohs(type)&   PTYPE_HASH_MASK]:
>>>>
>>>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>> -               if (!ptype->dev || ptype->dev == dev) {
>>>> +               if (ptype->dev == dev) {
>>>>                         if (pt_prev)
>>>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>                         pt_prev = ptype;
>>>>                 }
>>>>         }
>>>>
>>>>
>>>>         list_for_each_entry_rcu(ptype,
>>>>                         &ptype_base[ntohs(type)&   PTYPE_HASH_MASK], list) {
>>>>                 if (ptype->type == type&&
>>>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>> +                   (ptype->dev == skb->dev)) {
>>>>                         if (pt_prev)
>>>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>                         pt_prev = ptype;
>>>>                 }
>>>>         }
>>>>
>>>> After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>>>
>>>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>> -               if (!ptype->dev || ptype->dev == dev) {
>>>> +               if (!ptype->dev) {
>>>>                         if (pt_prev)
>>>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>                         pt_prev = ptype;
>>>>                }
>>>>         }
>>>>
>>>>
>>>>         list_for_each_entry_rcu(ptype,
>>>>                         &ptype_base[ntohs(type)&   PTYPE_HASH_MASK], list) {
>>>> -               if (ptype->type == type&&
>>>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>> +		if (ptype->type == type&&   !ptype->dev) {
>>>>                         if (pt_prev)
>>>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>                         pt_prev = ptype;
>>>>                 }
>>>>         }
>>>>
>>>> This would reduce the number of tests inside the
>>>> list_for_each_entry_rcu() loops. And because we match only ptype->dev
>>>> == dev inside the loop and !ptype->dev outside the loop, this should
>>>> avoid duplicate delivery.
>>>
>>> Would you care to put this into patch so I can see the whole picture?
>>> Thanks.
>>
>> Here is what I have in mind. It is based on your previous DRAFT patch, and don't modify rx_handlers yet.
>>
>> Only compile tested !!
>>
>> I don't know if every pieces are at the right place. I wonder what to
>> do with CONFIG_NET_CLS_ACT part, that currently is between ptype_all
>> and ptype_base processing.
>>
>> Anyway, the general idea is there.
>>
>> 	Nicolas.
>>
>> net/core/dev.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
>> 1 files changed, 60 insertions(+), 10 deletions(-)
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index e5dba47..7e007a9 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3117,7 +3117,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>> 	rx_handler_func_t *rx_handler;
>> 	struct net_device *dev;
>> 	struct net_device *orig_dev;
>> -	struct net_device *null_or_dev;
>> 	int ret = NET_RX_DROP;
>> 	__be16 type;
>>
>> @@ -3130,9 +3129,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>> 	if (netpoll_receive_skb(skb))
>> 		return NET_RX_DROP;
>>
>> -	skb->skb_iif = skb->dev->ifindex;
>> -	orig_dev = skb->dev;
>> -
>> 	skb_reset_network_header(skb);
>> 	skb_reset_transport_header(skb);
>> 	skb->mac_len = skb->network_header - skb->mac_header;
>> @@ -3143,6 +3139,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>
>> another_round:
>> 	__this_cpu_inc(softnet_data.processed);
>> +	skb->skb_iif = skb->dev->ifindex;
>> +	orig_dev = skb->dev;
> orig_dev should be set at the end of the loop. Now you are going to have
> it always the same as dev and skb->dev.
>

Yes, you are right.

I thinking about all this, I wonder what the protocol handlers expect as the orig_dev value ?

Lest imagine the following configuration: eth0 -> bond0 -> br0.

What does a protocol handler listening on br0 expect for orig_dev ? bond0 or eth0 ? Current 
implementation give eth0, but I think bond0 should be the right value, for proper nesting.

>> 	dev = skb->dev;
>>
>> #ifdef CONFIG_NET_CLS_ACT
>> @@ -3152,8 +3150,13 @@ another_round:
>> 	}
>> #endif
>>
>> +	/*
>> +	 * Deliver to ptype_all protocol handlers that match current dev.
>> +	 * This happens before rx_handler is given a chance to change skb->dev.
>> +	 */
>> +
>> 	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> -		if (!ptype->dev || ptype->dev == dev) {
>> +		if (ptype->dev == dev) {
>> 			if (pt_prev)
>> 				ret = deliver_skb(skb, pt_prev, orig_dev);
>> 			pt_prev = ptype;
>> @@ -3167,6 +3170,31 @@ another_round:
>> ncls:
>> #endif
>>
>> +	/*
>> +	 * Deliver to ptype_base protocol handlers that match current dev.
>> +	 * This happens before rx_handler is given a chance to change skb->dev.
>> +	 */
>> +
>> +	type = skb->protocol;
>> +	list_for_each_entry_rcu(ptype,
>> +			&ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>> +		if (ptype->type == type&&  ptype->dev == skb->dev) {
>> +			if (pt_prev)
>> +				ret = deliver_skb(skb, pt_prev, orig_dev);
>> +			pt_prev = ptype;
>> +		}
>> +	}
>
> I'm not sure it is ok to deliver ptype_base here. See comment above
> ptype_head() (I'm not sure I understand that correctly)

Anyway, all this is probably plain wrong: Delivering the skb to protocol handlers while still 
changing the skb is guaranteed to cause strange behaviors.

If we want to be able to deliver the skb to different protocol handlers and give all of them the 
right values for dev->skb and orig_dev (or previous_dev), we might end up with copying the skb. I 
hate the idea, but currently can't find a cleaner way to do so.

We first need to clarify what orig_dev should be, as stated above.

>> +
>> +	/*
>> +	 * Call rx_handler for current device.
>> +	 * If rx_handler return NULL, skip wilcard protocol handler delivery.
>> +	 * Else, if skb->dev changed, restart the whole delivery process, to
>> +	 * allow for device nesting.
>> +	 *
>> +	 * Warning:
>> +	 * rx_handlers must kfree_skb(skb) if they return NULL.
> Well this is not true. They can return NULL and call netif_rx as they
> have before. No changes necessary I believe.

I don't really know. This needs to be double checked, anyway.

>> +	 */
>> +
>> 	rx_handler = rcu_dereference(dev->rx_handler);
>> 	if (rx_handler) {
>> 		if (pt_prev) {
>> @@ -3176,10 +3204,15 @@ ncls:
>> 		skb = rx_handler(skb);
>> 		if (!skb)
>> 			goto out;
>> -		if (dev != skb->dev)
>> +		if (skb->dev != dev)
>> 			goto another_round;
>> 	}
>>
>> +	/*
>> +	 * FIXME: The part below should use rx_handler instead of being hard
>> +	 * coded here.
> I'm not sure it is doable atm. For bridge and bond it should not be a
> problem, but for macvlan, there is possible to have macvlans and vlans
> on the same dev. This possibility should persist.
> /me scratches head on the idea to have multiple rx_handlers although it
> was his original idea....

I think your original proposal of having several rx_handlers per device was right.

At the time you introduced the rx_handler system, only bridge and macvlan used it. Even if using 
bridge and macvlan on the same base device might be useless, this is not true for every possible 
rx_handler configuration.

Now that we want to move bonding and vlan to the rx_handler system, it becomes obvious that we need 
several rx_handlers per device. At least, vlan should properly mix with bridge. And who know what 
would be the fifth rx_handler...

>> +	 */
>> +
>> 	if (vlan_tx_tag_present(skb)) {
>> 		if (pt_prev) {
>> 			ret = deliver_skb(skb, pt_prev, orig_dev);
>> @@ -3192,16 +3225,33 @@ ncls:
>> 			goto out;
>> 	}
>>
>> +	/*
>> +	 * FIXME: Can't this be moved into the rx_handler for bonding,
>> +	 * or into a futur rx_handler for vlan?
> This hook is something I do not like at all :/ But anyway if should be in vlan
> part I think.

Yes, and in order for the future rx_handler for vlan to properly handle it, it needs to know the 
device just below it, not the pure original device. Hence, my question about the exact meaning of 
orig_dev...

	Nicolas.

>> +	 */
>> +
>> 	vlan_on_bond_hook(skb);
>>
>> -	/* deliver only exact match when indicated */
>> -	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
>> +	/*
>> +	 * Deliver to wildcard ptype_all protocol handlers.
>> +	 */
>> +
>> +	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> +		if (!ptype->dev) {
>> +			if (pt_prev)
>> +				ret = deliver_skb(skb, pt_prev, orig_dev);
>> +			pt_prev = ptype;
>> +		}
>> +	}
>> +
>> +	/*
>> +	 * Deliver to wildcard ptype_all protocol handlers.
>> +	 */
>>
>> 	type = skb->protocol;
>> 	list_for_each_entry_rcu(ptype,
>> 			&ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>> -		if (ptype->type == type&&
>> -		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> +		if (ptype->type == type&&  !ptype->dev) {
>> 			if (pt_prev)
>> 				ret = deliver_skb(skb, pt_prev, orig_dev);
>> 			pt_prev = ptype;
>> --
>> 1.7.2.3
>>
>>
>>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Feb. 20, 2011, 3:07 p.m. UTC | #3
Sun, Feb 20, 2011 at 01:12:01PM CET, nicolas.2p.debian@gmail.com wrote:
>Le 20/02/2011 11:36, Jiri Pirko a écrit :
>>Sat, Feb 19, 2011 at 09:27:37PM CET, nicolas.2p.debian@gmail.com wrote:
>>>Le 19/02/2011 14:46, Jiri Pirko a écrit :
>>>>Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>>>>>Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>>>>>Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>>>>>Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>>>>>Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>>>>>This patch converts bonding to use rx_handler. Results in cleaner
>>>>>>>>>__netif_receive_skb() with much less exceptions needed. Also
>>>>>>>>>bond-specific work is moved into bond code.
>>>>>>>>>
>>>>>>>>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>>>>>
>>>>>>>>>v1->v2:
>>>>>>>>>         using skb_iif instead of new input_dev to remember original
>>>>>>>>>	device
>>>>>>>>>v2->v3:
>>>>>>>>>	set orig_dev = skb->dev if skb_iif is set
>>>>>>>>>
>>>>>>>>
>>>>>>>>Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>>>>>
>>>>>>>>Bonding used to be handled with very few overhead, simply replacing
>>>>>>>>skb->dev with skb->dev->master. Time has passed and we eventually
>>>>>>>>added many special processing for bonding into __netif_receive_skb(),
>>>>>>>>but the overhead remained very light.
>>>>>>>>
>>>>>>>>Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>>>>>
>>>>>>>>Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>>>>>whatever need to be delivered, to whoever need, inside the loop ?
>>>>>>>>
>>>>>>>>rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>>>>>while (rx_handler) {
>>>>>>>>	/* ...  */
>>>>>>>>	orig_dev = skb->dev;
>>>>>>>>	skb = rx_handler(skb);
>>>>>>>>	/* ... */
>>>>>>>>	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>>>>>}
>>>>>>>>
>>>>>>>>This would reduce the overhead, while still allowing nesting: vlan on
>>>>>>>>top on bonding, bridge on top on bonding, ...
>>>>>>>
>>>>>>>I see your point. Makes sense to me. But the loop would have to include
>>>>>>>at least processing of ptype_all too. I'm going to cook a follow-up
>>>>>>>patch.
>>>>>>>
>>>>>>
>>>>>>DRAFT (doesn't modify rx_handlers):
>>>>>>
>>>>>>diff --git a/net/core/dev.c b/net/core/dev.c
>>>>>>index 4ebf7fe..e5dba47 100644
>>>>>>--- a/net/core/dev.c
>>>>>>+++ b/net/core/dev.c
>>>>>>@@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>>  {
>>>>>>  	struct packet_type *ptype, *pt_prev;
>>>>>>  	rx_handler_func_t *rx_handler;
>>>>>>+	struct net_device *dev;
>>>>>>  	struct net_device *orig_dev;
>>>>>>  	struct net_device *null_or_dev;
>>>>>>  	int ret = NET_RX_DROP;
>>>>>>@@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>>  	if (netpoll_receive_skb(skb))
>>>>>>  		return NET_RX_DROP;
>>>>>>
>>>>>>-	__this_cpu_inc(softnet_data.processed);
>>>>>>+	skb->skb_iif = skb->dev->ifindex;
>>>>>>+	orig_dev = skb->dev;
>>>>>
>>>>>orig_dev should be set inside the loop, to reflect "previously
>>>>>crossed device", while following the path:
>>>>>
>>>>>eth0 ->   bond0 ->   br0.
>>>>>
>>>>>First step inside loop:
>>>>>
>>>>>orig_dev = eth0
>>>>>skb->dev = bond0 (at the end of the loop).
>>>>>
>>>>>Second step inside loop:
>>>>>
>>>>>orig_dev = bond0
>>>>>skb->dev = br0 (et the end of the loop).
>>>>>
>>>>>This would allow for exact match delivery to bond0 if someone bind there.
>>>>>
>>>>>>+
>>>>>>  	skb_reset_network_header(skb);
>>>>>>  	skb_reset_transport_header(skb);
>>>>>>  	skb->mac_len = skb->network_header - skb->mac_header;
>>>>>>@@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>>
>>>>>>  	rcu_read_lock();
>>>>>>
>>>>>>-	if (!skb->skb_iif) {
>>>>>>-		skb->skb_iif = skb->dev->ifindex;
>>>>>>-		orig_dev = skb->dev;
>>>>>>-	} else {
>>>>>>-		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>>>>>-	}
>>>>>
>>>>>I like the fact that it removes the above part.
>>>>>
>>>>>>+another_round:
>>>>>>+	__this_cpu_inc(softnet_data.processed);
>>>>>>+	dev = skb->dev;
>>>>>>
>>>>>>  #ifdef CONFIG_NET_CLS_ACT
>>>>>>  	if (skb->tc_verd&    TC_NCLS) {
>>>>>>@@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>>  #endif
>>>>>>
>>>>>>  	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>>>-		if (!ptype->dev || ptype->dev == skb->dev) {
>>>>>>+		if (!ptype->dev || ptype->dev == dev) {
>>>>>>  			if (pt_prev)
>>>>>>  				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>>  			pt_prev = ptype;
>>>>>
>>>>>Inside the loop, we should only do exact match delivery, for
>>>>>&ptype_all and for&ptype_base[ntohs(type)&   PTYPE_HASH_MASK]:
>>>>>
>>>>>        list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>>-               if (!ptype->dev || ptype->dev == dev) {
>>>>>+               if (ptype->dev == dev) {
>>>>>                        if (pt_prev)
>>>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>                        pt_prev = ptype;
>>>>>                }
>>>>>        }
>>>>>
>>>>>
>>>>>        list_for_each_entry_rcu(ptype,
>>>>>                        &ptype_base[ntohs(type)&   PTYPE_HASH_MASK], list) {
>>>>>                if (ptype->type == type&&
>>>>>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>>>+                   (ptype->dev == skb->dev)) {
>>>>>                        if (pt_prev)
>>>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>                        pt_prev = ptype;
>>>>>                }
>>>>>        }
>>>>>
>>>>>After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>>>>
>>>>>        list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>>-               if (!ptype->dev || ptype->dev == dev) {
>>>>>+               if (!ptype->dev) {
>>>>>                        if (pt_prev)
>>>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>                        pt_prev = ptype;
>>>>>               }
>>>>>        }
>>>>>
>>>>>
>>>>>        list_for_each_entry_rcu(ptype,
>>>>>                        &ptype_base[ntohs(type)&   PTYPE_HASH_MASK], list) {
>>>>>-               if (ptype->type == type&&
>>>>>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>>>+		if (ptype->type == type&&   !ptype->dev) {
>>>>>                        if (pt_prev)
>>>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>                        pt_prev = ptype;
>>>>>                }
>>>>>        }
>>>>>
>>>>>This would reduce the number of tests inside the
>>>>>list_for_each_entry_rcu() loops. And because we match only ptype->dev
>>>>>== dev inside the loop and !ptype->dev outside the loop, this should
>>>>>avoid duplicate delivery.
>>>>
>>>>Would you care to put this into patch so I can see the whole picture?
>>>>Thanks.
>>>
>>>Here is what I have in mind. It is based on your previous DRAFT patch, and don't modify rx_handlers yet.
>>>
>>>Only compile tested !!
>>>
>>>I don't know if every pieces are at the right place. I wonder what to
>>>do with CONFIG_NET_CLS_ACT part, that currently is between ptype_all
>>>and ptype_base processing.
>>>
>>>Anyway, the general idea is there.
>>>
>>>	Nicolas.
>>>
>>>net/core/dev.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
>>>1 files changed, 60 insertions(+), 10 deletions(-)
>>>
>>>diff --git a/net/core/dev.c b/net/core/dev.c
>>>index e5dba47..7e007a9 100644
>>>--- a/net/core/dev.c
>>>+++ b/net/core/dev.c
>>>@@ -3117,7 +3117,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>	rx_handler_func_t *rx_handler;
>>>	struct net_device *dev;
>>>	struct net_device *orig_dev;
>>>-	struct net_device *null_or_dev;
>>>	int ret = NET_RX_DROP;
>>>	__be16 type;
>>>
>>>@@ -3130,9 +3129,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>	if (netpoll_receive_skb(skb))
>>>		return NET_RX_DROP;
>>>
>>>-	skb->skb_iif = skb->dev->ifindex;
>>>-	orig_dev = skb->dev;
>>>-
>>>	skb_reset_network_header(skb);
>>>	skb_reset_transport_header(skb);
>>>	skb->mac_len = skb->network_header - skb->mac_header;
>>>@@ -3143,6 +3139,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>
>>>another_round:
>>>	__this_cpu_inc(softnet_data.processed);
>>>+	skb->skb_iif = skb->dev->ifindex;
>>>+	orig_dev = skb->dev;
>>orig_dev should be set at the end of the loop. Now you are going to have
>>it always the same as dev and skb->dev.
>>
>
>Yes, you are right.
>
>I thinking about all this, I wonder what the protocol handlers expect as the orig_dev value ?
>
>Lest imagine the following configuration: eth0 -> bond0 -> br0.
>
>What does a protocol handler listening on br0 expect for orig_dev ?
>bond0 or eth0 ? Current implementation give eth0, but I think bond0
>should be the right value, for proper nesting.

I agree with you.

>
>>>	dev = skb->dev;
>>>
>>>#ifdef CONFIG_NET_CLS_ACT
>>>@@ -3152,8 +3150,13 @@ another_round:
>>>	}
>>>#endif
>>>
>>>+	/*
>>>+	 * Deliver to ptype_all protocol handlers that match current dev.
>>>+	 * This happens before rx_handler is given a chance to change skb->dev.
>>>+	 */
>>>+
>>>	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>-		if (!ptype->dev || ptype->dev == dev) {
>>>+		if (ptype->dev == dev) {
>>>			if (pt_prev)
>>>				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>			pt_prev = ptype;
>>>@@ -3167,6 +3170,31 @@ another_round:
>>>ncls:
>>>#endif
>>>
>>>+	/*
>>>+	 * Deliver to ptype_base protocol handlers that match current dev.
>>>+	 * This happens before rx_handler is given a chance to change skb->dev.
>>>+	 */
>>>+
>>>+	type = skb->protocol;
>>>+	list_for_each_entry_rcu(ptype,
>>>+			&ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>>+		if (ptype->type == type&&  ptype->dev == skb->dev) {
>>>+			if (pt_prev)
>>>+				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>+			pt_prev = ptype;
>>>+		}
>>>+	}
>>
>>I'm not sure it is ok to deliver ptype_base here. See comment above
>>ptype_head() (I'm not sure I understand that correctly)
>
>Anyway, all this is probably plain wrong: Delivering the skb to
>protocol handlers while still changing the skb is guaranteed to cause
>strange behaviors.
>
>If we want to be able to deliver the skb to different protocol
>handlers and give all of them the right values for dev->skb and
>orig_dev (or previous_dev), we might end up with copying the skb. I
>hate the idea, but currently can't find a cleaner way to do so.

That would be unfortunate :/
>
>We first need to clarify what orig_dev should be, as stated above.
>
>>>+
>>>+	/*
>>>+	 * Call rx_handler for current device.
>>>+	 * If rx_handler return NULL, skip wilcard protocol handler delivery.
>>>+	 * Else, if skb->dev changed, restart the whole delivery process, to
>>>+	 * allow for device nesting.
>>>+	 *
>>>+	 * Warning:
>>>+	 * rx_handlers must kfree_skb(skb) if they return NULL.
>>Well this is not true. They can return NULL and call netif_rx as they
>>have before. No changes necessary I believe.
>
>I don't really know. This needs to be double checked, anyway.
>
>>>+	 */
>>>+
>>>	rx_handler = rcu_dereference(dev->rx_handler);
>>>	if (rx_handler) {
>>>		if (pt_prev) {
>>>@@ -3176,10 +3204,15 @@ ncls:
>>>		skb = rx_handler(skb);
>>>		if (!skb)
>>>			goto out;
>>>-		if (dev != skb->dev)
>>>+		if (skb->dev != dev)
>>>			goto another_round;
>>>	}
>>>
>>>+	/*
>>>+	 * FIXME: The part below should use rx_handler instead of being hard
>>>+	 * coded here.
>>I'm not sure it is doable atm. For bridge and bond it should not be a
>>problem, but for macvlan, there is possible to have macvlans and vlans
>>on the same dev. This possibility should persist.
>>/me scratches head on the idea to have multiple rx_handlers although it
>>was his original idea....
>
>I think your original proposal of having several rx_handlers per device was right.
>
>At the time you introduced the rx_handler system, only bridge and
>macvlan used it. Even if using bridge and macvlan on the same base
>device might be useless, this is not true for every possible
>rx_handler configuration.
>
>Now that we want to move bonding and vlan to the rx_handler system,
>it becomes obvious that we need several rx_handlers per device. At
>least, vlan should properly mix with bridge. And who know what would
>be the fifth rx_handler...
>
>>>+	 */
>>>+
>>>	if (vlan_tx_tag_present(skb)) {
>>>		if (pt_prev) {
>>>			ret = deliver_skb(skb, pt_prev, orig_dev);
>>>@@ -3192,16 +3225,33 @@ ncls:
>>>			goto out;
>>>	}
>>>
>>>+	/*
>>>+	 * FIXME: Can't this be moved into the rx_handler for bonding,
>>>+	 * or into a futur rx_handler for vlan?
>>This hook is something I do not like at all :/ But anyway if should be in vlan
>>part I think.
>
>Yes, and in order for the future rx_handler for vlan to properly
>handle it, it needs to know the device just below it, not the pure
>original device. Hence, my question about the exact meaning of
>orig_dev...
>
>	Nicolas.
>
>>>+	 */
>>>+
>>>	vlan_on_bond_hook(skb);
>>>
>>>-	/* deliver only exact match when indicated */
>>>-	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
>>>+	/*
>>>+	 * Deliver to wildcard ptype_all protocol handlers.
>>>+	 */
>>>+
>>>+	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>+		if (!ptype->dev) {
>>>+			if (pt_prev)
>>>+				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>+			pt_prev = ptype;
>>>+		}
>>>+	}
>>>+
>>>+	/*
>>>+	 * Deliver to wildcard ptype_all protocol handlers.
>>>+	 */
>>>
>>>	type = skb->protocol;
>>>	list_for_each_entry_rcu(ptype,
>>>			&ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>>-		if (ptype->type == type&&
>>>-		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>+		if (ptype->type == type&&  !ptype->dev) {
>>>			if (pt_prev)
>>>				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>			pt_prev = ptype;
>>>--
>>>1.7.2.3
>>>
>>>
>>>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas de Pesloüan Feb. 21, 2011, 11:20 p.m. UTC | #4
Le 20/02/2011 16:07, Jiri Pirko a écrit :
> Sun, Feb 20, 2011 at 01:12:01PM CET, nicolas.2p.debian@gmail.com wrote:

[snip]

>> And thinking about all this, I wonder what the protocol handlers expect as the orig_dev value ?
>>
>> Lets imagine the following configuration: eth0 ->  bond0 ->  br0.
>>
>> What does a protocol handler listening on br0 expect for orig_dev ?
>> bond0 or eth0 ? Current implementation give eth0, but I think bond0
>> should be the right value, for proper nesting.
>
> I agree with you.

[snip}

>>> This hook is something I do not like at all :/ But anyway if should be in vlan
>>> part I think.
>>
>> Yes, and in order for the future rx_handler for vlan to properly
>> handle it, it needs to know the device just below it, not the pure
>> original device. Hence, my question about the exact meaning of
>> orig_dev...

After checking every protocol handlers installed by dev_add_pack(), it appears that only 4 of them 
really use the orig_dev parameter given by __netif_receive_skb():

- bond_3ad_lacpdu_recv() @ drivers/net/bonding/bond_3ad.c
- bond_arp_recv() @ drivers/net/bonding/bond_main.c
- packet_rcv() @ net/packet/af_packet.c
- tpacket_rcv() @ net/packet/af_packet.c

 From the bonding point of view, the meaning of orig_dev is obviously "the device one layer below 
the bonding device, through which the packet reached the bonding device". It is used by 
bond_3ad_lacpdu_recv() and bond_arp_recv(), to find the underlying slave device through which the 
LACPDU or ARP was received. (The protocol handler is registered at the bonding device level).

 From the af_packet point of view, the meaning is documented (in commit "[AF_PACKET]: Add option to 
return orig_dev to userspace") as the "physical device [that] actually received the traffic, instead 
of having the encapsulating device hide that information."

When the bonding device is just one level above the physical device, the two meanings happen to 
match the same device, by chance.

So, currently, a bonding device cannot stack properly on top of anything but physical devices. It 
might not be a problem today, but may change in the future...

	Nicolas.






--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas de Pesloüan Feb. 26, 2011, 2:24 p.m. UTC | #5
Le 22/02/2011 00:20, Nicolas de Pesloüan a écrit :

> After checking every protocol handlers installed by dev_add_pack(), it
> appears that only 4 of them really use the orig_dev parameter given by
> __netif_receive_skb():
>
> - bond_3ad_lacpdu_recv() @ drivers/net/bonding/bond_3ad.c
> - bond_arp_recv() @ drivers/net/bonding/bond_main.c
> - packet_rcv() @ net/packet/af_packet.c
> - tpacket_rcv() @ net/packet/af_packet.c
>
>  From the bonding point of view, the meaning of orig_dev is obviously
> "the device one layer below the bonding device, through which the packet
> reached the bonding device". It is used by bond_3ad_lacpdu_recv() and
> bond_arp_recv(), to find the underlying slave device through which the
> LACPDU or ARP was received. (The protocol handler is registered at the
> bonding device level).
>
>  From the af_packet point of view, the meaning is documented (in commit
> "[AF_PACKET]: Add option to return orig_dev to userspace") as the
> "physical device [that] actually received the traffic, instead of having
> the encapsulating device hide that information."
>
> When the bonding device is just one level above the physical device, the
> two meanings happen to match the same device, by chance.
>
> So, currently, a bonding device cannot stack properly on top of anything
> but physical devices. It might not be a problem today, but may change in
> the future...

Hi Jay,

Still thinking about this orig_dev stuff, I wonder why the protocol handlers used in bonding 
(bond_3ad_lacpdu_recv() and bond_arp_rcv()) are registered at the master level instead of at the 
slave level ?

If they were registered at the slave level, they would simply receive skb->dev as the ingress 
interface and use this value instead of needing the orig_dev value given to them when they are 
registered at the master level.

As orig_dev is only used by bonding and by af_packet, but they disagree on the exact meaning of 
orig_dev, one way to fix this discrepancy would be to remove one of the usage. As the af_packet 
usage is exposed to user space, bonding seems the right place to stop using orig_dev, even if 
orig_dev was introduced for bonding :-)

I understand that this would add one entry per slave device to the ptype_base list, but this seems 
to be the only bad effect of registering at the slave level. Can you confirm that this was the 
reason to register at the master level instead?

If you think registering at the slave level would cause too much impact on ptype_base, then we might 
have another way to stop using orig_dev for bonding:

In __skb_bond_should_drop(), we already test for the two interesting protocols:

if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && skb->protocol == __cpu_to_be16(ETH_P_ARP))
	return 0;

if (master->priv_flags & IFF_MASTER_8023AD && skb->protocol == __cpu_to_be16(ETH_P_SLOW))
	return 0;

Would it be possible to call the right handlers directly from inside __skb_bond_should_drop() then 
let __skb_bond_should_drop() return 1 ("should drop") after processing the frames that are only of 
interest for bonding?

	Nicolas.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jay Vosburgh Feb. 26, 2011, 7:42 p.m. UTC | #6
Nicolas de Pesloüan 	<nicolas.2p.debian@gmail.com> wrote:

>Le 22/02/2011 00:20, Nicolas de Pesloüan a écrit :
>
>> After checking every protocol handlers installed by dev_add_pack(), it
>> appears that only 4 of them really use the orig_dev parameter given by
>> __netif_receive_skb():
>>
>> - bond_3ad_lacpdu_recv() @ drivers/net/bonding/bond_3ad.c
>> - bond_arp_recv() @ drivers/net/bonding/bond_main.c
>> - packet_rcv() @ net/packet/af_packet.c
>> - tpacket_rcv() @ net/packet/af_packet.c
>>
>>  From the bonding point of view, the meaning of orig_dev is obviously
>> "the device one layer below the bonding device, through which the packet
>> reached the bonding device". It is used by bond_3ad_lacpdu_recv() and
>> bond_arp_recv(), to find the underlying slave device through which the
>> LACPDU or ARP was received. (The protocol handler is registered at the
>> bonding device level).
>>
>>  From the af_packet point of view, the meaning is documented (in commit
>> "[AF_PACKET]: Add option to return orig_dev to userspace") as the
>> "physical device [that] actually received the traffic, instead of having
>> the encapsulating device hide that information."
>>
>> When the bonding device is just one level above the physical device, the
>> two meanings happen to match the same device, by chance.
>>
>> So, currently, a bonding device cannot stack properly on top of anything
>> but physical devices. It might not be a problem today, but may change in
>> the future...
>
>Hi Jay,
>
>Still thinking about this orig_dev stuff, I wonder why the protocol
>handlers used in bonding (bond_3ad_lacpdu_recv() and bond_arp_rcv()) are
>registered at the master level instead of at the slave level ?
>
>If they were registered at the slave level, they would simply receive
>skb->dev as the ingress interface and use this value instead of needing
>the orig_dev value given to them when they are registered at the master
>level.
>
>As orig_dev is only used by bonding and by af_packet, but they disagree on
>the exact meaning of orig_dev, one way to fix this discrepancy would be to
>remove one of the usage. As the af_packet usage is exposed to user space,
>bonding seems the right place to stop using orig_dev, even if orig_dev was
>introduced for bonding :-)
>
>I understand that this would add one entry per slave device to the
>ptype_base list, but this seems to be the only bad effect of registering
>at the slave level. Can you confirm that this was the reason to register
>at the master level instead?

	My recollection is that it was done the way it is because there
was no "orig_dev" delivery logic at the time.  A handler registered to a
slave dev would receive no packets at all because assignment of skb->dev
to the master happened first, and the "orig_dev" knowledge was lost.

	When 802.3ad was added, a skb->real_dev field was created, but
it wasn't used for delivery.  802.3ad used real_dev to figure out which
slave a LACPDU arrived on.  The skb->real_dev was eventually replaced
with the orig_dev business that's there now.

	Later, I did the arp_validate stuff the same way as 802.3ad
because it worked and was easier than registering a handler per slave.

>If you think registering at the slave level would cause too much impact on
>ptype_base, then we might have another way to stop using orig_dev for
>bonding:
>
>In __skb_bond_should_drop(), we already test for the two interesting protocols:
>
>if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && skb->protocol == __cpu_to_be16(ETH_P_ARP))
>	return 0;
>
>if (master->priv_flags & IFF_MASTER_8023AD && skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>	return 0;
>
>Would it be possible to call the right handlers directly from inside
>__skb_bond_should_drop() then let __skb_bond_should_drop() return 1
>("should drop") after processing the frames that are only of interest for
>bonding?

	Isn't one purpose of switching to rx_handler that there won't
need to be any skb_bond_should_drop logic in __netif_receive_skb at all?

	Still, if you're just trying to simplify __netif_receive_skb
first, I don't see any reason not to register the packet handlers at the
slave level.  Looking at the ptype_base hash, I don't think that the
protocols bonding is registering (ARP and SLOW) will hash collide with
IP or IPv6, so I suspect there won't be much impact.

	Once an rx_handler is used, then I suspect there's no need for
the packet handlers at all, since the rx_handler is within bonding and
can just deal with the ARP or LACPDU directly.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Feb. 27, 2011, 12:58 p.m. UTC | #7
Sat, Feb 26, 2011 at 08:42:57PM CET, fubar@us.ibm.com wrote:
>Nicolas de Pesloüan 	<nicolas.2p.debian@gmail.com> wrote:
>
>>Le 22/02/2011 00:20, Nicolas de Pesloüan a écrit :
>>
>>> After checking every protocol handlers installed by dev_add_pack(), it
>>> appears that only 4 of them really use the orig_dev parameter given by
>>> __netif_receive_skb():
>>>
>>> - bond_3ad_lacpdu_recv() @ drivers/net/bonding/bond_3ad.c
>>> - bond_arp_recv() @ drivers/net/bonding/bond_main.c
>>> - packet_rcv() @ net/packet/af_packet.c
>>> - tpacket_rcv() @ net/packet/af_packet.c
>>>
>>>  From the bonding point of view, the meaning of orig_dev is obviously
>>> "the device one layer below the bonding device, through which the packet
>>> reached the bonding device". It is used by bond_3ad_lacpdu_recv() and
>>> bond_arp_recv(), to find the underlying slave device through which the
>>> LACPDU or ARP was received. (The protocol handler is registered at the
>>> bonding device level).
>>>
>>>  From the af_packet point of view, the meaning is documented (in commit
>>> "[AF_PACKET]: Add option to return orig_dev to userspace") as the
>>> "physical device [that] actually received the traffic, instead of having
>>> the encapsulating device hide that information."
>>>
>>> When the bonding device is just one level above the physical device, the
>>> two meanings happen to match the same device, by chance.
>>>
>>> So, currently, a bonding device cannot stack properly on top of anything
>>> but physical devices. It might not be a problem today, but may change in
>>> the future...
>>
>>Hi Jay,
>>
>>Still thinking about this orig_dev stuff, I wonder why the protocol
>>handlers used in bonding (bond_3ad_lacpdu_recv() and bond_arp_rcv()) are
>>registered at the master level instead of at the slave level ?
>>
>>If they were registered at the slave level, they would simply receive
>>skb->dev as the ingress interface and use this value instead of needing
>>the orig_dev value given to them when they are registered at the master
>>level.
>>
>>As orig_dev is only used by bonding and by af_packet, but they disagree on
>>the exact meaning of orig_dev, one way to fix this discrepancy would be to
>>remove one of the usage. As the af_packet usage is exposed to user space,
>>bonding seems the right place to stop using orig_dev, even if orig_dev was
>>introduced for bonding :-)
>>
>>I understand that this would add one entry per slave device to the
>>ptype_base list, but this seems to be the only bad effect of registering
>>at the slave level. Can you confirm that this was the reason to register
>>at the master level instead?
>
>	My recollection is that it was done the way it is because there
>was no "orig_dev" delivery logic at the time.  A handler registered to a
>slave dev would receive no packets at all because assignment of skb->dev
>to the master happened first, and the "orig_dev" knowledge was lost.
>
>	When 802.3ad was added, a skb->real_dev field was created, but
>it wasn't used for delivery.  802.3ad used real_dev to figure out which
>slave a LACPDU arrived on.  The skb->real_dev was eventually replaced
>with the orig_dev business that's there now.
>
>	Later, I did the arp_validate stuff the same way as 802.3ad
>because it worked and was easier than registering a handler per slave.
>
>>If you think registering at the slave level would cause too much impact on
>>ptype_base, then we might have another way to stop using orig_dev for
>>bonding:
>>
>>In __skb_bond_should_drop(), we already test for the two interesting protocols:
>>
>>if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && skb->protocol == __cpu_to_be16(ETH_P_ARP))
>>	return 0;
>>
>>if (master->priv_flags & IFF_MASTER_8023AD && skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>>	return 0;
>>
>>Would it be possible to call the right handlers directly from inside
>>__skb_bond_should_drop() then let __skb_bond_should_drop() return 1
>>("should drop") after processing the frames that are only of interest for
>>bonding?
>
>	Isn't one purpose of switching to rx_handler that there won't
>need to be any skb_bond_should_drop logic in __netif_receive_skb at all?

Yes, that (hopefully most)  would be eventually removed.

>
>	Still, if you're just trying to simplify __netif_receive_skb
>first, I don't see any reason not to register the packet handlers at the
>slave level.  Looking at the ptype_base hash, I don't think that the
>protocols bonding is registering (ARP and SLOW) will hash collide with
>IP or IPv6, so I suspect there won't be much impact.
>
>	Once an rx_handler is used, then I suspect there's no need for
>the packet handlers at all, since the rx_handler is within bonding and
>can just deal with the ARP or LACPDU directly.

That is very true. And given that af_packet uses orig_dev to obtain
ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
orig_dev parameter for good.

So I suggest to take V3 of my patch now and do multiple follow-on
patches to get us where we want to get.

Thanks

>
>	-J
>
>---
>	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas de Pesloüan Feb. 27, 2011, 8:44 p.m. UTC | #8
Le 27/02/2011 13:58, Jiri Pirko a écrit :
> Sat, Feb 26, 2011 at 08:42:57PM CET, fubar@us.ibm.com wrote:
>> Nicolas de Pesloüan 	<nicolas.2p.debian@gmail.com>  wrote:
>>> Hi Jay,
>>>
>>> Still thinking about this orig_dev stuff, I wonder why the protocol
>>> handlers used in bonding (bond_3ad_lacpdu_recv() and bond_arp_rcv()) are
>>> registered at the master level instead of at the slave level ?
>>>
>>> If they were registered at the slave level, they would simply receive
>>> skb->dev as the ingress interface and use this value instead of needing
>>> the orig_dev value given to them when they are registered at the master
>>> level.
>>>
>>> As orig_dev is only used by bonding and by af_packet, but they disagree on
>>> the exact meaning of orig_dev, one way to fix this discrepancy would be to
>>> remove one of the usage. As the af_packet usage is exposed to user space,
>>> bonding seems the right place to stop using orig_dev, even if orig_dev was
>>> introduced for bonding :-)
>>>
>>> I understand that this would add one entry per slave device to the
>>> ptype_base list, but this seems to be the only bad effect of registering
>>> at the slave level. Can you confirm that this was the reason to register
>>> at the master level instead?
>>
>> 	My recollection is that it was done the way it is because there
>> was no "orig_dev" delivery logic at the time.  A handler registered to a
>> slave dev would receive no packets at all because assignment of skb->dev
>> to the master happened first, and the "orig_dev" knowledge was lost.
>>
>> 	When 802.3ad was added, a skb->real_dev field was created, but
>> it wasn't used for delivery.  802.3ad used real_dev to figure out which
>> slave a LACPDU arrived on.  The skb->real_dev was eventually replaced
>> with the orig_dev business that's there now.
>>
>> 	Later, I did the arp_validate stuff the same way as 802.3ad
>> because it worked and was easier than registering a handler per slave.
>>
>>> If you think registering at the slave level would cause too much impact on
>>> ptype_base, then we might have another way to stop using orig_dev for
>>> bonding:
>>>
>>> In __skb_bond_should_drop(), we already test for the two interesting protocols:
>>>
>>> if ((dev->priv_flags&  IFF_SLAVE_NEEDARP)&&  skb->protocol == __cpu_to_be16(ETH_P_ARP))
>>> 	return 0;
>>>
>>> if (master->priv_flags&  IFF_MASTER_8023AD&&  skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>>> 	return 0;
>>>
>>> Would it be possible to call the right handlers directly from inside
>>> __skb_bond_should_drop() then let __skb_bond_should_drop() return 1
>>> ("should drop") after processing the frames that are only of interest for
>>> bonding?
>>
>> 	Isn't one purpose of switching to rx_handler that there won't
>> need to be any skb_bond_should_drop logic in __netif_receive_skb at all?
>
> Yes, that (hopefully most)  would be eventually removed.

The skb_bond_should_drop logic was simply moved from dev.c to 
bond_should_deliver_exact_match@bond_main.c by Jiri's patch.

But the logic remain and is necessary to decide whether we do normal delivery or only exact match 
delivery.

>> 	Still, if you're just trying to simplify __netif_receive_skb
>> first, I don't see any reason not to register the packet handlers at the
>> slave level.  Looking at the ptype_base hash, I don't think that the
>> protocols bonding is registering (ARP and SLOW) will hash collide with
>> IP or IPv6, so I suspect there won't be much impact.
>>
>> 	Once an rx_handler is used, then I suspect there's no need for
>> the packet handlers at all, since the rx_handler is within bonding and
>> can just deal with the ARP or LACPDU directly.
>
> That is very true. And given that af_packet uses orig_dev to obtain
> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
> orig_dev parameter for good.

Unfortunately, after doing some more research, I'm afraid we won't be able to suppress at least the 
ARP packet handler:

In commit 1f3c8804acba841b5573b953f5560d2683d2db0d (bonding: allow arp_ip_targets on separate vlans 
to use arp validation), Andy solved the problem of vlan on top of bonding, when the arp_ip_target is 
on one of the vlans:

eth0/eth1 -> bond0 -> bond0.100

At the time the frame is inspected by bonding, the frame is still tagged. This is true for the new 
rx_handler proposed by Jiri, and is also true for the former __skb_bond_should_drop() handling).

To receive the untagged frame, we would have to wait until the vlan code remove the tag. The current 
protocol handler seems to be the best way to catch the frame that late.

This is probably specific to ARP. I don't think SLOW frames can be tagged.

Anyway, Jay, thanks for you clarification.

> So I suggest to take V3 of my patch now and do multiple follow-on
> patches to get us where we want to get.

Agreed.

	Nicolas.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Feb. 27, 2011, 11:22 p.m. UTC | #9
From: Jiri Pirko <jpirko@redhat.com>
Date: Sun, 27 Feb 2011 13:58:17 +0100

> That is very true. And given that af_packet uses orig_dev to obtain
> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
> orig_dev parameter for good.

I would rather see a complete patch set submitting at a unit, thanks.

I've already marked your V3 last night as "changes requested" in
patchwork for this reason.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Feb. 28, 2011, 7:07 a.m. UTC | #10
Mon, Feb 28, 2011 at 12:22:08AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Sun, 27 Feb 2011 13:58:17 +0100
>
>> That is very true. And given that af_packet uses orig_dev to obtain
>> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
>> orig_dev parameter for good.
>
>I would rather see a complete patch set submitting at a unit, thanks.
>
>I've already marked your V3 last night as "changes requested" in
>patchwork for this reason.

That's a pity. V3 is complete patch. The changes we talk about in
discussion are just taking advantage of changes in patch V3. Would be in
my opinion better to apply V3 now and followup with the rest after that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Feb. 28, 2011, 7:30 a.m. UTC | #11
From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 28 Feb 2011 08:07:33 +0100

> Mon, Feb 28, 2011 at 12:22:08AM CET, davem@davemloft.net wrote:
>>From: Jiri Pirko <jpirko@redhat.com>
>>Date: Sun, 27 Feb 2011 13:58:17 +0100
>>
>>> That is very true. And given that af_packet uses orig_dev to obtain
>>> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
>>> orig_dev parameter for good.
>>
>>I would rather see a complete patch set submitting at a unit, thanks.
>>
>>I've already marked your V3 last night as "changes requested" in
>>patchwork for this reason.
> 
> That's a pity. V3 is complete patch. The changes we talk about in
> discussion are just taking advantage of changes in patch V3. Would be in
> my opinion better to apply V3 now and followup with the rest after that.

Fair enough, I'll aply it now, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Feb. 28, 2011, 9:22 a.m. UTC | #12
Mon, Feb 28, 2011 at 08:30:13AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Mon, 28 Feb 2011 08:07:33 +0100
>
>> Mon, Feb 28, 2011 at 12:22:08AM CET, davem@davemloft.net wrote:
>>>From: Jiri Pirko <jpirko@redhat.com>
>>>Date: Sun, 27 Feb 2011 13:58:17 +0100
>>>
>>>> That is very true. And given that af_packet uses orig_dev to obtain
>>>> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
>>>> orig_dev parameter for good.
>>>
>>>I would rather see a complete patch set submitting at a unit, thanks.
>>>
>>>I've already marked your V3 last night as "changes requested" in
>>>patchwork for this reason.
>> 
>> That's a pity. V3 is complete patch. The changes we talk about in
>> discussion are just taking advantage of changes in patch V3. Would be in
>> my opinion better to apply V3 now and followup with the rest after that.
>
>Fair enough, I'll aply it now, thanks.

Applied incorrectly. net/core/dev.c part is missing
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Feb. 28, 2011, 9:35 a.m. UTC | #13
Le lundi 28 février 2011 à 10:22 +0100, Jiri Pirko a écrit :

> Applied incorrectly. net/core/dev.c part is missing

Hmm, could you please provide a 2nd patch with this part ?

Thanks
 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Feb. 28, 2011, 6:49 p.m. UTC | #14
From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 28 Feb 2011 10:22:24 +0100

> Applied incorrectly. net/core/dev.c part is missing

Fixed, sorry.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/core/dev.c b/net/core/dev.c
index e5dba47..7e007a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3117,7 +3117,6 @@  static int __netif_receive_skb(struct sk_buff *skb)
  	rx_handler_func_t *rx_handler;
  	struct net_device *dev;
  	struct net_device *orig_dev;
-	struct net_device *null_or_dev;
  	int ret = NET_RX_DROP;
  	__be16 type;

@@ -3130,9 +3129,6 @@  static int __netif_receive_skb(struct sk_buff *skb)
  	if (netpoll_receive_skb(skb))
  		return NET_RX_DROP;

-	skb->skb_iif = skb->dev->ifindex;
-	orig_dev = skb->dev;
-
  	skb_reset_network_header(skb);
  	skb_reset_transport_header(skb);
  	skb->mac_len = skb->network_header - skb->mac_header;
@@ -3143,6 +3139,8 @@  static int __netif_receive_skb(struct sk_buff *skb)

  another_round:
  	__this_cpu_inc(softnet_data.processed);
+	skb->skb_iif = skb->dev->ifindex;
+	orig_dev = skb->dev;
  	dev = skb->dev;

  #ifdef CONFIG_NET_CLS_ACT
@@ -3152,8 +3150,13 @@  another_round:
  	}
  #endif

+	/*
+	 * Deliver to ptype_all protocol handlers that match current dev.
+	 * This happens before rx_handler is given a chance to change skb->dev.
+	 */
+
  	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (!ptype->dev || ptype->dev == dev) {
+		if (ptype->dev == dev) {
  			if (pt_prev)
  				ret = deliver_skb(skb, pt_prev, orig_dev);
  			pt_prev = ptype;
@@ -3167,6 +3170,31 @@  another_round:
  ncls:
  #endif

+	/*
+	 * Deliver to ptype_base protocol handlers that match current dev.
+	 * This happens before rx_handler is given a chance to change skb->dev.
+	 */
+
+	type = skb->protocol;
+	list_for_each_entry_rcu(ptype,
+			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
+		if (ptype->type == type && ptype->dev == skb->dev) {
+			if (pt_prev)
+				ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = ptype;
+		}
+	}
+
+	/*
+	 * Call rx_handler for current device.
+	 * If rx_handler return NULL, skip wilcard protocol handler delivery.
+	 * Else, if skb->dev changed, restart the whole delivery process, to
+	 * allow for device nesting.
+	 *
+	 * Warning:
+	 * rx_handlers must kfree_skb(skb) if they return NULL.
+	 */
+
  	rx_handler = rcu_dereference(dev->rx_handler);
  	if (rx_handler) {
  		if (pt_prev) {
@@ -3176,10 +3204,15 @@  ncls:
  		skb = rx_handler(skb);
  		if (!skb)
  			goto out;
-		if (dev != skb->dev)
+		if (skb->dev != dev)
  			goto another_round;
  	}

+	/*
+	 * FIXME: The part below should use rx_handler instead of being hard
+	 * coded here.
+	 */
+
  	if (vlan_tx_tag_present(skb)) {
  		if (pt_prev) {
  			ret = deliver_skb(skb, pt_prev, orig_dev);
@@ -3192,16 +3225,33 @@  ncls:
  			goto out;
  	}

+	/*
+	 * FIXME: Can't this be moved into the rx_handler for bonding,
+	 * or into a futur rx_handler for vlan?
+	 */
+
  	vlan_on_bond_hook(skb);

-	/* deliver only exact match when indicated */
-	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
+	/*
+	 * Deliver to wildcard ptype_all protocol handlers.
+	 */
+
+	list_for_each_entry_rcu(ptype, &ptype_all, list) {
+		if (!ptype->dev) {
+			if (pt_prev)
+				ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = ptype;
+		}
+	}
+
+	/*
+	 * Deliver to wildcard ptype_all protocol handlers.
+	 */

  	type = skb->protocol;
  	list_for_each_entry_rcu(ptype,
  			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type &&
-		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
+		if (ptype->type == type && !ptype->dev) {
  			if (pt_prev)
  				ret = deliver_skb(skb, pt_prev, orig_dev);
  			pt_prev = ptype;