diff mbox

[v2,net] bonding: Fix stacked device detection in arp monitoring

Message ID 1399470461-22213-1-git-send-email-vyasevic@redhat.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Vlad Yasevich May 7, 2014, 1:47 p.m. UTC
Prior to commit fbd929f2dce460456807a51e18d623db3db9f077
	bonding: support QinQ for bond arp interval

the arp monitoring code allowed for proper detection of devices
stacked on top of vlans.  Since the above commit, the
code can still detect a device stacked on top of single
vlan, but not a device stacked on top of Q-in-Q configuration.
The search will only set the inner vlan tag if the route
device is the vlan device.  However, this is not always the
case, as it is possible to extend the stacked configuration.

With this patch it is possible to provision devices on
top Q-in-Q vlan configuration that should be used as
a source of ARP monitoring information.

For example:
ip link add link bond0 vlan10 type vlan proto 802.1q id 10
ip link add link vlan10 vlan100 type vlan proto 802.1q id 100
ip link add link vlan100 type macvlan

Note:  This patch limites the number of stacked VLANs to 2,
just like before.  The original, however had another issue
in that if we had more then 2 levels of VLANs, we would end
up generating incorrectly tagged traffic.  This is no longer
possible.

Fixes: fbd929f2dce460456807a51e18d623db3db9f077 (bonding: support QinQ for bond arp interval)
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@redhat.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: Ding Tianhong <dingtianhong@huawei.com>
CC: Patric McHardy <kaber@trash.net>
Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
---
v2->v1:
* Changed the function name to better describe what the function is doing.
  We are not just finding the stack of vlan devices, we are also verifything
  the path between the bonding device and the route output device.
* Added some more commenets about what the function is doing.
* Fixed an issue with multiple peer vlans.
* Removed all occurances of 'inner' and 'outer' and replaced it with tag
  array.

 drivers/net/bonding/bond_main.c | 114 ++++++++++++++++++----------------------
 1 file changed, 52 insertions(+), 62 deletions(-)

Comments

Jay Vosburgh May 7, 2014, 4:43 p.m. UTC | #1
Vlad Yasevich <vyasevic@redhat.com> wrote:

>Prior to commit fbd929f2dce460456807a51e18d623db3db9f077
>	bonding: support QinQ for bond arp interval
>
>the arp monitoring code allowed for proper detection of devices
>stacked on top of vlans.  Since the above commit, the
>code can still detect a device stacked on top of single
>vlan, but not a device stacked on top of Q-in-Q configuration.
>The search will only set the inner vlan tag if the route
>device is the vlan device.  However, this is not always the
>case, as it is possible to extend the stacked configuration.
>
>With this patch it is possible to provision devices on
>top Q-in-Q vlan configuration that should be used as
>a source of ARP monitoring information.
>
>For example:
>ip link add link bond0 vlan10 type vlan proto 802.1q id 10
>ip link add link vlan10 vlan100 type vlan proto 802.1q id 100
>ip link add link vlan100 type macvlan
>
>Note:  This patch limites the number of stacked VLANs to 2,
>just like before.  The original, however had another issue
>in that if we had more then 2 levels of VLANs, we would end
>up generating incorrectly tagged traffic.  This is no longer
>possible.
>
>Fixes: fbd929f2dce460456807a51e18d623db3db9f077 (bonding: support QinQ for bond arp interval)
>CC: Jay Vosburgh <j.vosburgh@gmail.com>
>CC: Veaceslav Falico <vfalico@redhat.com>
>CC: Andy Gospodarek <andy@greyhouse.net>
>CC: Ding Tianhong <dingtianhong@huawei.com>
>CC: Patric McHardy <kaber@trash.net>
>Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
>---
>v2->v1:
>* Changed the function name to better describe what the function is doing.
>  We are not just finding the stack of vlan devices, we are also verifything
>  the path between the bonding device and the route output device.
>* Added some more commenets about what the function is doing.
>* Fixed an issue with multiple peer vlans.
>* Removed all occurances of 'inner' and 'outer' and replaced it with tag
>  array.

	I think you may have misunderstood my prior comment; I meant
that I liked the "inner" and "outer" names better than "tag[0]" and
"tag[1]".

	I did notice that the inner and outer parameters could be
removed from bond_arp_send as well, but, again, I found the "inner" and
"outer" names more descriptive than tag[0] or tag[1]; perhaps a #define
for the magic numbers (0 = "outer", 1 = "inner" and 2 = "max nesting"),
or at least a comment that says straight up "tag[0] is the outer tag,
tag[1] is the inner tag (if there are two tags)" is in order.

> drivers/net/bonding/bond_main.c | 114 ++++++++++++++++++----------------------
> 1 file changed, 52 insertions(+), 62 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 9d08e00..f592f96 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2126,8 +2126,7 @@ static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
>  */
> static void bond_arp_send(struct net_device *slave_dev, int arp_op,
> 			  __be32 dest_ip, __be32 src_ip,
>-			  struct bond_vlan_tag *inner,
>-			  struct bond_vlan_tag *outer)
>+			  struct bond_vlan_tag *tags)
> {
> 	struct sk_buff *skb;
> 
>@@ -2141,12 +2140,12 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op,
> 		net_err_ratelimited("ARP packet allocation failed\n");
> 		return;
> 	}
>-	if (outer->vlan_id) {
>-		if (inner->vlan_id) {
>+	if (tags[0].vlan_id) {
>+		if (tags[1].vlan_id) {
> 			pr_debug("inner tag: proto %X vid %X\n",
>-				 ntohs(inner->vlan_proto), inner->vlan_id);
>-			skb = __vlan_put_tag(skb, inner->vlan_proto,
>-					     inner->vlan_id);
>+				 ntohs(tags[1].vlan_proto), tags[1].vlan_id);
>+			skb = __vlan_put_tag(skb, tags[1].vlan_proto,
>+					     tags[1].vlan_id);
> 			if (!skb) {
> 				net_err_ratelimited("failed to insert inner VLAN tag\n");
> 				return;
>@@ -2154,8 +2153,8 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op,
> 		}
> 
> 		pr_debug("outer reg: proto %X vid %X\n",
>-			 ntohs(outer->vlan_proto), outer->vlan_id);
>-		skb = vlan_put_tag(skb, outer->vlan_proto, outer->vlan_id);
>+			 ntohs(tags[0].vlan_proto), tags[0].vlan_id);
>+		skb = vlan_put_tag(skb, tags[0].vlan_proto, tags[0].vlan_id);
> 		if (!skb) {
> 			net_err_ratelimited("failed to insert outer VLAN tag\n");
> 			return;
>@@ -2164,22 +2163,52 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op,
> 	arp_xmit(skb);
> }
> 
>+/* Check to make sure that @end device is stacked on top of the @start
>+ * device.  Invofrmation about any intervening vlans are saved into
>+ * the @tag array.  @idx parametet specifies how many vlans deep we are
>+ * are currently looking. We currently only support 2 levels of vlan stacking.
>+ * Return true if we have a valid stacking configuration.  Otherwise false.
>+ */

	Spelling nits: "Information" and "parameter".

>+static bool bond_check_path(struct net_device *start, struct net_device *end,
>+			    struct bond_vlan_tag *tag, int idx)
>+{
>+	struct net_device *upper;
>+	struct list_head  *iter;
>+
>+	/* We do not support more then 2 levels of VLAN nesting */
>+	if (idx >= 2)
>+		return false;
>+
>+	netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
>+		if (is_vlan_dev(upper)) {
>+			tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
>+			tag[idx].vlan_id = vlan_dev_vlan_id(upper);
>+		}
>+		if (upper == end)
>+			return true;
>+
>+		/* Look at the devices list  of 'upper' only if it is a
>+		 * vlan device.
>+		 */
>+		if (is_vlan_dev(upper) &&
>+		    bond_check_path(upper, end, tag, idx+1))
>+			return true;

	This may or may not be a realistic configuration, but will this
function traverse correctly if there is some other device type between
the two vlans?  E.g., eth0 -> bond0 -> vlan100 -> bridge -> vlan200,
where "vlan200" is the "end" device holding the IP address from the
route lookup.  It need not be a bridge in there, but I think this would
be a legal configuration.

	-J

>+	}
>+	return false;
>+}
>+
> 
> static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
> {
>-	struct net_device *upper, *vlan_upper;
>-	struct list_head *iter, *vlan_iter;
> 	struct rtable *rt;
>-	struct bond_vlan_tag inner, outer;
>+	struct bond_vlan_tag tags[2];
> 	__be32 *targets = bond->params.arp_targets, addr;
> 	int i;
>+	bool ret;
> 
> 	for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
> 		pr_debug("basa: target %pI4\n", &targets[i]);
>-		inner.vlan_proto = 0;
>-		inner.vlan_id = 0;
>-		outer.vlan_proto = 0;
>-		outer.vlan_id = 0;
>+		memset(tags, 0, sizeof(tags));
> 
> 		/* Find out through which dev should the packet go */
> 		rt = ip_route_output(dev_net(bond->dev), targets[i], 0,
>@@ -2192,7 +2221,8 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
> 				net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
> 						     bond->dev->name,
> 						     &targets[i]);
>-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], 0, &inner, &outer);
>+			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>+				      0, tags);
> 			continue;
> 		}
> 
>@@ -2201,52 +2231,12 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
> 			goto found;
> 
> 		rcu_read_lock();
>-		/* first we search only for vlan devices. for every vlan
>-		 * found we verify its upper dev list, searching for the
>-		 * rt->dst.dev. If found we save the tag of the vlan and
>-		 * proceed to send the packet.
>-		 */
>-		netdev_for_each_all_upper_dev_rcu(bond->dev, vlan_upper,
>-						  vlan_iter) {
>-			if (!is_vlan_dev(vlan_upper))
>-				continue;
>-
>-			if (vlan_upper == rt->dst.dev) {
>-				outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper);
>-				outer.vlan_id = vlan_dev_vlan_id(vlan_upper);
>-				rcu_read_unlock();
>-				goto found;
>-			}
>-			netdev_for_each_all_upper_dev_rcu(vlan_upper, upper,
>-							  iter) {
>-				if (upper == rt->dst.dev) {
>-					/* If the upper dev is a vlan dev too,
>-					 *  set the vlan tag to inner tag.
>-					 */
>-					if (is_vlan_dev(upper)) {
>-						inner.vlan_proto = vlan_dev_vlan_proto(upper);
>-						inner.vlan_id = vlan_dev_vlan_id(upper);
>-					}
>-					outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper);
>-					outer.vlan_id = vlan_dev_vlan_id(vlan_upper);
>-					rcu_read_unlock();
>-					goto found;
>-				}
>-			}
>-		}
>-
>-		/* if the device we're looking for is not on top of any of
>-		 * our upper vlans, then just search for any dev that
>-		 * matches, and in case it's a vlan - save the id
>-		 */
>-		netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) {
>-			if (upper == rt->dst.dev) {
>-				rcu_read_unlock();
>-				goto found;
>-			}
>-		}
>+		ret = bond_check_path(bond->dev, rt->dst.dev, tags, 0);
> 		rcu_read_unlock();
> 
>+		if (ret)
>+			goto found;
>+
> 		/* Not our device - skip */
> 		pr_debug("%s: no path to arp_ip_target %pI4 via rt.dev %s\n",
> 			 bond->dev->name, &targets[i],
>@@ -2259,7 +2249,7 @@ found:
> 		addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
> 		ip_rt_put(rt);
> 		bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>-			      addr, &inner, &outer);
>+			      addr, tags);
> 	}
> }
> 
>-- 
>1.9.0
>

---
	-Jay Vosburgh, jay.vosburgh@canonical.com
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vlad Yasevich May 7, 2014, 5:08 p.m. UTC | #2
On 05/07/2014 12:43 PM, Jay Vosburgh wrote:
> Vlad Yasevich <vyasevic@redhat.com> wrote:
> 
>> Prior to commit fbd929f2dce460456807a51e18d623db3db9f077
>> 	bonding: support QinQ for bond arp interval
>>
>> the arp monitoring code allowed for proper detection of devices
>> stacked on top of vlans.  Since the above commit, the
>> code can still detect a device stacked on top of single
>> vlan, but not a device stacked on top of Q-in-Q configuration.
>> The search will only set the inner vlan tag if the route
>> device is the vlan device.  However, this is not always the
>> case, as it is possible to extend the stacked configuration.
>>
>> With this patch it is possible to provision devices on
>> top Q-in-Q vlan configuration that should be used as
>> a source of ARP monitoring information.
>>
>> For example:
>> ip link add link bond0 vlan10 type vlan proto 802.1q id 10
>> ip link add link vlan10 vlan100 type vlan proto 802.1q id 100
>> ip link add link vlan100 type macvlan
>>
>> Note:  This patch limites the number of stacked VLANs to 2,
>> just like before.  The original, however had another issue
>> in that if we had more then 2 levels of VLANs, we would end
>> up generating incorrectly tagged traffic.  This is no longer
>> possible.
>>
>> Fixes: fbd929f2dce460456807a51e18d623db3db9f077 (bonding: support QinQ for bond arp interval)
>> CC: Jay Vosburgh <j.vosburgh@gmail.com>
>> CC: Veaceslav Falico <vfalico@redhat.com>
>> CC: Andy Gospodarek <andy@greyhouse.net>
>> CC: Ding Tianhong <dingtianhong@huawei.com>
>> CC: Patric McHardy <kaber@trash.net>
>> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
>> ---
>> v2->v1:
>> * Changed the function name to better describe what the function is doing.
>>  We are not just finding the stack of vlan devices, we are also verifything
>>  the path between the bonding device and the route output device.
>> * Added some more commenets about what the function is doing.
>> * Fixed an issue with multiple peer vlans.
>> * Removed all occurances of 'inner' and 'outer' and replaced it with tag
>>  array.
> 
> 	I think you may have misunderstood my prior comment; I meant
> that I liked the "inner" and "outer" names better than "tag[0]" and
> "tag[1]".

Oh, sorry.  I misunderstood.  I can certainly maintain inner/outer
connotation, but it seem a bit silly.  We have a stack of vlan devices
so we'll apply them as as stack as well.

I actually debated about making it generic and not limiting us to a
max depth of 2 as there is nothing in the vlan implementation that
limits the user from configuring a stack of more then 2 deep.

Even 802.1ad spec doesn't explicitly limit the number of vlan headers to
2, and once you do that, the concept of inner/outer goes away.

> 
> 	I did notice that the inner and outer parameters could be
> removed from bond_arp_send as well, but, again, I found the "inner" and
> "outer" names more descriptive than tag[0] or tag[1]; perhaps a #define
> for the magic numbers (0 = "outer", 1 = "inner" and 2 = "max nesting"),
> or at least a comment that says straight up "tag[0] is the outer tag,
> tag[1] is the inner tag (if there are two tags)" is in order.
> 
>> drivers/net/bonding/bond_main.c | 114 ++++++++++++++++++----------------------
>> 1 file changed, 52 insertions(+), 62 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>> index 9d08e00..f592f96 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -2126,8 +2126,7 @@ static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
>>  */
>> static void bond_arp_send(struct net_device *slave_dev, int arp_op,
>> 			  __be32 dest_ip, __be32 src_ip,
>> -			  struct bond_vlan_tag *inner,
>> -			  struct bond_vlan_tag *outer)
>> +			  struct bond_vlan_tag *tags)
>> {
>> 	struct sk_buff *skb;
>>
>> @@ -2141,12 +2140,12 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op,
>> 		net_err_ratelimited("ARP packet allocation failed\n");
>> 		return;
>> 	}
>> -	if (outer->vlan_id) {
>> -		if (inner->vlan_id) {
>> +	if (tags[0].vlan_id) {
>> +		if (tags[1].vlan_id) {
>> 			pr_debug("inner tag: proto %X vid %X\n",
>> -				 ntohs(inner->vlan_proto), inner->vlan_id);
>> -			skb = __vlan_put_tag(skb, inner->vlan_proto,
>> -					     inner->vlan_id);
>> +				 ntohs(tags[1].vlan_proto), tags[1].vlan_id);
>> +			skb = __vlan_put_tag(skb, tags[1].vlan_proto,
>> +					     tags[1].vlan_id);
>> 			if (!skb) {
>> 				net_err_ratelimited("failed to insert inner VLAN tag\n");
>> 				return;
>> @@ -2154,8 +2153,8 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op,
>> 		}
>>
>> 		pr_debug("outer reg: proto %X vid %X\n",
>> -			 ntohs(outer->vlan_proto), outer->vlan_id);
>> -		skb = vlan_put_tag(skb, outer->vlan_proto, outer->vlan_id);
>> +			 ntohs(tags[0].vlan_proto), tags[0].vlan_id);
>> +		skb = vlan_put_tag(skb, tags[0].vlan_proto, tags[0].vlan_id);
>> 		if (!skb) {
>> 			net_err_ratelimited("failed to insert outer VLAN tag\n");
>> 			return;
>> @@ -2164,22 +2163,52 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op,
>> 	arp_xmit(skb);
>> }
>>
>> +/* Check to make sure that @end device is stacked on top of the @start
>> + * device.  Invofrmation about any intervening vlans are saved into
>> + * the @tag array.  @idx parametet specifies how many vlans deep we are
>> + * are currently looking. We currently only support 2 levels of vlan stacking.
>> + * Return true if we have a valid stacking configuration.  Otherwise false.
>> + */
> 
> 	Spelling nits: "Information" and "parameter".
> 
>> +static bool bond_check_path(struct net_device *start, struct net_device *end,
>> +			    struct bond_vlan_tag *tag, int idx)
>> +{
>> +	struct net_device *upper;
>> +	struct list_head  *iter;
>> +
>> +	/* We do not support more then 2 levels of VLAN nesting */
>> +	if (idx >= 2)
>> +		return false;
>> +
>> +	netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
>> +		if (is_vlan_dev(upper)) {
>> +			tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
>> +			tag[idx].vlan_id = vlan_dev_vlan_id(upper);
>> +		}
>> +		if (upper == end)
>> +			return true;
>> +
>> +		/* Look at the devices list  of 'upper' only if it is a
>> +		 * vlan device.
>> +		 */
>> +		if (is_vlan_dev(upper) &&
>> +		    bond_check_path(upper, end, tag, idx+1))
>> +			return true;
> 
> 	This may or may not be a realistic configuration, but will this
> function traverse correctly if there is some other device type between
> the two vlans?  E.g., eth0 -> bond0 -> vlan100 -> bridge -> vlan200,
> where "vlan200" is the "end" device holding the IP address from the
> route lookup.  It need not be a bridge in there, but I think this would
> be a legal configuration.

Yes.  I verified that it works.  The reason is that we are traversing
the all_adj_list.upper list which contains all of the upper devices at
each level.  So, at vlan100 level, we will see vlan200 and all will be
well.

-vlad

> 
> 	-J
> 
>> +	}
>> +	return false;
>> +}
>> +
>>
>> static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
>> {
>> -	struct net_device *upper, *vlan_upper;
>> -	struct list_head *iter, *vlan_iter;
>> 	struct rtable *rt;
>> -	struct bond_vlan_tag inner, outer;
>> +	struct bond_vlan_tag tags[2];
>> 	__be32 *targets = bond->params.arp_targets, addr;
>> 	int i;
>> +	bool ret;
>>
>> 	for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
>> 		pr_debug("basa: target %pI4\n", &targets[i]);
>> -		inner.vlan_proto = 0;
>> -		inner.vlan_id = 0;
>> -		outer.vlan_proto = 0;
>> -		outer.vlan_id = 0;
>> +		memset(tags, 0, sizeof(tags));
>>
>> 		/* Find out through which dev should the packet go */
>> 		rt = ip_route_output(dev_net(bond->dev), targets[i], 0,
>> @@ -2192,7 +2221,8 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
>> 				net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
>> 						     bond->dev->name,
>> 						     &targets[i]);
>> -			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], 0, &inner, &outer);
>> +			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>> +				      0, tags);
>> 			continue;
>> 		}
>>
>> @@ -2201,52 +2231,12 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
>> 			goto found;
>>
>> 		rcu_read_lock();
>> -		/* first we search only for vlan devices. for every vlan
>> -		 * found we verify its upper dev list, searching for the
>> -		 * rt->dst.dev. If found we save the tag of the vlan and
>> -		 * proceed to send the packet.
>> -		 */
>> -		netdev_for_each_all_upper_dev_rcu(bond->dev, vlan_upper,
>> -						  vlan_iter) {
>> -			if (!is_vlan_dev(vlan_upper))
>> -				continue;
>> -
>> -			if (vlan_upper == rt->dst.dev) {
>> -				outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper);
>> -				outer.vlan_id = vlan_dev_vlan_id(vlan_upper);
>> -				rcu_read_unlock();
>> -				goto found;
>> -			}
>> -			netdev_for_each_all_upper_dev_rcu(vlan_upper, upper,
>> -							  iter) {
>> -				if (upper == rt->dst.dev) {
>> -					/* If the upper dev is a vlan dev too,
>> -					 *  set the vlan tag to inner tag.
>> -					 */
>> -					if (is_vlan_dev(upper)) {
>> -						inner.vlan_proto = vlan_dev_vlan_proto(upper);
>> -						inner.vlan_id = vlan_dev_vlan_id(upper);
>> -					}
>> -					outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper);
>> -					outer.vlan_id = vlan_dev_vlan_id(vlan_upper);
>> -					rcu_read_unlock();
>> -					goto found;
>> -				}
>> -			}
>> -		}
>> -
>> -		/* if the device we're looking for is not on top of any of
>> -		 * our upper vlans, then just search for any dev that
>> -		 * matches, and in case it's a vlan - save the id
>> -		 */
>> -		netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) {
>> -			if (upper == rt->dst.dev) {
>> -				rcu_read_unlock();
>> -				goto found;
>> -			}
>> -		}
>> +		ret = bond_check_path(bond->dev, rt->dst.dev, tags, 0);
>> 		rcu_read_unlock();
>>
>> +		if (ret)
>> +			goto found;
>> +
>> 		/* Not our device - skip */
>> 		pr_debug("%s: no path to arp_ip_target %pI4 via rt.dev %s\n",
>> 			 bond->dev->name, &targets[i],
>> @@ -2259,7 +2249,7 @@ found:
>> 		addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
>> 		ip_rt_put(rt);
>> 		bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>> -			      addr, &inner, &outer);
>> +			      addr, tags);
>> 	}
>> }
>>
>> -- 
>> 1.9.0
>>
> 
> ---
> 	-Jay Vosburgh, jay.vosburgh@canonical.com
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Veaceslav Falico May 7, 2014, 5:49 p.m. UTC | #3
On Wed, May 07, 2014 at 01:08:09PM -0400, Vlad Yasevich wrote:
...snip...
>Yes.  I verified that it works.  The reason is that we are traversing
>the all_adj_list.upper list which contains all of the upper devices at
>each level.  So, at vlan100 level, we will see vlan200 and all will be
>well.

Hrm, two scenarios, with the following config:

bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP

end == whatever3_IP

1) IIRC there are no guarantees on order of all_upper list, so, if
whatever3_IP dev is the first in the list - bond_check_patch() will return
true right away.

I might be wrong, though, it's 8PM and my brain farts when trying to look
at that code.

That's fixable (from first sight) by introducing a variable upper_found:

+       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
...
+               if (upper == end)
+                       upper_found = true;
...
+       }
+       return upper_found;

This way it will first try to go through all nested vlans and, if none
found, will return true. Basically, "return upper_found (=true)" has the
meaning that upper was found and there are no vlans in between.

The "wrong" order might be achieved by creating a bridge for whatever2,
creating and linking vlan2 and whatever3_IP, and only *after* that adding
vlan1 as a port to bridge whatever2.

2) (with the fix from #1 applied)

bond_check_path start==bond0 idx=0
finds vlan1, tag[0] set, recursion start==vlan1 idx=1
bond_check_path start==vlan1 idx=1
finds vlan2, tag[1] set, recursion start==vlan2 idx=2
returns right away with false as idx >= 2.

That's wrong as there might be vlan3 on top of whatever2, and tag[1] might
be set to it, whilst vlan3 has nothing to do with whatever3_IP.

The fix here would be to halt on idx > 2, or, rather, to NOT use
recursion/vlan checks if idx == 2, thus leaving us with only upper_found
logic.

So, the end patch (not compiled, not tested...) would look something like
(only the bond_check_path() is changed and copied here, everything else
remains the same):

+       bool upper_found = false;
+
+       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
+               if (upper == end)
+                       upper_found = true;
+
+               if (idx < 2 && is_vlan_dev(upper) &&
+                   bond_check_path(upper, end, tag, idx+1)) {
+                       tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
+                       tag[idx].vlan_id = vlan_dev_vlan_id(upper);
+                       return true;
+               }
+       }
+       return upper_found;

This way we'll collect the maximum ammount of stacked vlans on our trip
from bond0 to whatever3_IP (the limit is 2, however it might be removed
afterwards if needed, will still work with long enough tag[]).

Hope that makes at least some sense.

>
>-vlad
...snip...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Veaceslav Falico May 7, 2014, 6:11 p.m. UTC | #4
On Wed, May 07, 2014 at 07:49:10PM +0200, Veaceslav Falico wrote:
>On Wed, May 07, 2014 at 01:08:09PM -0400, Vlad Yasevich wrote:
>...snip...
>>Yes.  I verified that it works.  The reason is that we are traversing
>>the all_adj_list.upper list which contains all of the upper devices at
>>each level.  So, at vlan100 level, we will see vlan200 and all will be
>>well.
>
>Hrm, two scenarios, with the following config:
>
>bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>
>end == whatever3_IP
>
...snip...
>
>So, the end patch (not compiled, not tested...) would look something like
>(only the bond_check_path() is changed and copied here, everything else
>remains the same):
>
>+       bool upper_found = false;
>+
>+       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
>+               if (upper == end)
>+                       upper_found = true;
>+
>+               if (idx < 2 && is_vlan_dev(upper) &&
>+                   bond_check_path(upper, end, tag, idx+1)) {
>+                       tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
>+                       tag[idx].vlan_id = vlan_dev_vlan_id(upper);
>+                       return true;

Actually, screw that, we might find here the vlan2 first and end up with
only 1 vlan (vlan2, skipping vlan1).

The way to fix this might be to get the most "lengthy" path of vlans, as
in:

+ * Return the maximum length of stacked vlans + device found, 0 if the end
+ * device is not found.
+ */
+static int bond_check_path(struct net_device *start, struct net_device *end,
+                          struct bond_vlan_tag *tag, int idx)
+{
+       struct net_device *upper;
+       struct list_head  *iter;
+       int length, max_length = 0;
+
+       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
+               if (upper == end && !max_length)
+                       max_length = 1;
+
+               if (idx < 2 && is_vlan_dev(upper)) {
+                       length = bond_check_path(upper, end, tag, idx + 1);
+
+                       if (max_length < length + 1) {
+                               tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
+                               tag[idx].vlan_id = vlan_dev_vlan_id(upper);
+                               max_length = length + 1;
+                       }
+               }
+       }
+       return max_length;
+}

Hope that helps.

>+               }
>+       }
>+       return upper_found;
...snip...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vlad Yasevich May 7, 2014, 6:47 p.m. UTC | #5
On 05/07/2014 02:11 PM, Veaceslav Falico wrote:
> On Wed, May 07, 2014 at 07:49:10PM +0200, Veaceslav Falico wrote:
>> On Wed, May 07, 2014 at 01:08:09PM -0400, Vlad Yasevich wrote:
>> ...snip...
>>> Yes.  I verified that it works.  The reason is that we are traversing
>>> the all_adj_list.upper list which contains all of the upper devices at
>>> each level.  So, at vlan100 level, we will see vlan200 and all will be
>>> well.
>>
>> Hrm, two scenarios, with the following config:
>>
>> bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>>
>> end == whatever3_IP
>>
> ...snip...
>>
>> So, the end patch (not compiled, not tested...) would look something like
>> (only the bond_check_path() is changed and copied here, everything else
>> remains the same):
>>
>> +       bool upper_found = false;
>> +
>> +       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
>> +               if (upper == end)
>> +                       upper_found = true;
>> +
>> +               if (idx < 2 && is_vlan_dev(upper) &&
>> +                   bond_check_path(upper, end, tag, idx+1)) {
>> +                       tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
>> +                       tag[idx].vlan_id = vlan_dev_vlan_id(upper);
>> +                       return true;
> 
> Actually, screw that, we might find here the vlan2 first and end up with
> only 1 vlan (vlan2, skipping vlan1).
> 

hmm..  I am not sure that's actually possible...

__netdev_adjacent_dev_insert() will always insert at the tail.  If you
have a stack of vlans:
   vlan1 (vid 10, 802.1Q)
     |
     v
   vlan2 (vid 20, 802.1AD)
     |
     v
   bond0

then vlan1 will always be at the end of the list, and after vlan2.
Even if we remove things, the higher the device, the later it will be in
the list.

So, in the event of the configuration:
bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP

waterver_IP will be last in the list due to list_add_tail_rcu()
usage.

Did I misunderstand something in the code? I can't seem to make it
the above config fail in my tests.

-vlad
> The way to fix this might be to get the most "lengthy" path of vlans, as
> in:
> 
> + * Return the maximum length of stacked vlans + device found, 0 if the end
> + * device is not found.
> + */
> +static int bond_check_path(struct net_device *start, struct net_device
> *end,
> +                          struct bond_vlan_tag *tag, int idx)
> +{
> +       struct net_device *upper;
> +       struct list_head  *iter;
> +       int length, max_length = 0;
> +
> +       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
> +               if (upper == end && !max_length)
> +                       max_length = 1;
> +
> +               if (idx < 2 && is_vlan_dev(upper)) {
> +                       length = bond_check_path(upper, end, tag, idx + 1);
> +
> +                       if (max_length < length + 1) {
> +                               tag[idx].vlan_proto =
> vlan_dev_vlan_proto(upper);
> +                               tag[idx].vlan_id = vlan_dev_vlan_id(upper);
> +                               max_length = length + 1;
> +                       }
> +               }
> +       }
> +       return max_length;
> +}
> 
> Hope that helps.
> 
>> +               }
>> +       }
>> +       return upper_found;
> ...snip...

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Veaceslav Falico May 7, 2014, 6:59 p.m. UTC | #6
On Wed, May 07, 2014 at 02:47:36PM -0400, Vlad Yasevich wrote:
>On 05/07/2014 02:11 PM, Veaceslav Falico wrote:
>> On Wed, May 07, 2014 at 07:49:10PM +0200, Veaceslav Falico wrote:
>>> On Wed, May 07, 2014 at 01:08:09PM -0400, Vlad Yasevich wrote:
>>> ...snip...
>>>> Yes.  I verified that it works.  The reason is that we are traversing
>>>> the all_adj_list.upper list which contains all of the upper devices at
>>>> each level.  So, at vlan100 level, we will see vlan200 and all will be
>>>> well.
>>>
>>> Hrm, two scenarios, with the following config:
>>>
>>> bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>>>
>>> end == whatever3_IP
>>>
>> ...snip...
>>>
>>> So, the end patch (not compiled, not tested...) would look something like
>>> (only the bond_check_path() is changed and copied here, everything else
>>> remains the same):
>>>
>>> +       bool upper_found = false;
>>> +
>>> +       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
>>> +               if (upper == end)
>>> +                       upper_found = true;
>>> +
>>> +               if (idx < 2 && is_vlan_dev(upper) &&
>>> +                   bond_check_path(upper, end, tag, idx+1)) {
>>> +                       tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
>>> +                       tag[idx].vlan_id = vlan_dev_vlan_id(upper);
>>> +                       return true;
>>
>> Actually, screw that, we might find here the vlan2 first and end up with
>> only 1 vlan (vlan2, skipping vlan1).
>>
>
>hmm..  I am not sure that's actually possible...
>
>__netdev_adjacent_dev_insert() will always insert at the tail.  If you
>have a stack of vlans:
>   vlan1 (vid 10, 802.1Q)
>     |
>     v
>   vlan2 (vid 20, 802.1AD)
>     |
>     v
>   bond0
>
>then vlan1 will always be at the end of the list, and after vlan2.
>Even if we remove things, the higher the device, the later it will be in
>the list.
>
>So, in the event of the configuration:
>bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>
>waterver_IP will be last in the list due to list_add_tail_rcu()
>usage.

Yeah, you're right, sorry for misunderstanding (I've been tricked by the
master thing, but it doesn't actually add anything to all_upper).

Anyway, so the only concern is:

bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
					\-> vlan3
bond_check_path start==bond0 idx=0
finds vlan1, tag[0] set, recursion start==vlan1 idx=1
\->
	bond_check_path start==vlan1 idx=1
	finds vlan2, tag[1] set, recursion start==vlan2 idx=2
	\->	returns right away with false as idx >= 2.

	finds vlan3 (!!!) that isn't related with whatever_IP, tag[1] set with the
	wrong vlan, recursion start==vlan3 idx=2
	\->	return right away with false as idx >= 2.

	finds whatever3_IP, returns true.
returns true

and this way we end up with vlan1 -> vlan3, instead of vlan1 -> vlan2.

Can be fixed by that "don't go deeper/populate tag[] if idx == 2" trick.

>
>Did I misunderstand something in the code? I can't seem to make it
>the above config fail in my tests.
>
>-vlad
>> The way to fix this might be to get the most "lengthy" path of vlans, as
>> in:
>>
>> + * Return the maximum length of stacked vlans + device found, 0 if the end
>> + * device is not found.
>> + */
>> +static int bond_check_path(struct net_device *start, struct net_device
>> *end,
>> +                          struct bond_vlan_tag *tag, int idx)
>> +{
>> +       struct net_device *upper;
>> +       struct list_head  *iter;
>> +       int length, max_length = 0;
>> +
>> +       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
>> +               if (upper == end && !max_length)
>> +                       max_length = 1;
>> +
>> +               if (idx < 2 && is_vlan_dev(upper)) {
>> +                       length = bond_check_path(upper, end, tag, idx + 1);
>> +
>> +                       if (max_length < length + 1) {
>> +                               tag[idx].vlan_proto =
>> vlan_dev_vlan_proto(upper);
>> +                               tag[idx].vlan_id = vlan_dev_vlan_id(upper);
>> +                               max_length = length + 1;
>> +                       }
>> +               }
>> +       }
>> +       return max_length;
>> +}
>>
>> Hope that helps.
>>
>>> +               }
>>> +       }
>>> +       return upper_found;
>> ...snip...
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jay Vosburgh May 7, 2014, 7:40 p.m. UTC | #7
Veaceslav Falico <vfalico@redhat.com> wrote:

>On Wed, May 07, 2014 at 02:47:36PM -0400, Vlad Yasevich wrote:
>>On 05/07/2014 02:11 PM, Veaceslav Falico wrote:
>>> On Wed, May 07, 2014 at 07:49:10PM +0200, Veaceslav Falico wrote:
>>>> On Wed, May 07, 2014 at 01:08:09PM -0400, Vlad Yasevich wrote:
>>>> ...snip...
>>>>> Yes.  I verified that it works.  The reason is that we are traversing
>>>>> the all_adj_list.upper list which contains all of the upper devices at
>>>>> each level.  So, at vlan100 level, we will see vlan200 and all will be
>>>>> well.
>>>>
>>>> Hrm, two scenarios, with the following config:
>>>>
>>>> bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>>>>
>>>> end == whatever3_IP
>>>>
>>> ...snip...
>>>>
>>>> So, the end patch (not compiled, not tested...) would look something like
>>>> (only the bond_check_path() is changed and copied here, everything else
>>>> remains the same):
>>>>
>>>> +       bool upper_found = false;
>>>> +
>>>> +       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
>>>> +               if (upper == end)
>>>> +                       upper_found = true;
>>>> +
>>>> +               if (idx < 2 && is_vlan_dev(upper) &&
>>>> +                   bond_check_path(upper, end, tag, idx+1)) {
>>>> +                       tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
>>>> +                       tag[idx].vlan_id = vlan_dev_vlan_id(upper);
>>>> +                       return true;
>>>
>>> Actually, screw that, we might find here the vlan2 first and end up with
>>> only 1 vlan (vlan2, skipping vlan1).
>>>
>>
>>hmm..  I am not sure that's actually possible...
>>
>>__netdev_adjacent_dev_insert() will always insert at the tail.  If you
>>have a stack of vlans:
>>   vlan1 (vid 10, 802.1Q)
>>     |
>>     v
>>   vlan2 (vid 20, 802.1AD)
>>     |
>>     v
>>   bond0
>>
>>then vlan1 will always be at the end of the list, and after vlan2.
>>Even if we remove things, the higher the device, the later it will be in
>>the list.
>>
>>So, in the event of the configuration:
>>bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>>
>>waterver_IP will be last in the list due to list_add_tail_rcu()
>>usage.
>
>Yeah, you're right, sorry for misunderstanding (I've been tricked by the
>master thing, but it doesn't actually add anything to all_upper).
>
>Anyway, so the only concern is:
>
>bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>					\-> vlan3
>bond_check_path start==bond0 idx=0
>finds vlan1, tag[0] set, recursion start==vlan1 idx=1
>\->
>	bond_check_path start==vlan1 idx=1
>	finds vlan2, tag[1] set, recursion start==vlan2 idx=2
>	\->	returns right away with false as idx >= 2.
>
>	finds vlan3 (!!!) that isn't related with whatever_IP, tag[1] set with the
>	wrong vlan, recursion start==vlan3 idx=2
>	\->	return right away with false as idx >= 2.
>
>	finds whatever3_IP, returns true.
>returns true
>
>and this way we end up with vlan1 -> vlan3, instead of vlan1 -> vlan2.
>
>Can be fixed by that "don't go deeper/populate tag[] if idx == 2" trick.

	How, exactly?  I'm not sure I see a way to do this correctly
other than walking the tree layout (and I use that term loosely, given
that the upper dev list is not an actual tree data structure in the
usual sense) from bond0 -> whatever3_IP and pulling the VLANs from that
path.

	Being later in the device list (and thus higher) doesn't
automatically make it part of the path, e.g.,

bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3
		\---> vlan3 -> whatever4_IP

	if we're looking for whatever4_IP.  Are we guaranteed to hit
whatever4_IP and finish up before vlan2 ends up in a tag[], even if
whatever4_IP was the last interface added to the pile?

	-J

>>Did I misunderstand something in the code? I can't seem to make it
>>the above config fail in my tests.
>>
>>-vlad
>>> The way to fix this might be to get the most "lengthy" path of vlans, as
>>> in:
>>>
>>> + * Return the maximum length of stacked vlans + device found, 0 if the end
>>> + * device is not found.
>>> + */
>>> +static int bond_check_path(struct net_device *start, struct net_device
>>> *end,
>>> +                          struct bond_vlan_tag *tag, int idx)
>>> +{
>>> +       struct net_device *upper;
>>> +       struct list_head  *iter;
>>> +       int length, max_length = 0;
>>> +
>>> +       netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
>>> +               if (upper == end && !max_length)
>>> +                       max_length = 1;
>>> +
>>> +               if (idx < 2 && is_vlan_dev(upper)) {
>>> +                       length = bond_check_path(upper, end, tag, idx + 1);
>>> +
>>> +                       if (max_length < length + 1) {
>>> +                               tag[idx].vlan_proto =
>>> vlan_dev_vlan_proto(upper);
>>> +                               tag[idx].vlan_id = vlan_dev_vlan_id(upper);
>>> +                               max_length = length + 1;
>>> +                       }
>>> +               }
>>> +       }
>>> +       return max_length;
>>> +}
>>>
>>> Hope that helps.
>>>
>>>> +               }
>>>> +       }
>>>> +       return upper_found;
>>> ...snip...

---
	-Jay Vosburgh, jay.vosburgh@canonical.com
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Veaceslav Falico May 7, 2014, 8:10 p.m. UTC | #8
On Wed, May 07, 2014 at 08:59:37PM +0200, Veaceslav Falico wrote:
>Anyway, so the only concern is:
>
>bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>					\-> vlan3
>bond_check_path start==bond0 idx=0
>finds vlan1, tag[0] set, recursion start==vlan1 idx=1
>\->
>	bond_check_path start==vlan1 idx=1
>	finds vlan2, tag[1] set, recursion start==vlan2 idx=2
>	\->	returns right away with false as idx >= 2.
>
>	finds vlan3 (!!!) that isn't related with whatever_IP, tag[1] set with the
>	wrong vlan, recursion start==vlan3 idx=2
>	\->	return right away with false as idx >= 2.
>
>	finds whatever3_IP, returns true.
>returns true

Here's a proof of concept (btw, if somebody wants this script - I can put
it somewhere), with your patch applied:

bond0 is configured in mode 1 with eth2 being the primary slave, and (one of)
the arp_ip_targets is 192.168.10.254 (bond2's subnet /24).

Initially everything works:

darkmag:~#/home/vfalico/tmp/asciinet/netstruct.pl
+---------------+              +-------------+           +--------------+
|     bond1     |  neighbour   |  bond1.11   |  master   |    bond2     |
|  192.168.2.1  | ------------ |             | <-------- | 192.168.10.1 |
+---------------+              +-------------+           +--------------+
   |
   | master
   v
+---------------+              +-------------+           +--------------+           +------+
|  bridge0.15   |  neighbour   |   bridge0   |  master   |    bond0     |  master   | eth2 |
|               | ------------ | 192.168.3.1 | --------> |              | --------> |      |
+---------------+              +-------------+           +--------------+           +------+
                                                            |
                                                            | master
                                                            v
+---------------+                                        +--------------+
|    dummy0     |                                        |     eth0     |
+---------------+                                        +--------------+
...

tcpdump from eth2:
21:57:54.990521 00:22:64:b9:87:05 > Broadcast, ethertype 802.1Q (0x8100), length 50: vlan 15, p 0, ethertype 802.1Q, vlan 11, p 0, ethertype ARP, Request who-has 192.168.10.254 tell 192.168.10.1, length 28

so, tag 11 (via bond1.11) and tag 15 (via bridge0.15), all fine.

Now:

darkmag:~#echo -bond2 > /sys/class/net/bonding_masters 
darkmag:~#vconfig add bond1 12
Added VLAN with VID == 12 to IF -:bond1:-
darkmag:~#ifup bond2
Determining if ip address 192.168.10.1 is already in use for device bond2...
darkmag:~#/home/vfalico/tmp/asciinet/netstruct.pl
+-------------+           +---------------+              +----------+           +--------------+
| bridge0.15  |  master   |     bond1     |  neighbour   | bond1.11 |  master   |    bond2     |
|             | <-------- |  192.168.2.1  | ------------ |          | <-------- | 192.168.10.1 |
+-------------+           +---------------+              +----------+           +--------------+
   |                         |
   | neighbour               | neighbour
   |                         |
+-------------+           +---------------+
|   bridge0   |           |   bond1.12    |
| 192.168.3.1 |           |               |
+-------------+           +---------------+
   |
   | master
   v
+-------------+  master   +---------------+
|    bond0    | --------> |     eth2      |
+-------------+           +---------------+
   |
   | master
   v
+-------------+           +---------------+
|    eth0     |           |    dummy0     |
+-------------+           +---------------+
...

and tcpdump shows:

21:58:44.136522 00:22:64:b9:87:05 > Broadcast, ethertype 802.1Q (0x8100), length 50: vlan 15, p 0, ethertype 802.1Q, vlan 12, p 0, ethertype ARP, Request who-has 192.168.10.254 tell 192.168.10.1, length 28

Notice vlan 12 instead of vlan 11.

So I guess that, till the end, we indeed can't guarantee the ordering and should,
actually, go via "the longest" route...

Hope that helps.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ding Tianhong May 8, 2014, 4:25 a.m. UTC | #9
On 2014/5/8 4:10, Veaceslav Falico wrote:
> On Wed, May 07, 2014 at 08:59:37PM +0200, Veaceslav Falico wrote:
>> Anyway, so the only concern is:
>>
>> bond0 -> whatever1 -> vlan1 -> whatever2 -> vlan2 -> whatever3_IP
>>                     \-> vlan3
>> bond_check_path start==bond0 idx=0
>> finds vlan1, tag[0] set, recursion start==vlan1 idx=1
>> \->
>>     bond_check_path start==vlan1 idx=1
>>     finds vlan2, tag[1] set, recursion start==vlan2 idx=2
>>     \->    returns right away with false as idx >= 2.
>>
>>     finds vlan3 (!!!) that isn't related with whatever_IP, tag[1] set with the
>>     wrong vlan, recursion start==vlan3 idx=2
>>     \->    return right away with false as idx >= 2.
>>
>>     finds whatever3_IP, returns true.
>> returns true
> 
> Here's a proof of concept (btw, if somebody wants this script - I can put
> it somewhere), with your patch applied:
> 
> bond0 is configured in mode 1 with eth2 being the primary slave, and (one of)
> the arp_ip_targets is 192.168.10.254 (bond2's subnet /24).
> 
> Initially everything works:
> 
> darkmag:~#/home/vfalico/tmp/asciinet/netstruct.pl
> +---------------+              +-------------+           +--------------+
> |     bond1     |  neighbour   |  bond1.11   |  master   |    bond2     |
> |  192.168.2.1  | ------------ |             | <-------- | 192.168.10.1 |
> +---------------+              +-------------+           +--------------+
>   |
>   | master
>   v
> +---------------+              +-------------+           +--------------+           +------+
> |  bridge0.15   |  neighbour   |   bridge0   |  master   |    bond0     |  master   | eth2 |
> |               | ------------ | 192.168.3.1 | --------> |              | --------> |      |
> +---------------+              +-------------+           +--------------+           +------+
>                                                            |
>                                                            | master
>                                                            v
> +---------------+                                        +--------------+
> |    dummy0     |                                        |     eth0     |
> +---------------+                                        +--------------+
> ...
> 
> tcpdump from eth2:
> 21:57:54.990521 00:22:64:b9:87:05 > Broadcast, ethertype 802.1Q (0x8100), length 50: vlan 15, p 0, ethertype 802.1Q, vlan 11, p 0, ethertype ARP, Request who-has 192.168.10.254 tell 192.168.10.1, length 28
> 
> so, tag 11 (via bond1.11) and tag 15 (via bridge0.15), all fine.
> 
> Now:
> 
> darkmag:~#echo -bond2 > /sys/class/net/bonding_masters darkmag:~#vconfig add bond1 12
> Added VLAN with VID == 12 to IF -:bond1:-
> darkmag:~#ifup bond2
> Determining if ip address 192.168.10.1 is already in use for device bond2...
> darkmag:~#/home/vfalico/tmp/asciinet/netstruct.pl
> +-------------+           +---------------+              +----------+           +--------------+
> | bridge0.15  |  master   |     bond1     |  neighbour   | bond1.11 |  master   |    bond2     |
> |             | <-------- |  192.168.2.1  | ------------ |          | <-------- | 192.168.10.1 |
> +-------------+           +---------------+              +----------+           +--------------+
>   |                         |
>   | neighbour               | neighbour
>   |                         |
> +-------------+           +---------------+
> |   bridge0   |           |   bond1.12    |
> | 192.168.3.1 |           |               |
> +-------------+           +---------------+
>   |
>   | master
>   v
> +-------------+  master   +---------------+
> |    bond0    | --------> |     eth2      |
> +-------------+           +---------------+
>   |
>   | master
>   v
> +-------------+           +---------------+
> |    eth0     |           |    dummy0     |
> +-------------+           +---------------+
> ...
> 
> and tcpdump shows:
> 
> 21:58:44.136522 00:22:64:b9:87:05 > Broadcast, ethertype 802.1Q (0x8100), length 50: vlan 15, p 0, ethertype 802.1Q, vlan 12, p 0, ethertype ARP, Request who-has 192.168.10.254 tell 192.168.10.1, length 28
> 
> Notice vlan 12 instead of vlan 11.
> 
> So I guess that, till the end, we indeed can't guarantee the ordering and should,
> actually, go via "the longest" route...
> 
> Hope that helps.
> 
> .

Great analysis, But it is really hard to read, I think I reproduce this situation more simple.

ip link add link eth10 eth10.10 type vlan proto 802.1ad id 10
ip link add link eth10.10 eth10.10.100 type vlan proto 802.1q id 100

ifconfig eth10.10.100 192.168.10.37
ping 192.168.10.32

then show the tcpdump:
tcpdump: listening on eth10, link-type EN10MB (Ethernet), capture size 96 bytes
00:47:48.940194 54:89:98:f3:f1:b7 (oui Unknown) > Broadcast, ethertype Unknown (0x88a8), length 50:
        0x0000:  ffff ffff ffff 5489 98f3 f1b7 88a8 000a  ......T.........
        0x0010:  8100 0064 0806 0001 0800 0604 0001 5489  ...d..........T.
        0x0020:  98f3 f1b7 c0a8 0a25 0000 0000 0000 c0a8  .......%........
        0x0030:  0a20

obviously the first tag is 88a8 and the second tag is 8100,

then I add more vlan dev:

ip link add link eth10.10.100 eth10.10.100.200 type vlan proto 802.1q id 100.200

then show the tcpdump:

00:43:50.541466 54:89:98:f3:f1:b7 (oui Unknown) > Broadcast, ethertype Unknown (0x88a8), length 54:
        0x0000:  ffff ffff ffff 5489 98f3 f1b7 88a8 000a  ......T.........
        0x0010:  8100 0064 8100 00c8 0806 0001 0800 0604  ...d............
        0x0020:  0001 5489 98f3 f1b7 c0a8 1e25 0000 0000  ..T........%....
        0x0030:  0000 c0a8 1e20   

obviously the first tag is 88a8, the second is 8100, id is 100, and the third tag is 8100, id is 200.

So I think in this patch, the idx should not only limit to 2.

Ding




> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9d08e00..f592f96 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2126,8 +2126,7 @@  static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
  */
 static void bond_arp_send(struct net_device *slave_dev, int arp_op,
 			  __be32 dest_ip, __be32 src_ip,
-			  struct bond_vlan_tag *inner,
-			  struct bond_vlan_tag *outer)
+			  struct bond_vlan_tag *tags)
 {
 	struct sk_buff *skb;
 
@@ -2141,12 +2140,12 @@  static void bond_arp_send(struct net_device *slave_dev, int arp_op,
 		net_err_ratelimited("ARP packet allocation failed\n");
 		return;
 	}
-	if (outer->vlan_id) {
-		if (inner->vlan_id) {
+	if (tags[0].vlan_id) {
+		if (tags[1].vlan_id) {
 			pr_debug("inner tag: proto %X vid %X\n",
-				 ntohs(inner->vlan_proto), inner->vlan_id);
-			skb = __vlan_put_tag(skb, inner->vlan_proto,
-					     inner->vlan_id);
+				 ntohs(tags[1].vlan_proto), tags[1].vlan_id);
+			skb = __vlan_put_tag(skb, tags[1].vlan_proto,
+					     tags[1].vlan_id);
 			if (!skb) {
 				net_err_ratelimited("failed to insert inner VLAN tag\n");
 				return;
@@ -2154,8 +2153,8 @@  static void bond_arp_send(struct net_device *slave_dev, int arp_op,
 		}
 
 		pr_debug("outer reg: proto %X vid %X\n",
-			 ntohs(outer->vlan_proto), outer->vlan_id);
-		skb = vlan_put_tag(skb, outer->vlan_proto, outer->vlan_id);
+			 ntohs(tags[0].vlan_proto), tags[0].vlan_id);
+		skb = vlan_put_tag(skb, tags[0].vlan_proto, tags[0].vlan_id);
 		if (!skb) {
 			net_err_ratelimited("failed to insert outer VLAN tag\n");
 			return;
@@ -2164,22 +2163,52 @@  static void bond_arp_send(struct net_device *slave_dev, int arp_op,
 	arp_xmit(skb);
 }
 
+/* Check to make sure that @end device is stacked on top of the @start
+ * device.  Invofrmation about any intervening vlans are saved into
+ * the @tag array.  @idx parametet specifies how many vlans deep we are
+ * are currently looking. We currently only support 2 levels of vlan stacking.
+ * Return true if we have a valid stacking configuration.  Otherwise false.
+ */
+static bool bond_check_path(struct net_device *start, struct net_device *end,
+			    struct bond_vlan_tag *tag, int idx)
+{
+	struct net_device *upper;
+	struct list_head  *iter;
+
+	/* We do not support more then 2 levels of VLAN nesting */
+	if (idx >= 2)
+		return false;
+
+	netdev_for_each_all_upper_dev_rcu(start, upper, iter) {
+		if (is_vlan_dev(upper)) {
+			tag[idx].vlan_proto = vlan_dev_vlan_proto(upper);
+			tag[idx].vlan_id = vlan_dev_vlan_id(upper);
+		}
+		if (upper == end)
+			return true;
+
+		/* Look at the devices list  of 'upper' only if it is a
+		 * vlan device.
+		 */
+		if (is_vlan_dev(upper) &&
+		    bond_check_path(upper, end, tag, idx+1))
+			return true;
+	}
+	return false;
+}
+
 
 static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
 {
-	struct net_device *upper, *vlan_upper;
-	struct list_head *iter, *vlan_iter;
 	struct rtable *rt;
-	struct bond_vlan_tag inner, outer;
+	struct bond_vlan_tag tags[2];
 	__be32 *targets = bond->params.arp_targets, addr;
 	int i;
+	bool ret;
 
 	for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
 		pr_debug("basa: target %pI4\n", &targets[i]);
-		inner.vlan_proto = 0;
-		inner.vlan_id = 0;
-		outer.vlan_proto = 0;
-		outer.vlan_id = 0;
+		memset(tags, 0, sizeof(tags));
 
 		/* Find out through which dev should the packet go */
 		rt = ip_route_output(dev_net(bond->dev), targets[i], 0,
@@ -2192,7 +2221,8 @@  static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
 				net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
 						     bond->dev->name,
 						     &targets[i]);
-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], 0, &inner, &outer);
+			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
+				      0, tags);
 			continue;
 		}
 
@@ -2201,52 +2231,12 @@  static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
 			goto found;
 
 		rcu_read_lock();
-		/* first we search only for vlan devices. for every vlan
-		 * found we verify its upper dev list, searching for the
-		 * rt->dst.dev. If found we save the tag of the vlan and
-		 * proceed to send the packet.
-		 */
-		netdev_for_each_all_upper_dev_rcu(bond->dev, vlan_upper,
-						  vlan_iter) {
-			if (!is_vlan_dev(vlan_upper))
-				continue;
-
-			if (vlan_upper == rt->dst.dev) {
-				outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper);
-				outer.vlan_id = vlan_dev_vlan_id(vlan_upper);
-				rcu_read_unlock();
-				goto found;
-			}
-			netdev_for_each_all_upper_dev_rcu(vlan_upper, upper,
-							  iter) {
-				if (upper == rt->dst.dev) {
-					/* If the upper dev is a vlan dev too,
-					 *  set the vlan tag to inner tag.
-					 */
-					if (is_vlan_dev(upper)) {
-						inner.vlan_proto = vlan_dev_vlan_proto(upper);
-						inner.vlan_id = vlan_dev_vlan_id(upper);
-					}
-					outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper);
-					outer.vlan_id = vlan_dev_vlan_id(vlan_upper);
-					rcu_read_unlock();
-					goto found;
-				}
-			}
-		}
-
-		/* if the device we're looking for is not on top of any of
-		 * our upper vlans, then just search for any dev that
-		 * matches, and in case it's a vlan - save the id
-		 */
-		netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) {
-			if (upper == rt->dst.dev) {
-				rcu_read_unlock();
-				goto found;
-			}
-		}
+		ret = bond_check_path(bond->dev, rt->dst.dev, tags, 0);
 		rcu_read_unlock();
 
+		if (ret)
+			goto found;
+
 		/* Not our device - skip */
 		pr_debug("%s: no path to arp_ip_target %pI4 via rt.dev %s\n",
 			 bond->dev->name, &targets[i],
@@ -2259,7 +2249,7 @@  found:
 		addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
 		ip_rt_put(rt);
 		bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
-			      addr, &inner, &outer);
+			      addr, tags);
 	}
 }