Message ID | 528D5980.3040309@oracle.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On 2013/11/21 8:53, rama nichanamatlu wrote: > During the creation of VLAN's atop bonding the underlying interfaces are made part of VLAN's, and at the same bonding driver gets aware that VLAN's exists above it and hence would consult IP routing for every ARP to be sent to determine the route which tells bonding driver the correct VLAN tag to attach to the outgoing ARP packet. But, during the VLAN creation when vlan driver puts the underlying interface into default vlan and then actual vlan, in-between this if bonding driver consults the IP for a route, IP fails to provide a correct route and upon which bonding driver drops the ARP packet. ARP monitor when it > comes around next time, sees no ARP response and fails-over to the next available slave. Consulting for a IP route, ip_route_output(),happens in bond_arp_send_all(). > > To prevent this false fail-over, when bonding driver fails to send an ARP out it marks in its private structure, bonding{}, not to expect an ARP response, when ARP monitor comes around next time ARP sending will be tried again. > > Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan intf. All virtual interfaces created at boot time. > please reorganize the changelog, it is too log for a line, please make it in 80 chats. > Orabug: 17172660 > Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com> > Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com> > --- > drivers/net/bonding/bond_main.c | 13 ++++++++----- > drivers/net/bonding/bonding.h | 1 + > 2 files changed, 9 insertions(+), 5 deletions(-) > > diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c > index dde6b4a..d475161 100644 > --- a/drivers/net/bonding/bond_main.c > +++ b/drivers/net/bonding/bond_main.c > @@ -2661,7 +2661,7 @@ static int bond_has_this_ip(struct bonding *bond, __be32 ip) > * switches in VLAN mode (especially if ports are configured as > * "native" to a VLAN) might not pass non-tagged frames. > */ > -static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short vlan_id) > +static void bond_arp_send(struct bonding *bond, struct net_device *slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short vlan_id) no need to add bond, the bond could be get from slave, see slave->bond Ding > { > struct sk_buff *skb; > @@ -2683,6 +2683,7 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_ > } > } > arp_xmit(skb); > + bond->arp_sent=true; > } > @@ -2700,7 +2701,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) > pr_debug("basa: target %x\n", targets[i]); > if (!bond->vlgrp) { > pr_debug("basa: empty vlan: arp_send\n"); > - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], > + bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > bond->master_ip, 0); > continue; > } > @@ -2726,7 +2727,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) > if (rt->dst.dev == bond->dev) { > ip_rt_put(rt); > pr_debug("basa: rtdev == bond->dev: arp_send\n"); > - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], > + bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > bond->master_ip, 0); > continue; > } > @@ -2744,7 +2745,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) > if (vlan_id) { > ip_rt_put(rt); > - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], > + bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > vlan->vlan_ip, vlan_id); > continue; > } > @@ -3206,7 +3207,7 @@ void bond_activebackup_arp_mon(struct work_struct *work) > should_notify_peers = bond_should_notify_peers(bond); > - if (bond_ab_arp_inspect(bond, delta_in_ticks)) { > + if (bond->arp_sent && bond_ab_arp_inspect(bond, delta_in_ticks)) { > read_unlock(&bond->lock); > rtnl_lock(); > read_lock(&bond->lock); > @@ -3218,6 +3219,7 @@ void bond_activebackup_arp_mon(struct work_struct *work) > read_lock(&bond->lock); > } > + bond->arp_sent=false; > bond_ab_arp_probe(bond); > re_arm: > @@ -4425,6 +4427,7 @@ static void bond_setup(struct net_device *bond_dev) > bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM); > bond_dev->features |= bond_dev->hw_features; > + bond->arp_sent=false; > } > static void bond_work_cancel_all(struct bonding *bond) > diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h > index e9a3c56..3878bbd 100644 > --- a/drivers/net/bonding/bonding.h > +++ b/drivers/net/bonding/bonding.h > @@ -253,6 +253,7 @@ struct bonding { > /* debugging suport via debugfs */ > struct dentry *debug_dir; > #endif /* CONFIG_DEBUG_FS */ > + bool arp_sent; > }; > #define bond_slave_get_rcu(dev) \ -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote: >During the creation of VLAN's atop bonding the underlying interfaces are >made part of VLAN's, and at the same bonding driver gets aware that VLAN's >exists above it and hence would consult IP routing for every ARP to be >sent to determine the route which tells bonding driver the correct VLAN >tag to attach to the outgoing ARP packet. But, during the VLAN creation >when vlan driver puts the underlying interface into default vlan and then >actual vlan, in-between this if bonding driver consults the IP for a >route, IP fails to provide a correct route and upon which bonding driver >drops the ARP packet. ARP monitor when it >comes around next time, sees no ARP response and fails-over to the next >available slave. Consulting for a IP route, ip_route_output(),happens in >bond_arp_send_all(). > >To prevent this false fail-over, when bonding driver fails to send an ARP >out it marks in its private structure, bonding{}, not to expect an ARP >response, when ARP monitor comes around next time ARP sending will be >tried again. > >Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan >intf. All virtual interfaces created at boot time. First, this patch appears to be for an older kernel, as the current mainline code is substantially different (e.g., master_ip is no longer used). Second, won't this methodology mask legitimate failures, such as when a single arp_ip_target specifies a destination that is not ever reachable? I.e., would specifying a permanently unreachable IP address as the arp_ip_target cause all slaves to always stay up (because no ARPs will ever be sent), even if no ARP replies are ever received? -J >Orabug: 17172660 >Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com> >Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com> >--- > drivers/net/bonding/bond_main.c | 13 ++++++++----- > drivers/net/bonding/bonding.h | 1 + > 2 files changed, 9 insertions(+), 5 deletions(-) > >diff --git a/drivers/net/bonding/bond_main.c >b/drivers/net/bonding/bond_main.c >index dde6b4a..d475161 100644 >--- a/drivers/net/bonding/bond_main.c >+++ b/drivers/net/bonding/bond_main.c >@@ -2661,7 +2661,7 @@ static int bond_has_this_ip(struct bonding *bond, >__be32 ip) > * switches in VLAN mode (especially if ports are configured as > * "native" to a VLAN) might not pass non-tagged frames. > */ >-static void bond_arp_send(struct net_device *slave_dev, int arp_op, >__be32 dest_ip, __be32 src_ip, unsigned short vlan_id) >+static void bond_arp_send(struct bonding *bond, struct net_device >*slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short >vlan_id) > { > struct sk_buff *skb; > @@ -2683,6 +2683,7 @@ static void bond_arp_send(struct net_device >*slave_dev, int arp_op, __be32 dest_ > } > } > arp_xmit(skb); >+ bond->arp_sent=true; > } > @@ -2700,7 +2701,7 @@ static void bond_arp_send_all(struct bonding >*bond, struct slave *slave) > pr_debug("basa: target %x\n", targets[i]); > if (!bond->vlgrp) { > pr_debug("basa: empty vlan: arp_send\n"); >- bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], >+ bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > bond->master_ip, 0); > continue; > } >@@ -2726,7 +2727,7 @@ static void bond_arp_send_all(struct bonding *bond, >struct slave *slave) > if (rt->dst.dev == bond->dev) { > ip_rt_put(rt); > pr_debug("basa: rtdev == bond->dev: arp_send\n"); >- bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], >+ bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > bond->master_ip, 0); > continue; > } >@@ -2744,7 +2745,7 @@ static void bond_arp_send_all(struct bonding *bond, >struct slave *slave) > if (vlan_id) { > ip_rt_put(rt); >- bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], >+ bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > vlan->vlan_ip, vlan_id); > continue; > } >@@ -3206,7 +3207,7 @@ void bond_activebackup_arp_mon(struct work_struct >*work) > should_notify_peers = bond_should_notify_peers(bond); > - if (bond_ab_arp_inspect(bond, delta_in_ticks)) { >+ if (bond->arp_sent && bond_ab_arp_inspect(bond, delta_in_ticks)) { > read_unlock(&bond->lock); > rtnl_lock(); > read_lock(&bond->lock); >@@ -3218,6 +3219,7 @@ void bond_activebackup_arp_mon(struct work_struct >*work) > read_lock(&bond->lock); > } > + bond->arp_sent=false; > bond_ab_arp_probe(bond); > re_arm: >@@ -4425,6 +4427,7 @@ static void bond_setup(struct net_device *bond_dev) > bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM); > bond_dev->features |= bond_dev->hw_features; >+ bond->arp_sent=false; > } > static void bond_work_cancel_all(struct bonding *bond) >diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h >index e9a3c56..3878bbd 100644 >--- a/drivers/net/bonding/bonding.h >+++ b/drivers/net/bonding/bonding.h >@@ -253,6 +253,7 @@ struct bonding { > /* debugging suport via debugfs */ > struct dentry *debug_dir; > #endif /* CONFIG_DEBUG_FS */ >+ bool arp_sent; > }; > #define bond_slave_get_rcu(dev) \ >-- >1.8.2.1 --- -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/20/2013 5:18 PM, Jay Vosburgh wrote: > rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote: > >> During the creation of VLAN's atop bonding the underlying interfaces are >> made part of VLAN's, and at the same bonding driver gets aware that VLAN's >> exists above it and hence would consult IP routing for every ARP to be >> sent to determine the route which tells bonding driver the correct VLAN >> tag to attach to the outgoing ARP packet. But, during the VLAN creation >> when vlan driver puts the underlying interface into default vlan and then >> actual vlan, in-between this if bonding driver consults the IP for a >> route, IP fails to provide a correct route and upon which bonding driver >> drops the ARP packet. ARP monitor when it >> comes around next time, sees no ARP response and fails-over to the next >> available slave. Consulting for a IP route, ip_route_output(),happens in >> bond_arp_send_all(). >> >> To prevent this false fail-over, when bonding driver fails to send an ARP >> out it marks in its private structure, bonding{}, not to expect an ARP >> response, when ARP monitor comes around next time ARP sending will be >> tried again. >> >> Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan >> intf. All virtual interfaces created at boot time. > > First, this patch appears to be for an older kernel, as the > current mainline code is substantially different (e.g., master_ip is no > longer used). > > Second, won't this methodology mask legitimate failures, such as > when a single arp_ip_target specifies a destination that is not ever > reachable? I.e., would specifying a permanently unreachable IP address > as the arp_ip_target cause all slaves to always stay up (because no ARPs > will ever be sent), even if no ARP replies are ever received? > > -J > Thank U. I agree with your rationale. Would keep a slave falsely up but traffic might flow. And true that, it is not what we are looking for. We can try a different approach too, which we used to fix a false fail-over in MTU changing case where the device interface takes time to change the device MTU. And in the mean time bonding was failing over. What we did to fix was to stop the ARP monitoring, bond_change_mtu(), and restart it when NETDEV_CHANGE from slave is handled, bond_slave_netdev_event(). Not sure if this can used for vlan case, as mtu thing is event driven. >> Orabug: 17172660 >> Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com> >> Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Thought will write to you again Jay. On 11/20/2013 5:18 PM, Jay Vosburgh wrote: > rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote: > >> During the creation of VLAN's atop bonding the underlying interfaces are >> made part of VLAN's, and at the same bonding driver gets aware that VLAN's >> exists above it and hence would consult IP routing for every ARP to be >> sent to determine the route which tells bonding driver the correct VLAN >> tag to attach to the outgoing ARP packet. But, during the VLAN creation >> when vlan driver puts the underlying interface into default vlan and then >> actual vlan, in-between this if bonding driver consults the IP for a >> route, IP fails to provide a correct route and upon which bonding driver >> drops the ARP packet. ARP monitor when it >> comes around next time, sees no ARP response and fails-over to the next >> available slave. Consulting for a IP route, ip_route_output(),happens in >> bond_arp_send_all(). >> >> To prevent this false fail-over, when bonding driver fails to send an ARP >> out it marks in its private structure, bonding{}, not to expect an ARP >> response, when ARP monitor comes around next time ARP sending will be >> tried again. >> >> Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan >> intf. All virtual interfaces created at boot time. > > First, this patch appears to be for an older kernel, as the > current mainline code is substantially different (e.g., master_ip is no > longer used). > > Second, won't this methodology mask legitimate failures, such as > when a single arp_ip_target specifies a destination that is not ever > reachable? I.e., would specifying a permanently unreachable IP address > as the arp_ip_target cause all slaves to always stay up (because no ARPs > will ever be sent), even if no ARP replies are ever received? > > An unreachable arp_target, a.b.c.d in a way that ip_route_output() fails as it cant even know which local interface to broadcast arp out. Is that what you mean? Either with this change or not, bonding interface is dead.Right? Without this change, it flip-flops (cycling) between slaves and with this change stays on one slave. > -J > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote: >During the creation of VLAN's atop bonding the underlying interfaces >are made part of VLAN's, and at the same bonding driver gets aware >that VLAN's exists above it and hence would consult IP routing for >every ARP to be sent to determine the route which tells bonding >driver the correct VLAN tag to attach to the outgoing ARP packet. >But, during the VLAN creation when vlan driver puts the underlying >interface into default vlan and then actual vlan, in-between this if >bonding driver consults the IP for a route, IP fails to provide a >correct route and upon which bonding driver drops the ARP packet. ARP >monitor when it >comes around next time, sees no ARP response and fails-over to the >next available slave. Consulting for a IP route, >ip_route_output(),happens in bond_arp_send_all(). bonding works as expected - nothing to fix here. And even as a workaround/hack - I'm not sure we need that to suppress one failover *only* when vlan is added on top. > >To prevent this false fail-over, when bonding driver fails to send an >ARP out it marks in its private structure, bonding{}, not to expect >an ARP response, when ARP monitor comes around next time ARP sending >will be tried again. > >Extensively tested in a VM environment; sr-iov intf->bonding >intf->vlan intf. All virtual interfaces created at boot time. > >Orabug: 17172660 >Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com> >Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com> >--- > drivers/net/bonding/bond_main.c | 13 ++++++++----- > drivers/net/bonding/bonding.h | 1 + > 2 files changed, 9 insertions(+), 5 deletions(-) > >diff --git a/drivers/net/bonding/bond_main.c >b/drivers/net/bonding/bond_main.c >index dde6b4a..d475161 100644 >--- a/drivers/net/bonding/bond_main.c >+++ b/drivers/net/bonding/bond_main.c >@@ -2661,7 +2661,7 @@ static int bond_has_this_ip(struct bonding >*bond, __be32 ip) > * switches in VLAN mode (especially if ports are configured as > * "native" to a VLAN) might not pass non-tagged frames. > */ >-static void bond_arp_send(struct net_device *slave_dev, int arp_op, >__be32 dest_ip, __be32 src_ip, unsigned short vlan_id) >+static void bond_arp_send(struct bonding *bond, struct net_device >*slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short >vlan_id) > { > struct sk_buff *skb; > @@ -2683,6 +2683,7 @@ static void bond_arp_send(struct net_device >*slave_dev, int arp_op, __be32 dest_ > } > } > arp_xmit(skb); >+ bond->arp_sent=true; > } > @@ -2700,7 +2701,7 @@ static void bond_arp_send_all(struct bonding >*bond, struct slave *slave) > pr_debug("basa: target %x\n", targets[i]); > if (!bond->vlgrp) { > pr_debug("basa: empty vlan: arp_send\n"); >- bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], >+ bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > bond->master_ip, 0); > continue; > } >@@ -2726,7 +2727,7 @@ static void bond_arp_send_all(struct bonding >*bond, struct slave *slave) > if (rt->dst.dev == bond->dev) { > ip_rt_put(rt); > pr_debug("basa: rtdev == bond->dev: arp_send\n"); >- bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], >+ bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > bond->master_ip, 0); > continue; > } >@@ -2744,7 +2745,7 @@ static void bond_arp_send_all(struct bonding >*bond, struct slave *slave) > if (vlan_id) { > ip_rt_put(rt); >- bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], >+ bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], > vlan->vlan_ip, vlan_id); > continue; > } >@@ -3206,7 +3207,7 @@ void bond_activebackup_arp_mon(struct >work_struct *work) > should_notify_peers = bond_should_notify_peers(bond); > - if (bond_ab_arp_inspect(bond, delta_in_ticks)) { >+ if (bond->arp_sent && bond_ab_arp_inspect(bond, delta_in_ticks)) { > read_unlock(&bond->lock); > rtnl_lock(); > read_lock(&bond->lock); >@@ -3218,6 +3219,7 @@ void bond_activebackup_arp_mon(struct >work_struct *work) > read_lock(&bond->lock); > } > + bond->arp_sent=false; > bond_ab_arp_probe(bond); > re_arm: >@@ -4425,6 +4427,7 @@ static void bond_setup(struct net_device *bond_dev) > bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM); > bond_dev->features |= bond_dev->hw_features; >+ bond->arp_sent=false; > } > static void bond_work_cancel_all(struct bonding *bond) >diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h >index e9a3c56..3878bbd 100644 >--- a/drivers/net/bonding/bonding.h >+++ b/drivers/net/bonding/bonding.h >@@ -253,6 +253,7 @@ struct bonding { > /* debugging suport via debugfs */ > struct dentry *debug_dir; > #endif /* CONFIG_DEBUG_FS */ >+ bool arp_sent; > }; > #define bond_slave_get_rcu(dev) \ >-- >1.8.2.1 > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/21/2013 3:10 AM, Veaceslav Falico wrote: > On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote: >> During the creation of VLAN's atop bonding the underlying interfaces >> are made part of VLAN's, and at the same bonding driver gets aware >> that VLAN's exists above it and hence would consult IP routing for >> every ARP to be sent to determine the route which tells bonding >> driver the correct VLAN tag to attach to the outgoing ARP packet. But, >> during the VLAN creation when vlan driver puts the underlying >> interface into default vlan and then actual vlan, in-between this if >> bonding driver consults the IP for a route, IP fails to provide a >> correct route and upon which bonding driver drops the ARP packet. ARP >> monitor when it >> comes around next time, sees no ARP response and fails-over to the >> next available slave. Consulting for a IP route, >> ip_route_output(),happens in bond_arp_send_all(). > > bonding works as expected - nothing to fix here. And even as a > workaround/hack - I'm not sure we need that to suppress one failover *only* > when vlan is added on top. > >> Thank U. With *out* this change our systems failed system testing, to consistently be on designated primary interface on *every* single reboot. With this change the behavior was as expected even after a few thousand reboots & System testing could move to next level catching an another bug in sr-iov :). And Without, the outcome was less predictable after a reboot and bonding was on a different slave each time. -Rama >> To prevent this false fail-over, when bonding driver fails to send an >> ARP out it marks in its private structure, bonding{}, not to expect >> an ARP response, when ARP monitor comes around next time ARP sending >> will be tried again. >> >> Extensively tested in a VM environment; sr-iov intf->bonding >> intf->vlan intf. All virtual interfaces created at boot time. >> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote: >On 11/21/2013 3:10 AM, Veaceslav Falico wrote: >> On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote: >>> During the creation of VLAN's atop bonding the underlying interfaces >>> are made part of VLAN's, and at the same bonding driver gets aware >>> that VLAN's exists above it and hence would consult IP routing for >>> every ARP to be sent to determine the route which tells bonding >>> driver the correct VLAN tag to attach to the outgoing ARP packet. But, >>> during the VLAN creation when vlan driver puts the underlying >>> interface into default vlan and then actual vlan, in-between this if >>> bonding driver consults the IP for a route, IP fails to provide a >>> correct route and upon which bonding driver drops the ARP packet. ARP >>> monitor when it >>> comes around next time, sees no ARP response and fails-over to the >>> next available slave. Consulting for a IP route, >>> ip_route_output(),happens in bond_arp_send_all(). >> >> bonding works as expected - nothing to fix here. And even as a >> workaround/hack - I'm not sure we need that to suppress one failover *only* >> when vlan is added on top. >> >>> >Thank U. >With *out* this change our systems failed system testing, to >consistently be on designated primary interface on *every* single >reboot. With this change the behavior was as expected even after a few >thousand reboots & System testing could move to next level catching an >another bug in sr-iov :). And Without, the outcome was less predictable >after a reboot and bonding was on a different slave each time. >-Rama By "designated primary" you mean the bonding primary option, correct? If not, does setting primary resolve the problem? If so, you're saying that during the bringup, bonding would end up with a non-primary slave as the active slave? Or that there would be a failover / failback cycle during the bringup due to the lack of VLAN availability? There is already a mechanism in bond_ab_arp_inspect() to give new slaves a grace period before applying link failures: /* * Give slaves 2*delta after being enslaved or made * active. This avoids bouncing, as the last receive * times need a full ARP monitor cycle to be updated. */ if (bond_time_in_interval(bond, slave->jiffies, 2)) continue; If you extend that grace period (the "2", which is in units of the arp_interval), does the problem resolve itself, or is the time window here longer than that? How is the configuration of bonding and the VLANs taking place? I don't think this patch is suitable (because it can mask legitimate failures), but I'm not entirely sure I understand the details of the problem. Is this simply that the arp_ip_target is specified as a VLAN destination signficantly before (meaning perhaps many seconds of real time) the VLAN is configured above bonding, or is it some kind of race condition in the VLAN code? -J >>> To prevent this false fail-over, when bonding driver fails to send an >>> ARP out it marks in its private structure, bonding{}, not to expect >>> an ARP response, when ARP monitor comes around next time ARP sending >>> will be tried again. >>> >>> Extensively tested in a VM environment; sr-iov intf->bonding >>> intf->vlan intf. All virtual interfaces created at boot time. --- -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/21/2013 1:12 PM, Jay Vosburgh wrote: > rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote: > >> On 11/21/2013 3:10 AM, Veaceslav Falico wrote: >>> On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote: >>>> During the creation of VLAN's atop bonding the underlying interfaces >>>> are made part of VLAN's, and at the same bonding driver gets aware >>>> that VLAN's exists above it and hence would consult IP routing for >>>> every ARP to be sent to determine the route which tells bonding >>>> driver the correct VLAN tag to attach to the outgoing ARP packet. But, >>>> during the VLAN creation when vlan driver puts the underlying >>>> interface into default vlan and then actual vlan, in-between this if >>>> bonding driver consults the IP for a route, IP fails to provide a >>>> correct route and upon which bonding driver drops the ARP packet. ARP >>>> monitor when it >>>> comes around next time, sees no ARP response and fails-over to the >>>> next available slave. Consulting for a IP route, >>>> ip_route_output(),happens in bond_arp_send_all(). >>> >>> bonding works as expected - nothing to fix here. And even as a >>> workaround/hack - I'm not sure we need that to suppress one failover *only* >>> when vlan is added on top. >>> >>>> >> Thank U. >> With *out* this change our systems failed system testing, to >> consistently be on designated primary interface on *every* single >> reboot. With this change the behavior was as expected even after a few >> thousand reboots & System testing could move to next level catching an >> another bug in sr-iov :). And Without, the outcome was less predictable >> after a reboot and bonding was on a different slave each time. >> -Rama > > By "designated primary" you mean the bonding primary option, > correct? Yes correct. Bonding primary param is set. ex: primary=eth1 and primary_reselect=2. Hence it is expected to be on primary on every reboot. -Rama >If not, -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote: >On 11/21/2013 1:12 PM, Jay Vosburgh wrote: >> rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote: >> >>> On 11/21/2013 3:10 AM, Veaceslav Falico wrote: >>>> On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote: >>>>> During the creation of VLAN's atop bonding the underlying interfaces >>>>> are made part of VLAN's, and at the same bonding driver gets aware >>>>> that VLAN's exists above it and hence would consult IP routing for >>>>> every ARP to be sent to determine the route which tells bonding >>>>> driver the correct VLAN tag to attach to the outgoing ARP packet. But, >>>>> during the VLAN creation when vlan driver puts the underlying >>>>> interface into default vlan and then actual vlan, in-between this if >>>>> bonding driver consults the IP for a route, IP fails to provide a >>>>> correct route and upon which bonding driver drops the ARP packet. ARP >>>>> monitor when it >>>>> comes around next time, sees no ARP response and fails-over to the >>>>> next available slave. Consulting for a IP route, >>>>> ip_route_output(),happens in bond_arp_send_all(). >>>> >>>> bonding works as expected - nothing to fix here. And even as a >>>> workaround/hack - I'm not sure we need that to suppress one failover *only* >>>> when vlan is added on top. >>>> >>>>> >>> Thank U. >>> With *out* this change our systems failed system testing, to >>> consistently be on designated primary interface on *every* single >>> reboot. With this change the behavior was as expected even after a few >>> thousand reboots & System testing could move to next level catching an >>> another bug in sr-iov :). And Without, the outcome was less predictable >>> after a reboot and bonding was on a different slave each time. >>> -Rama >> >> By "designated primary" you mean the bonding primary option, >> correct? >Yes correct. Bonding primary param is set. >ex: primary=eth1 and primary_reselect=2. >Hence it is expected to be on primary on every reboot. If I set up a basic bonding configuration like: [ eth3, eth4 ] -> bond0 -> bond0.66, with primary=eth3 primary_reselect=2 Then look at dmesg, I see this sequence: The bond is set up first, with an arp_ip_target on a VLAN destination. The slaves are added to the bond. The VLAN interface is configured above the bond, and brought up. The slaves become link up after autonegotiation, the ARP monitor commences, and eth3 is made the active slave. Even if eth4 is set by the bond to be "link status up," eth3 becomes the active slave when it becomes "link status up." What network device are you using for the slaves? Are they virtualized devices of some kind? My suspicion is that Ethernet autonegotiation either does not take place or occurs so quickly that the slaves are carrier up before the VLAN is even added. Can you check your dmesg output for the sequence of events? In my test, I do not see the slaves go "NIC Link is Up 1000 Mbps Full Duplex" until about 3 seconds after the VLAN interface has been configured. -J --- -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/21/2013 6:43 PM, Jay Vosburgh wrote: > rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote: >> Yes correct. Bonding primary param is set. >> ex: primary=eth1 and primary_reselect=2. >> Hence it is expected to be on primary on every reboot. > > If I set up a basic bonding configuration like: > Extremely thankful to U for investigating it to this extent. Need to admit our test requirement here. Reboot test is done 300 times and requirement is all 300 should see bond intf on primary upon boot. What is observed is, reboot test never could achieve this until we put this change. > [ eth3, eth4 ] -> bond0 -> bond0.66, with primary=eth3 primary_reselect=2 > Our config is complex. Here (please ask if not clear): First niC card Intel 82599 sr-iov PF0 -> eth0 [dom0] VF0 -> eth1 [domU] -> bond0 -> bond0.1 VF0 -> eth2 [domU] -> Second niC card Intel 82599 sr-iov PF0 -> eth1 [dom0] PF = Physical Func VF = Virtual Func [are bond slaves] XEN based OVM [Oracle] 2 arp targets configured. A lan switch connected to *each* PF hosts a vlan arp target. AFAIK sr-iov code (can get clarification from Donald/Keller sr-iov owning team, as we did temp fix of another issue in intel sr-iov driver, so we are connected) addition of vlan's to sr-iov functions is *as* simple adding an entry into a table. That is they don't go thru a re-init like an mtu change. Meaning the intf stays UP during that time. *But* takes a little while as this table addition is done by PF driver in dom0 upon request by domU VF driver as there is MBox communication. > Then look at dmesg, I see this sequence: > > The bond is set up first, with an arp_ip_target on a VLAN > destination. The slaves are added to the bond. > > The VLAN interface is configured above the bond, and brought up. > > The slaves become link up after autonegotiation, the ARP monitor > commences, and eth3 is made the active slave. Even if eth4 is set by > the bond to be "link status up," eth3 becomes the active slave when it > becomes "link status up." > > What network device are you using for the slaves? Are they > virtualized devices of some kind? My suspicion is that Ethernet > autonegotiation either does not take place or occurs so quickly that the > slaves are carrier up before the VLAN is even added. > That is so correct of a guess. We never saw a link loss message, neither nic driver nor bonding reporting during vlan addition. Because as carrier is up bond has a curr_active_slave hence tries to arp but fails as ip_route_output() fails. > Can you check your dmesg output for the sequence of events? In > my test, I do not see the slaves go "NIC Link is Up 1000 Mbps Full > Duplex" until about 3 seconds after the VLAN interface has been > configured. > so it means bonding driver would not have had a curr_active_slave for almost 3 secs as none slave would have qualified to become so, hence bonding never even would attempted to send an arp out. What we observed is "no route to arp_ip_target" message. We see 10 of them (5 pairs, as there 2 arp targets and arp interval is 200 msecs) lasting for a 1 sec, then no more error messages. With out this fix, we would see that bonding failover messages in between those messages. Eventually when arp'ing is working bonding would be on some interface, need not be primary. With this fix, we would not see those bonding failover messages, and when arp'ing is working bond is on primary. Requirement of this product being built is to achieve an extremely quick failover, hence 200 msecs arp interval. > > -J > > --- > -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index dde6b4a..d475161 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2661,7 +2661,7 @@ static int bond_has_this_ip(struct bonding *bond, __be32 ip) * switches in VLAN mode (especially if ports are configured as * "native" to a VLAN) might not pass non-tagged frames. */ -static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short vlan_id) +static void bond_arp_send(struct bonding *bond, struct net_device *slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short vlan_id) { struct sk_buff *skb; @@ -2683,6 +2683,7 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_ } } arp_xmit(skb); + bond->arp_sent=true; } @@ -2700,7 +2701,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) pr_debug("basa: target %x\n", targets[i]); if (!bond->vlgrp) { pr_debug("basa: empty vlan: arp_send\n"); - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], + bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], bond->master_ip, 0); continue; } @@ -2726,7 +2727,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) if (rt->dst.dev == bond->dev) { ip_rt_put(rt); pr_debug("basa: rtdev == bond->dev: arp_send\n"); - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], + bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], bond->master_ip, 0); continue; } @@ -2744,7 +2745,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) if (vlan_id) { ip_rt_put(rt); - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], + bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i], vlan->vlan_ip, vlan_id); continue; } @@ -3206,7 +3207,7 @@ void bond_activebackup_arp_mon(struct work_struct *work) should_notify_peers = bond_should_notify_peers(bond); - if (bond_ab_arp_inspect(bond, delta_in_ticks)) { + if (bond->arp_sent && bond_ab_arp_inspect(bond, delta_in_ticks)) { read_unlock(&bond->lock); rtnl_lock(); read_lock(&bond->lock); @@ -3218,6 +3219,7 @@ void bond_activebackup_arp_mon(struct work_struct *work) read_lock(&bond->lock); } + bond->arp_sent=false; bond_ab_arp_probe(bond); re_arm: @@ -4425,6 +4427,7 @@ static void bond_setup(struct net_device *bond_dev) bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM); bond_dev->features |= bond_dev->hw_features; + bond->arp_sent=false; } static void bond_work_cancel_all(struct bonding *bond) diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index e9a3c56..3878bbd 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -253,6 +253,7 @@ struct bonding { /* debugging suport via debugfs */ struct dentry *debug_dir; #endif /* CONFIG_DEBUG_FS */ + bool arp_sent; }; #define bond_slave_get_rcu(dev) \ -- 1.8.2.1