Message ID | 1433990233-958-2-git-send-email-gospo@cumulusnetworks.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek <gospo@cumulusnetworks.com> wrote: > @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force) > dead++; > else if (nexthop_nh->nh_dev == dev && > nexthop_nh->nh_scope != scope) { > - nexthop_nh->nh_flags |= RTNH_F_DEAD; > + switch (event) { > + case NETDEV_DOWN: > + case NETDEV_UNREGISTER: > + nexthop_nh->nh_flags |= RTNH_F_DEAD; > + /* fall through */ > + case NETDEV_CHANGE: > + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; > + break; > + } > #ifdef CONFIG_IP_ROUTE_MULTIPATH > spin_lock_bh(&fib_multipath_lock); > fi->fib_power -= nexthop_nh->nh_power; > @@ -1139,14 +1160,22 @@ int fib_sync_down_dev(struct net_device *dev, int force) > dead++; > } > #ifdef CONFIG_IP_ROUTE_MULTIPATH > - if (force > 1 && nexthop_nh->nh_dev == dev) { > + if (event == NETDEV_UNREGISTER && nexthop_nh->nh_dev == dev) { > dead = fi->fib_nhs; > break; > } > #endif > } endfor_nexthops(fi) > if (dead == fi->fib_nhs) { > - fi->fib_flags |= RTNH_F_DEAD; > + switch (event) { > + case NETDEV_DOWN: > + case NETDEV_UNREGISTER: > + fi->fib_flags |= RTNH_F_DEAD; > + /* fall through */ > + case NETDEV_CHANGE: > + fi->fib_flags |= RTNH_F_LINKDOWN; RTNH_F_LINKDOWN is to mark linkdown nexthop devs....why is the route fi being marked RTNH_F_LINKDOWN? The RTNH_F_LINKDOWN comment says: #define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ It's a per-nh flag, not per-route flag, correct? Can you show an ECMP example with only a subset of the nexthops dev linkdowned? Show the ip route output after going thru some link down/up events on some of the nexthops devs. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 10, 2015 at 07:53:59PM -0700, Scott Feldman wrote: > On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek > <gospo@cumulusnetworks.com> wrote: > > > @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force) > > dead++; > > else if (nexthop_nh->nh_dev == dev && > > nexthop_nh->nh_scope != scope) { > > - nexthop_nh->nh_flags |= RTNH_F_DEAD; > > + switch (event) { > > + case NETDEV_DOWN: > > + case NETDEV_UNREGISTER: > > + nexthop_nh->nh_flags |= RTNH_F_DEAD; > > + /* fall through */ > > + case NETDEV_CHANGE: > > + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; > > + break; > > + } > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > spin_lock_bh(&fib_multipath_lock); > > fi->fib_power -= nexthop_nh->nh_power; > > @@ -1139,14 +1160,22 @@ int fib_sync_down_dev(struct net_device *dev, int force) > > dead++; > > } > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > - if (force > 1 && nexthop_nh->nh_dev == dev) { > > + if (event == NETDEV_UNREGISTER && nexthop_nh->nh_dev == dev) { > > dead = fi->fib_nhs; > > break; > > } > > #endif > > } endfor_nexthops(fi) > > if (dead == fi->fib_nhs) { > > - fi->fib_flags |= RTNH_F_DEAD; > > + switch (event) { > > + case NETDEV_DOWN: > > + case NETDEV_UNREGISTER: > > + fi->fib_flags |= RTNH_F_DEAD; > > + /* fall through */ > > + case NETDEV_CHANGE: > > + fi->fib_flags |= RTNH_F_LINKDOWN; > > RTNH_F_LINKDOWN is to mark linkdown nexthop devs....why is the route > fi being marked RTNH_F_LINKDOWN? > > The RTNH_F_LINKDOWN comment says: > > #define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ This is done with the dead flag already. I'm actually following the precedent already set there. > It's a per-nh flag, not per-route flag, correct? > > Can you show an ECMP example with only a subset of the nexthops dev > linkdowned? Show the ip route output after going thru some link > down/up events on some of the nexthops devs. Sure! This is exactly what I've been using for testing. # ip route show 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 90.0.0.0/24 via 70.0.0.2 dev p7p1 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 10 100.0.0.0/24 nexthop via 70.0.0.2 dev p7p1 weight 1 nexthop via 80.0.0.2 dev p8p1 weight 1 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 # # take p8p1 link down # ip route show 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 10 dead linkdown 100.0.0.0/24 nexthop via 70.0.0.2 dev p7p1 weight 1 nexthop via 80.0.0.2 dev p8p1 weight 1 dead linkdown 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 # ip route get 100.0.0.2 100.0.0.2 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache # ip route get 100.0.0.2 100.0.0.2 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache # # take p8p1 link up # ip route show 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 90.0.0.0/24 via 70.0.0.2 dev p7p1 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 10 100.0.0.0/24 nexthop via 70.0.0.2 dev p7p1 weight 1 nexthop via 80.0.0.2 dev p8p1 weight 1 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 # ip route show 100.0.0.2 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache # ip route get 100.0.0.2 100.0.0.2 via 80.0.0.2 dev p8p1 src 80.0.0.1 cache # ip route get 100.0.0.2 100.0.0.2 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache # ip route get 100.0.0.2 100.0.0.2 via 80.0.0.2 dev p8p1 src 80.0.0.1 cache # # you can see the round robin happening # # take all ports p8p1 and p7p1 down # ip route show 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 dead linkdown 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 dead linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 10 dead linkdown 100.0.0.0/24 nexthop via 70.0.0.2 dev p7p1 weight 1 dead linkdown nexthop via 80.0.0.2 dev p8p1 weight 1 dead linkdown 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 # ip route get 100.0.0.2 RTNETLINK answers: Network is unreachable # ip route get 80.0.0.2 RTNETLINK answers: Network is unreachable # ip route get 80.0.0.1 local 80.0.0.1 dev lo src 80.0.0.1 cache <local> # ip route get 70.0.0.1 local 70.0.0.1 dev lo src 70.0.0.1 cache <local> # # local addrs are still reachable -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 10, 2015 at 8:28 PM, Andy Gospodarek <gospo@cumulusnetworks.com> wrote: > On Wed, Jun 10, 2015 at 07:53:59PM -0700, Scott Feldman wrote: >> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek >> <gospo@cumulusnetworks.com> wrote: >> >> > @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force) >> > dead++; >> > else if (nexthop_nh->nh_dev == dev && >> > nexthop_nh->nh_scope != scope) { >> > - nexthop_nh->nh_flags |= RTNH_F_DEAD; >> > + switch (event) { >> > + case NETDEV_DOWN: >> > + case NETDEV_UNREGISTER: >> > + nexthop_nh->nh_flags |= RTNH_F_DEAD; >> > + /* fall through */ >> > + case NETDEV_CHANGE: >> > + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; >> > + break; >> > + } >> > #ifdef CONFIG_IP_ROUTE_MULTIPATH >> > spin_lock_bh(&fib_multipath_lock); >> > fi->fib_power -= nexthop_nh->nh_power; >> > @@ -1139,14 +1160,22 @@ int fib_sync_down_dev(struct net_device *dev, int force) >> > dead++; >> > } >> > #ifdef CONFIG_IP_ROUTE_MULTIPATH >> > - if (force > 1 && nexthop_nh->nh_dev == dev) { >> > + if (event == NETDEV_UNREGISTER && nexthop_nh->nh_dev == dev) { >> > dead = fi->fib_nhs; >> > break; >> > } >> > #endif >> > } endfor_nexthops(fi) >> > if (dead == fi->fib_nhs) { >> > - fi->fib_flags |= RTNH_F_DEAD; >> > + switch (event) { >> > + case NETDEV_DOWN: >> > + case NETDEV_UNREGISTER: >> > + fi->fib_flags |= RTNH_F_DEAD; >> > + /* fall through */ >> > + case NETDEV_CHANGE: >> > + fi->fib_flags |= RTNH_F_LINKDOWN; >> >> RTNH_F_LINKDOWN is to mark linkdown nexthop devs....why is the route >> fi being marked RTNH_F_LINKDOWN? >> >> The RTNH_F_LINKDOWN comment says: >> >> #define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ > > This is done with the dead flag already. I'm actually following the > precedent already set there. > >> It's a per-nh flag, not per-route flag, correct? >> >> Can you show an ECMP example with only a subset of the nexthops dev >> linkdowned? Show the ip route output after going thru some link >> down/up events on some of the nexthops devs. > > Sure! This is exactly what I've been using for testing. > > # ip route show > 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 > 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 > 90.0.0.0/24 via 70.0.0.2 dev p7p1 > 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 10 > 100.0.0.0/24 > nexthop via 70.0.0.2 dev p7p1 weight 1 > nexthop via 80.0.0.2 dev p8p1 weight 1 > 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 > # # take p8p1 link down > # ip route show > 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 > 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown > 90.0.0.0/24 via 70.0.0.2 dev p7p1 > 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 10 dead linkdown > 100.0.0.0/24 > nexthop via 70.0.0.2 dev p7p1 weight 1 > nexthop via 80.0.0.2 dev p8p1 weight 1 dead linkdown > 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 > # ip route get 100.0.0.2 > 100.0.0.2 via 70.0.0.2 dev p7p1 src 70.0.0.1 > cache > # ip route get 100.0.0.2 > 100.0.0.2 via 70.0.0.2 dev p7p1 src 70.0.0.1 > cache > # # take p8p1 link up > # ip route show > 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 > 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 > 90.0.0.0/24 via 70.0.0.2 dev p7p1 > 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 10 > 100.0.0.0/24 > nexthop via 70.0.0.2 dev p7p1 weight 1 > nexthop via 80.0.0.2 dev p8p1 weight 1 > 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 > # ip route show > 100.0.0.2 via 70.0.0.2 dev p7p1 src 70.0.0.1 > cache > # ip route get 100.0.0.2 > 100.0.0.2 via 80.0.0.2 dev p8p1 src 80.0.0.1 > cache > # ip route get 100.0.0.2 > 100.0.0.2 via 70.0.0.2 dev p7p1 src 70.0.0.1 > cache > # ip route get 100.0.0.2 > 100.0.0.2 via 80.0.0.2 dev p8p1 src 80.0.0.1 > cache > # # you can see the round robin happening > # # take all ports p8p1 and p7p1 down > # ip route show > 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 dead linkdown > 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown > 90.0.0.0/24 via 70.0.0.2 dev p7p1 dead linkdown > 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 10 dead linkdown > 100.0.0.0/24 > nexthop via 70.0.0.2 dev p7p1 weight 1 dead linkdown > nexthop via 80.0.0.2 dev p8p1 weight 1 dead linkdown > 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 > # ip route get 100.0.0.2 > RTNETLINK answers: Network is unreachable > # ip route get 80.0.0.2 > RTNETLINK answers: Network is unreachable > # ip route get 80.0.0.1 > local 80.0.0.1 dev lo src 80.0.0.1 > cache <local> > # ip route get 70.0.0.1 > local 70.0.0.1 dev lo src 70.0.0.1 > cache <local> > # # local addrs are still reachable Perfect, looks good, thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek <gospo@cumulusnetworks.com> wrote: > Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are > reachable via an interface where carrier is off. No action is taken, > but additional flags are passed to userspace to indicate carrier status. Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar and I'm wondering if this could be done without introducing a new flag and just use RTNH_F_DEAD. The link change event would set RTNH_F_DEAD on nh on dev link down, and clear on link up. The sysctl knob would be something like "nexthop_dead_on_linkdown", default off. So basically expanding the ways RTNH_F_DEAD can be set. That would simplify the patch set quite a bit and require no changes to iproute2. -scott -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote: > On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek > <gospo@cumulusnetworks.com> wrote: > > Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are > > reachable via an interface where carrier is off. No action is taken, > > but additional flags are passed to userspace to indicate carrier status. > > Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar > and I'm wondering if this could be done without introducing a new flag > and just use RTNH_F_DEAD. The link change event would set RTNH_F_DEAD > on nh on dev link down, and clear on link up. The sysctl knob would > be something like "nexthop_dead_on_linkdown", default off. So > basically expanding the ways RTNH_F_DEAD can be set. That would > simplify the patch set quite a bit and require no changes to iproute2. > You are absolutely correct that what you describe would be less churn to userspace. From a functionality standpoint that is close to what was originally proposed, but Alex specifically did not like the behavioral change to what having RTNH_F_DEAD set (at least that was what I understood). That was what made me make the move to add this additional flag that was exported to userspace, so it was possible to differentiate the old dead routes/nexthop functionality from those that were not going to be dead due to link being down. At this point I think I prefer the additional data provided by the new flag exported to userspace. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jun 11, 2015 at 4:23 AM, Andy Gospodarek <gospo@cumulusnetworks.com> wrote: > On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote: >> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek >> <gospo@cumulusnetworks.com> wrote: >> > Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are >> > reachable via an interface where carrier is off. No action is taken, >> > but additional flags are passed to userspace to indicate carrier status. >> >> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar >> and I'm wondering if this could be done without introducing a new flag >> and just use RTNH_F_DEAD. The link change event would set RTNH_F_DEAD >> on nh on dev link down, and clear on link up. The sysctl knob would >> be something like "nexthop_dead_on_linkdown", default off. So >> basically expanding the ways RTNH_F_DEAD can be set. That would >> simplify the patch set quite a bit and require no changes to iproute2. >> > > You are absolutely correct that what you describe would be less churn to > userspace. From a functionality standpoint that is close to what was > originally proposed, but Alex specifically did not like the behavioral > change to what having RTNH_F_DEAD set (at least that was what I > understood). > > That was what made me make the move to add this additional flag that was > exported to userspace, so it was possible to differentiate the old dead > routes/nexthop functionality from those that were not going to be dead > due to link being down. Why does user space need to know _why_ a nh is dead? User space already knows the state (admin/link) of the nh dev. I not seeing why user space needs to differentiate why nh is dead. The kernel only needs to know if nh is dead to exclude nh from ecmp selection. Same for an offload device. Can you explain how this new flag provides user space more information than what's already available from RTM_NEWLINK notifications? > At this point I think I prefer the additional data provided by the new > flag exported to userspace. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/11/2015 04:23 AM, Andy Gospodarek wrote: > On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote: >> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek >> <gospo@cumulusnetworks.com> wrote: >>> Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are >>> reachable via an interface where carrier is off. No action is taken, >>> but additional flags are passed to userspace to indicate carrier status. >> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar >> and I'm wondering if this could be done without introducing a new flag >> and just use RTNH_F_DEAD. The link change event would set RTNH_F_DEAD >> on nh on dev link down, and clear on link up. The sysctl knob would >> be something like "nexthop_dead_on_linkdown", default off. So >> basically expanding the ways RTNH_F_DEAD can be set. That would >> simplify the patch set quite a bit and require no changes to iproute2. >> > You are absolutely correct that what you describe would be less churn to > userspace. From a functionality standpoint that is close to what was > originally proposed, but Alex specifically did not like the behavioral > change to what having RTNH_F_DEAD set (at least that was what I > understood). > > That was what made me make the move to add this additional flag that was > exported to userspace, so it was possible to differentiate the old dead > routes/nexthop functionality from those that were not going to be dead > due to link being down. > this point I think I prefer the additional data provided by the new > flag exported to userspace. I preferred the 2 flag solution as the original solution still required 2 flags, it just only exposed 1 to user-space. As a result it was much more error prone since it was fairly easy to get into a confused state about why the link was dead. With the 2 flag solution it becomes much easier to sort out why the route is not functional and it is much easier to isolate for things like the sysctl which only disables the use of LINKDOWN and not DEAD. - Alex -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jun 11, 2015 at 7:50 AM, Alexander Duyck <alexander.h.duyck@redhat.com> wrote: > > > On 06/11/2015 04:23 AM, Andy Gospodarek wrote: >> >> On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote: >>> >>> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek >>> <gospo@cumulusnetworks.com> wrote: >>>> >>>> Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are >>>> reachable via an interface where carrier is off. No action is taken, >>>> but additional flags are passed to userspace to indicate carrier status. >>> >>> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar >>> and I'm wondering if this could be done without introducing a new flag >>> and just use RTNH_F_DEAD. The link change event would set RTNH_F_DEAD >>> on nh on dev link down, and clear on link up. The sysctl knob would >>> be something like "nexthop_dead_on_linkdown", default off. So >>> basically expanding the ways RTNH_F_DEAD can be set. That would >>> simplify the patch set quite a bit and require no changes to iproute2. >>> >> You are absolutely correct that what you describe would be less churn to >> userspace. From a functionality standpoint that is close to what was >> originally proposed, but Alex specifically did not like the behavioral >> change to what having RTNH_F_DEAD set (at least that was what I >> understood). >> >> That was what made me make the move to add this additional flag that was >> exported to userspace, so it was possible to differentiate the old dead >> routes/nexthop functionality from those that were not going to be dead >> due to link being down. >> this point I think I prefer the additional data provided by the new >> flag exported to userspace. > > > I preferred the 2 flag solution as the original solution still required 2 > flags, it just only exposed 1 to user-space. As a result it was much more > error prone since it was fairly easy to get into a confused state about why > the link was dead. > > With the 2 flag solution it becomes much easier to sort out why the route is > not functional and it is much easier to isolate for things like the sysctl > which only disables the use of LINKDOWN and not DEAD. Ok, for the user troubleshooting, that make sense. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Yes, this is what I liked about the 2 flag solution too compared to the original. Dinesh On Thu, Jun 11, 2015 at 7:50 AM, Alexander Duyck <alexander.h.duyck@redhat.com> wrote: > > > On 06/11/2015 04:23 AM, Andy Gospodarek wrote: >> >> On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote: >>> >>> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek >>> <gospo@cumulusnetworks.com> wrote: >>>> >>>> Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are >>>> reachable via an interface where carrier is off. No action is taken, >>>> but additional flags are passed to userspace to indicate carrier status. >>> >>> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar >>> and I'm wondering if this could be done without introducing a new flag >>> and just use RTNH_F_DEAD. The link change event would set RTNH_F_DEAD >>> on nh on dev link down, and clear on link up. The sysctl knob would >>> be something like "nexthop_dead_on_linkdown", default off. So >>> basically expanding the ways RTNH_F_DEAD can be set. That would >>> simplify the patch set quite a bit and require no changes to iproute2. >>> >> You are absolutely correct that what you describe would be less churn to >> userspace. From a functionality standpoint that is close to what was >> originally proposed, but Alex specifically did not like the behavioral >> change to what having RTNH_F_DEAD set (at least that was what I >> understood). >> >> That was what made me make the move to add this additional flag that was >> exported to userspace, so it was possible to differentiate the old dead >> routes/nexthop functionality from those that were not going to be dead >> due to link being down. >> this point I think I prefer the additional data provided by the new >> flag exported to userspace. > > > I preferred the 2 flag solution as the original solution still required 2 > flags, it just only exposed 1 to user-space. As a result it was much more > error prone since it was fairly easy to get into a confused state about why > the link was dead. > > With the 2 flag solution it becomes much easier to sort out why the route is > not functional and it is much easier to isolate for things like the sysctl > which only disables the use of LINKDOWN and not DEAD. > > - Alex -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 54271ed..f73d27c 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -305,9 +305,9 @@ void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); -int fib_sync_down_dev(struct net_device *dev, int force); +int fib_sync_down_dev(struct net_device *dev, unsigned long event); int fib_sync_down_addr(struct net *net, __be32 local); -int fib_sync_up(struct net_device *dev); +int fib_sync_up(struct net_device *dev, unsigned int nh_flags); void fib_select_multipath(struct fib_result *res); /* Exported by fib_trie.c */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 17fb02f..8ab874a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -338,6 +338,9 @@ struct rtnexthop { #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ #define RTNH_F_OFFLOAD 8 /* offloaded route */ +#define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ + +#define RTNH_F_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN) /* used as mask for route comparisons */ /* Macros to handle hexthops */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 872494e..872defb 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1063,9 +1063,9 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, unsigned long event) { - if (fib_sync_down_dev(dev, force)) + if (fib_sync_down_dev(dev, event)) fib_flush(dev_net(dev)); rt_cache_flush(dev_net(dev)); arp_ifdown(dev); @@ -1081,7 +1081,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); @@ -1093,7 +1093,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, event); } else { rt_cache_flush(dev_net(dev)); } @@ -1107,9 +1107,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; struct net *net = dev_net(dev); + unsigned flags; if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, event); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1124,16 +1125,21 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_add_ifaddr(ifa); } endfor_ifa(in_dev); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, event); break; - case NETDEV_CHANGEMTU: case NETDEV_CHANGE: + flags = dev_get_flags(dev); + if (flags & (IFF_RUNNING|IFF_LOWER_UP)) + fib_sync_up(dev, RTNH_F_LINKDOWN); + else + fib_sync_down_dev(dev, event); + case NETDEV_CHANGEMTU: rt_cache_flush(net); break; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 28ec3c1..496507f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -266,7 +266,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + ((nh->nh_flags ^ onh->nh_flags) & ~(RTNH_F_COMPARE_MASK))) return -1; onh++; } endfor_nexthops(fi); @@ -318,7 +318,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && - ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + ((nfi->fib_flags ^ fi->fib_flags) & ~(RTNH_F_COMPARE_MASK)) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -604,6 +604,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; @@ -636,6 +638,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (!dev) goto out; dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; } else { struct in_device *in_dev; @@ -654,6 +658,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->nh_dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = 0; } out: @@ -920,11 +926,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!nh->nh_dev) goto failure; } else { + int linkdown = 0; change_nexthops(fi) { err = fib_check_nh(cfg, fi, nexthop_nh); if (err != 0) goto failure; + if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + linkdown++; } endfor_nexthops(fi) + if (linkdown == fi->fib_nhs) { + fi->fib_flags |= RTNH_F_LINKDOWN; + } } if (fi->fib_prefsrc) { @@ -1103,7 +1115,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, int force) +int fib_sync_down_dev(struct net_device *dev, unsigned long event) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1112,7 +1124,8 @@ int fib_sync_down_dev(struct net_device *dev, int force) struct hlist_head *head = &fib_info_devhash[hash]; struct fib_nh *nh; - if (force) + if (event == NETDEV_UNREGISTER || + event == NETDEV_DOWN) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { - nexthop_nh->nh_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_nh->nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + break; + } #ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); fi->fib_power -= nexthop_nh->nh_power; @@ -1139,14 +1160,22 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nexthop_nh->nh_dev == dev) { + if (event == NETDEV_UNREGISTER && nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { - fi->fib_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + fi->fib_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + fi->fib_flags |= RTNH_F_LINKDOWN; + break; + } ret++; } } @@ -1210,13 +1239,11 @@ out: return; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev) +int fib_sync_up(struct net_device *dev, unsigned int nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1243,7 +1270,7 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & nh_flags)) { alive++; continue; } @@ -1254,14 +1281,18 @@ int fib_sync_up(struct net_device *dev) !__in_dev_get_rtnl(dev)) continue; alive++; +#ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~RTNH_F_DEAD; + nexthop_nh->nh_flags &= ~nh_flags; spin_unlock_bh(&fib_multipath_lock); +#else + nexthop_nh->nh_flags &= ~nh_flags; +#endif } endfor_nexthops(fi) if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; + fi->fib_flags &= ~nh_flags; ret++; } } @@ -1269,6 +1300,8 @@ int fib_sync_up(struct net_device *dev) return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* * The algorithm is suboptimal, but it provides really * fair weighted route distribution.