diff mbox

[net-next,1/3,v3] net: track link-status of ipv4 nexthops

Message ID 1433990233-958-2-git-send-email-gospo@cumulusnetworks.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Andy Gospodarek June 11, 2015, 2:37 a.m. UTC
Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are
reachable via an interface where carrier is off.  No action is taken,
but additional flags are passed to userspace to indicate carrier status.

This also includes a cleanup to fib_disable_ip to more clearly indicate
what event made the function call to replace the more cryptic force
option previously used.

v2: Split out kernel functionality into 2 patches, this patch simply sets and
clears new nexthop flag RTNH_F_LINKDOWN.

v3: Cleanups suggested by Alex as well as a bug noticed in
fib_sync_down_dev and fib_sync_up when multipath was not enabled.

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
---
 include/net/ip_fib.h           |  4 +--
 include/uapi/linux/rtnetlink.h |  3 +++
 net/ipv4/fib_frontend.c        | 22 ++++++++++------
 net/ipv4/fib_semantics.c       | 59 ++++++++++++++++++++++++++++++++----------
 4 files changed, 65 insertions(+), 23 deletions(-)

Comments

Scott Feldman June 11, 2015, 2:53 a.m. UTC | #1
On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
<gospo@cumulusnetworks.com> wrote:

> @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force)
>                                 dead++;
>                         else if (nexthop_nh->nh_dev == dev &&
>                                  nexthop_nh->nh_scope != scope) {
> -                               nexthop_nh->nh_flags |= RTNH_F_DEAD;
> +                               switch (event) {
> +                               case NETDEV_DOWN:
> +                               case NETDEV_UNREGISTER:
> +                                       nexthop_nh->nh_flags |= RTNH_F_DEAD;
> +                                       /* fall through */
> +                               case NETDEV_CHANGE:
> +                                       nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
> +                                       break;
> +                               }
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>                                 spin_lock_bh(&fib_multipath_lock);
>                                 fi->fib_power -= nexthop_nh->nh_power;
> @@ -1139,14 +1160,22 @@ int fib_sync_down_dev(struct net_device *dev, int force)
>                                 dead++;
>                         }
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -                       if (force > 1 && nexthop_nh->nh_dev == dev) {
> +                       if (event == NETDEV_UNREGISTER && nexthop_nh->nh_dev == dev) {
>                                 dead = fi->fib_nhs;
>                                 break;
>                         }
>  #endif
>                 } endfor_nexthops(fi)
>                 if (dead == fi->fib_nhs) {
> -                       fi->fib_flags |= RTNH_F_DEAD;
> +                       switch (event) {
> +                       case NETDEV_DOWN:
> +                       case NETDEV_UNREGISTER:
> +                               fi->fib_flags |= RTNH_F_DEAD;
> +                               /* fall through */
> +                       case NETDEV_CHANGE:
> +                               fi->fib_flags |= RTNH_F_LINKDOWN;

RTNH_F_LINKDOWN is to mark linkdown nexthop devs....why is the route
fi being marked RTNH_F_LINKDOWN?

The RTNH_F_LINKDOWN comment says:

#define RTNH_F_LINKDOWN                16      /* carrier-down on nexthop */

It's a per-nh flag, not per-route flag, correct?

Can you show an ECMP example with only a subset of the nexthops dev
linkdowned?  Show the ip route output after going thru some link
down/up events on some of the nexthops devs.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Gospodarek June 11, 2015, 3:28 a.m. UTC | #2
On Wed, Jun 10, 2015 at 07:53:59PM -0700, Scott Feldman wrote:
> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
> <gospo@cumulusnetworks.com> wrote:
> 
> > @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force)
> >                                 dead++;
> >                         else if (nexthop_nh->nh_dev == dev &&
> >                                  nexthop_nh->nh_scope != scope) {
> > -                               nexthop_nh->nh_flags |= RTNH_F_DEAD;
> > +                               switch (event) {
> > +                               case NETDEV_DOWN:
> > +                               case NETDEV_UNREGISTER:
> > +                                       nexthop_nh->nh_flags |= RTNH_F_DEAD;
> > +                                       /* fall through */
> > +                               case NETDEV_CHANGE:
> > +                                       nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
> > +                                       break;
> > +                               }
> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >                                 spin_lock_bh(&fib_multipath_lock);
> >                                 fi->fib_power -= nexthop_nh->nh_power;
> > @@ -1139,14 +1160,22 @@ int fib_sync_down_dev(struct net_device *dev, int force)
> >                                 dead++;
> >                         }
> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> > -                       if (force > 1 && nexthop_nh->nh_dev == dev) {
> > +                       if (event == NETDEV_UNREGISTER && nexthop_nh->nh_dev == dev) {
> >                                 dead = fi->fib_nhs;
> >                                 break;
> >                         }
> >  #endif
> >                 } endfor_nexthops(fi)
> >                 if (dead == fi->fib_nhs) {
> > -                       fi->fib_flags |= RTNH_F_DEAD;
> > +                       switch (event) {
> > +                       case NETDEV_DOWN:
> > +                       case NETDEV_UNREGISTER:
> > +                               fi->fib_flags |= RTNH_F_DEAD;
> > +                               /* fall through */
> > +                       case NETDEV_CHANGE:
> > +                               fi->fib_flags |= RTNH_F_LINKDOWN;
> 
> RTNH_F_LINKDOWN is to mark linkdown nexthop devs....why is the route
> fi being marked RTNH_F_LINKDOWN?
> 
> The RTNH_F_LINKDOWN comment says:
> 
> #define RTNH_F_LINKDOWN                16      /* carrier-down on nexthop */

This is done with the dead flag already.  I'm actually following the
precedent already set there.

> It's a per-nh flag, not per-route flag, correct?
> 
> Can you show an ECMP example with only a subset of the nexthops dev
> linkdowned?  Show the ip route output after going thru some link
> down/up events on some of the nexthops devs.

Sure!  This is exactly what I've been using for testing.

# ip route show
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1
90.0.0.0/24 via 70.0.0.2 dev p7p1
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 10
100.0.0.0/24
	nexthop via 70.0.0.2  dev p7p1 weight 1
	nexthop via 80.0.0.2  dev p8p1 weight 1
192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2 
# # take p8p1 link down
# ip route show 
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1 
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 dead linkdown 
90.0.0.0/24 via 70.0.0.2 dev p7p1 
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 10 dead linkdown 
100.0.0.0/24 
	nexthop via 70.0.0.2  dev p7p1 weight 1
	nexthop via 80.0.0.2  dev p8p1 weight 1 dead linkdown
192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2 
# ip route get 100.0.0.2 
100.0.0.2 via 70.0.0.2 dev p7p1  src 70.0.0.1 
    cache 
# ip route get 100.0.0.2 
100.0.0.2 via 70.0.0.2 dev p7p1  src 70.0.0.1 
    cache 
# # take p8p1 link up
# ip route show
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1 
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1
90.0.0.0/24 via 70.0.0.2 dev p7p1 
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 10 
100.0.0.0/24 
	nexthop via 70.0.0.2  dev p7p1 weight 1
	nexthop via 80.0.0.2  dev p8p1 weight 1
192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2 
# ip route show 
100.0.0.2 via 70.0.0.2 dev p7p1  src 70.0.0.1 
    cache 
# ip route get 100.0.0.2 
100.0.0.2 via 80.0.0.2 dev p8p1  src 80.0.0.1 
    cache 
# ip route get 100.0.0.2 
100.0.0.2 via 70.0.0.2 dev p7p1  src 70.0.0.1 
    cache 
# ip route get 100.0.0.2 
100.0.0.2 via 80.0.0.2 dev p8p1  src 80.0.0.1 
    cache 
# # you can see the round robin happening
# # take all ports p8p1 and p7p1 down
# ip route show
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1 dead linkdown 
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 dead linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1 dead linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 10 dead linkdown
100.0.0.0/24
	nexthop via 70.0.0.2  dev p7p1 weight 1 dead linkdown
	nexthop via 80.0.0.2  dev p8p1 weight 1 dead linkdown
192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2 
# ip route get 100.0.0.2 
RTNETLINK answers: Network is unreachable
# ip route get 80.0.0.2 
RTNETLINK answers: Network is unreachable
# ip route get 80.0.0.1
local 80.0.0.1 dev lo  src 80.0.0.1 
    cache <local> 
# ip route get 70.0.0.1
local 70.0.0.1 dev lo  src 70.0.0.1 
    cache <local> 
# # local addrs are still reachable 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman June 11, 2015, 4:14 a.m. UTC | #3
On Wed, Jun 10, 2015 at 8:28 PM, Andy Gospodarek
<gospo@cumulusnetworks.com> wrote:
> On Wed, Jun 10, 2015 at 07:53:59PM -0700, Scott Feldman wrote:
>> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
>> <gospo@cumulusnetworks.com> wrote:
>>
>> > @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force)
>> >                                 dead++;
>> >                         else if (nexthop_nh->nh_dev == dev &&
>> >                                  nexthop_nh->nh_scope != scope) {
>> > -                               nexthop_nh->nh_flags |= RTNH_F_DEAD;
>> > +                               switch (event) {
>> > +                               case NETDEV_DOWN:
>> > +                               case NETDEV_UNREGISTER:
>> > +                                       nexthop_nh->nh_flags |= RTNH_F_DEAD;
>> > +                                       /* fall through */
>> > +                               case NETDEV_CHANGE:
>> > +                                       nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
>> > +                                       break;
>> > +                               }
>> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> >                                 spin_lock_bh(&fib_multipath_lock);
>> >                                 fi->fib_power -= nexthop_nh->nh_power;
>> > @@ -1139,14 +1160,22 @@ int fib_sync_down_dev(struct net_device *dev, int force)
>> >                                 dead++;
>> >                         }
>> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> > -                       if (force > 1 && nexthop_nh->nh_dev == dev) {
>> > +                       if (event == NETDEV_UNREGISTER && nexthop_nh->nh_dev == dev) {
>> >                                 dead = fi->fib_nhs;
>> >                                 break;
>> >                         }
>> >  #endif
>> >                 } endfor_nexthops(fi)
>> >                 if (dead == fi->fib_nhs) {
>> > -                       fi->fib_flags |= RTNH_F_DEAD;
>> > +                       switch (event) {
>> > +                       case NETDEV_DOWN:
>> > +                       case NETDEV_UNREGISTER:
>> > +                               fi->fib_flags |= RTNH_F_DEAD;
>> > +                               /* fall through */
>> > +                       case NETDEV_CHANGE:
>> > +                               fi->fib_flags |= RTNH_F_LINKDOWN;
>>
>> RTNH_F_LINKDOWN is to mark linkdown nexthop devs....why is the route
>> fi being marked RTNH_F_LINKDOWN?
>>
>> The RTNH_F_LINKDOWN comment says:
>>
>> #define RTNH_F_LINKDOWN                16      /* carrier-down on nexthop */
>
> This is done with the dead flag already.  I'm actually following the
> precedent already set there.
>
>> It's a per-nh flag, not per-route flag, correct?
>>
>> Can you show an ECMP example with only a subset of the nexthops dev
>> linkdowned?  Show the ip route output after going thru some link
>> down/up events on some of the nexthops devs.
>
> Sure!  This is exactly what I've been using for testing.
>
> # ip route show
> 70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
> 80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1
> 90.0.0.0/24 via 70.0.0.2 dev p7p1
> 90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 10
> 100.0.0.0/24
>         nexthop via 70.0.0.2  dev p7p1 weight 1
>         nexthop via 80.0.0.2  dev p8p1 weight 1
> 192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2
> # # take p8p1 link down
> # ip route show
> 70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
> 80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 dead linkdown
> 90.0.0.0/24 via 70.0.0.2 dev p7p1
> 90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 10 dead linkdown
> 100.0.0.0/24
>         nexthop via 70.0.0.2  dev p7p1 weight 1
>         nexthop via 80.0.0.2  dev p8p1 weight 1 dead linkdown
> 192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2
> # ip route get 100.0.0.2
> 100.0.0.2 via 70.0.0.2 dev p7p1  src 70.0.0.1
>     cache
> # ip route get 100.0.0.2
> 100.0.0.2 via 70.0.0.2 dev p7p1  src 70.0.0.1
>     cache
> # # take p8p1 link up
> # ip route show
> 70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
> 80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1
> 90.0.0.0/24 via 70.0.0.2 dev p7p1
> 90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 10
> 100.0.0.0/24
>         nexthop via 70.0.0.2  dev p7p1 weight 1
>         nexthop via 80.0.0.2  dev p8p1 weight 1
> 192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2
> # ip route show
> 100.0.0.2 via 70.0.0.2 dev p7p1  src 70.0.0.1
>     cache
> # ip route get 100.0.0.2
> 100.0.0.2 via 80.0.0.2 dev p8p1  src 80.0.0.1
>     cache
> # ip route get 100.0.0.2
> 100.0.0.2 via 70.0.0.2 dev p7p1  src 70.0.0.1
>     cache
> # ip route get 100.0.0.2
> 100.0.0.2 via 80.0.0.2 dev p8p1  src 80.0.0.1
>     cache
> # # you can see the round robin happening
> # # take all ports p8p1 and p7p1 down
> # ip route show
> 70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1 dead linkdown
> 80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 dead linkdown
> 90.0.0.0/24 via 70.0.0.2 dev p7p1 dead linkdown
> 90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 10 dead linkdown
> 100.0.0.0/24
>         nexthop via 70.0.0.2  dev p7p1 weight 1 dead linkdown
>         nexthop via 80.0.0.2  dev p8p1 weight 1 dead linkdown
> 192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2
> # ip route get 100.0.0.2
> RTNETLINK answers: Network is unreachable
> # ip route get 80.0.0.2
> RTNETLINK answers: Network is unreachable
> # ip route get 80.0.0.1
> local 80.0.0.1 dev lo  src 80.0.0.1
>     cache <local>
> # ip route get 70.0.0.1
> local 70.0.0.1 dev lo  src 70.0.0.1
>     cache <local>
> # # local addrs are still reachable

Perfect, looks good, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman June 11, 2015, 6:07 a.m. UTC | #4
On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
<gospo@cumulusnetworks.com> wrote:
> Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are
> reachable via an interface where carrier is off.  No action is taken,
> but additional flags are passed to userspace to indicate carrier status.

Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar
and I'm wondering if this could be done without introducing a new flag
and just use RTNH_F_DEAD.  The link change event would set RTNH_F_DEAD
on nh on dev link down, and clear on link up.  The sysctl knob would
be something like "nexthop_dead_on_linkdown", default off.  So
basically expanding the ways RTNH_F_DEAD can be set.  That would
simplify the patch set quite a bit and require no changes to iproute2.

-scott
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Gospodarek June 11, 2015, 11:23 a.m. UTC | #5
On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote:
> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
> <gospo@cumulusnetworks.com> wrote:
> > Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are
> > reachable via an interface where carrier is off.  No action is taken,
> > but additional flags are passed to userspace to indicate carrier status.
> 
> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar
> and I'm wondering if this could be done without introducing a new flag
> and just use RTNH_F_DEAD.  The link change event would set RTNH_F_DEAD
> on nh on dev link down, and clear on link up.  The sysctl knob would
> be something like "nexthop_dead_on_linkdown", default off.  So
> basically expanding the ways RTNH_F_DEAD can be set.  That would
> simplify the patch set quite a bit and require no changes to iproute2.
> 

You are absolutely correct that what you describe would be less churn to
userspace.  From a functionality standpoint that is close to what was
originally proposed, but Alex specifically did not like the behavioral
change to what having RTNH_F_DEAD set (at least that was what I
understood).

That was what made me make the move to add this additional flag that was
exported to userspace, so it was possible to differentiate the old dead
routes/nexthop functionality from those that were not going to be dead
due to link being down.  

At this point I think I prefer the additional data provided by the new
flag exported to userspace.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman June 11, 2015, 2:47 p.m. UTC | #6
On Thu, Jun 11, 2015 at 4:23 AM, Andy Gospodarek
<gospo@cumulusnetworks.com> wrote:
> On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote:
>> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
>> <gospo@cumulusnetworks.com> wrote:
>> > Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are
>> > reachable via an interface where carrier is off.  No action is taken,
>> > but additional flags are passed to userspace to indicate carrier status.
>>
>> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar
>> and I'm wondering if this could be done without introducing a new flag
>> and just use RTNH_F_DEAD.  The link change event would set RTNH_F_DEAD
>> on nh on dev link down, and clear on link up.  The sysctl knob would
>> be something like "nexthop_dead_on_linkdown", default off.  So
>> basically expanding the ways RTNH_F_DEAD can be set.  That would
>> simplify the patch set quite a bit and require no changes to iproute2.
>>
>
> You are absolutely correct that what you describe would be less churn to
> userspace.  From a functionality standpoint that is close to what was
> originally proposed, but Alex specifically did not like the behavioral
> change to what having RTNH_F_DEAD set (at least that was what I
> understood).
>
> That was what made me make the move to add this additional flag that was
> exported to userspace, so it was possible to differentiate the old dead
> routes/nexthop functionality from those that were not going to be dead
> due to link being down.

Why does user space need to know _why_ a nh is dead?  User space
already knows the state (admin/link) of the nh dev.

I not seeing why user space needs to differentiate why nh is dead.
The kernel only needs to know if nh is dead to exclude nh from ecmp
selection.  Same for an offload device.

Can you explain how this new flag provides user space more information
than what's already available from RTM_NEWLINK notifications?

> At this point I think I prefer the additional data provided by the new
> flag exported to userspace.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Duyck June 11, 2015, 2:50 p.m. UTC | #7
On 06/11/2015 04:23 AM, Andy Gospodarek wrote:
> On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote:
>> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
>> <gospo@cumulusnetworks.com> wrote:
>>> Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are
>>> reachable via an interface where carrier is off.  No action is taken,
>>> but additional flags are passed to userspace to indicate carrier status.
>> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar
>> and I'm wondering if this could be done without introducing a new flag
>> and just use RTNH_F_DEAD.  The link change event would set RTNH_F_DEAD
>> on nh on dev link down, and clear on link up.  The sysctl knob would
>> be something like "nexthop_dead_on_linkdown", default off.  So
>> basically expanding the ways RTNH_F_DEAD can be set.  That would
>> simplify the patch set quite a bit and require no changes to iproute2.
>>
> You are absolutely correct that what you describe would be less churn to
> userspace.  From a functionality standpoint that is close to what was
> originally proposed, but Alex specifically did not like the behavioral
> change to what having RTNH_F_DEAD set (at least that was what I
> understood).
>
> That was what made me make the move to add this additional flag that was
> exported to userspace, so it was possible to differentiate the old dead
> routes/nexthop functionality from those that were not going to be dead
> due to link being down.
>   this point I think I prefer the additional data provided by the new
> flag exported to userspace.

I preferred the 2 flag solution as the original solution still required 
2 flags, it just only exposed 1 to user-space.  As a result it was much 
more error prone since it was fairly easy to get into a confused state 
about why the link was dead.

With the 2 flag solution it becomes much easier to sort out why the 
route is not functional and it is much easier to isolate for things like 
the sysctl which only disables the use of LINKDOWN and not DEAD.

- Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman June 11, 2015, 3:25 p.m. UTC | #8
On Thu, Jun 11, 2015 at 7:50 AM, Alexander Duyck
<alexander.h.duyck@redhat.com> wrote:
>
>
> On 06/11/2015 04:23 AM, Andy Gospodarek wrote:
>>
>> On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote:
>>>
>>> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
>>> <gospo@cumulusnetworks.com> wrote:
>>>>
>>>> Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are
>>>> reachable via an interface where carrier is off.  No action is taken,
>>>> but additional flags are passed to userspace to indicate carrier status.
>>>
>>> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar
>>> and I'm wondering if this could be done without introducing a new flag
>>> and just use RTNH_F_DEAD.  The link change event would set RTNH_F_DEAD
>>> on nh on dev link down, and clear on link up.  The sysctl knob would
>>> be something like "nexthop_dead_on_linkdown", default off.  So
>>> basically expanding the ways RTNH_F_DEAD can be set.  That would
>>> simplify the patch set quite a bit and require no changes to iproute2.
>>>
>> You are absolutely correct that what you describe would be less churn to
>> userspace.  From a functionality standpoint that is close to what was
>> originally proposed, but Alex specifically did not like the behavioral
>> change to what having RTNH_F_DEAD set (at least that was what I
>> understood).
>>
>> That was what made me make the move to add this additional flag that was
>> exported to userspace, so it was possible to differentiate the old dead
>> routes/nexthop functionality from those that were not going to be dead
>> due to link being down.
>>   this point I think I prefer the additional data provided by the new
>> flag exported to userspace.
>
>
> I preferred the 2 flag solution as the original solution still required 2
> flags, it just only exposed 1 to user-space.  As a result it was much more
> error prone since it was fairly easy to get into a confused state about why
> the link was dead.
>
> With the 2 flag solution it becomes much easier to sort out why the route is
> not functional and it is much easier to isolate for things like the sysctl
> which only disables the use of LINKDOWN and not DEAD.

Ok, for the user troubleshooting, that make sense.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dinesh Dutt June 11, 2015, 4:53 p.m. UTC | #9
Yes, this is what I liked about the 2 flag solution too compared to
the original.

Dinesh

On Thu, Jun 11, 2015 at 7:50 AM, Alexander Duyck
<alexander.h.duyck@redhat.com> wrote:
>
>
> On 06/11/2015 04:23 AM, Andy Gospodarek wrote:
>>
>> On Wed, Jun 10, 2015 at 11:07:28PM -0700, Scott Feldman wrote:
>>>
>>> On Wed, Jun 10, 2015 at 7:37 PM, Andy Gospodarek
>>> <gospo@cumulusnetworks.com> wrote:
>>>>
>>>> Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are
>>>> reachable via an interface where carrier is off.  No action is taken,
>>>> but additional flags are passed to userspace to indicate carrier status.
>>>
>>> Andy, it seems now RTNH_F_LINKDOWN and RTNH_F_DEAD are very similar
>>> and I'm wondering if this could be done without introducing a new flag
>>> and just use RTNH_F_DEAD.  The link change event would set RTNH_F_DEAD
>>> on nh on dev link down, and clear on link up.  The sysctl knob would
>>> be something like "nexthop_dead_on_linkdown", default off.  So
>>> basically expanding the ways RTNH_F_DEAD can be set.  That would
>>> simplify the patch set quite a bit and require no changes to iproute2.
>>>
>> You are absolutely correct that what you describe would be less churn to
>> userspace.  From a functionality standpoint that is close to what was
>> originally proposed, but Alex specifically did not like the behavioral
>> change to what having RTNH_F_DEAD set (at least that was what I
>> understood).
>>
>> That was what made me make the move to add this additional flag that was
>> exported to userspace, so it was possible to differentiate the old dead
>> routes/nexthop functionality from those that were not going to be dead
>> due to link being down.
>>   this point I think I prefer the additional data provided by the new
>> flag exported to userspace.
>
>
> I preferred the 2 flag solution as the original solution still required 2
> flags, it just only exposed 1 to user-space.  As a result it was much more
> error prone since it was fairly easy to get into a confused state about why
> the link was dead.
>
> With the 2 flag solution it becomes much easier to sort out why the route is
> not functional and it is much easier to isolate for things like the sysctl
> which only disables the use of LINKDOWN and not DEAD.
>
> - Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 54271ed..f73d27c 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -305,9 +305,9 @@  void fib_flush_external(struct net *net);
 
 /* Exported by fib_semantics.c */
 int ip_fib_check_default(__be32 gw, struct net_device *dev);
-int fib_sync_down_dev(struct net_device *dev, int force);
+int fib_sync_down_dev(struct net_device *dev, unsigned long event);
 int fib_sync_down_addr(struct net *net, __be32 local);
-int fib_sync_up(struct net_device *dev);
+int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 void fib_select_multipath(struct fib_result *res);
 
 /* Exported by fib_trie.c */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 17fb02f..8ab874a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -338,6 +338,9 @@  struct rtnexthop {
 #define RTNH_F_PERVASIVE	2	/* Do recursive gateway lookup	*/
 #define RTNH_F_ONLINK		4	/* Gateway is forced on link	*/
 #define RTNH_F_OFFLOAD		8	/* offloaded route */
+#define RTNH_F_LINKDOWN		16	/* carrier-down on nexthop */
+
+#define RTNH_F_COMPARE_MASK	(RTNH_F_DEAD | RTNH_F_LINKDOWN) /* used as mask for route comparisons */
 
 /* Macros to handle hexthops */
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 872494e..872defb 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1063,9 +1063,9 @@  static void nl_fib_lookup_exit(struct net *net)
 	net->ipv4.fibnl = NULL;
 }
 
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, unsigned long event)
 {
-	if (fib_sync_down_dev(dev, force))
+	if (fib_sync_down_dev(dev, event))
 		fib_flush(dev_net(dev));
 	rt_cache_flush(dev_net(dev));
 	arp_ifdown(dev);
@@ -1081,7 +1081,7 @@  static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 	case NETDEV_UP:
 		fib_add_ifaddr(ifa);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-		fib_sync_up(dev);
+		fib_sync_up(dev, RTNH_F_DEAD);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
 		rt_cache_flush(dev_net(dev));
@@ -1093,7 +1093,7 @@  static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 			/* Last address was deleted from this interface.
 			 * Disable IP.
 			 */
-			fib_disable_ip(dev, 1);
+			fib_disable_ip(dev, event);
 		} else {
 			rt_cache_flush(dev_net(dev));
 		}
@@ -1107,9 +1107,10 @@  static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct in_device *in_dev;
 	struct net *net = dev_net(dev);
+	unsigned flags;
 
 	if (event == NETDEV_UNREGISTER) {
-		fib_disable_ip(dev, 2);
+		fib_disable_ip(dev, event);
 		rt_flush_dev(dev);
 		return NOTIFY_DONE;
 	}
@@ -1124,16 +1125,21 @@  static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 			fib_add_ifaddr(ifa);
 		} endfor_ifa(in_dev);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-		fib_sync_up(dev);
+		fib_sync_up(dev, RTNH_F_DEAD);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
 		rt_cache_flush(net);
 		break;
 	case NETDEV_DOWN:
-		fib_disable_ip(dev, 0);
+		fib_disable_ip(dev, event);
 		break;
-	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
+		flags = dev_get_flags(dev);
+		if (flags & (IFF_RUNNING|IFF_LOWER_UP))
+			fib_sync_up(dev, RTNH_F_LINKDOWN);
+		else
+			fib_sync_down_dev(dev, event);
+	case NETDEV_CHANGEMTU:
 		rt_cache_flush(net);
 		break;
 	}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 28ec3c1..496507f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -266,7 +266,7 @@  static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
-		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+		    ((nh->nh_flags ^ onh->nh_flags) & ~(RTNH_F_COMPARE_MASK)))
 			return -1;
 		onh++;
 	} endfor_nexthops(fi);
@@ -318,7 +318,7 @@  static struct fib_info *fib_find_info(const struct fib_info *nfi)
 		    nfi->fib_type == fi->fib_type &&
 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
 			   sizeof(u32) * RTAX_MAX) == 0 &&
-		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
+		    ((nfi->fib_flags ^ fi->fib_flags) & ~(RTNH_F_COMPARE_MASK)) == 0 &&
 		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
 			return fi;
 	}
@@ -604,6 +604,8 @@  static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 				return -ENODEV;
 			if (!(dev->flags & IFF_UP))
 				return -ENETDOWN;
+			if (!netif_carrier_ok(dev))
+				nh->nh_flags |= RTNH_F_LINKDOWN;
 			nh->nh_dev = dev;
 			dev_hold(dev);
 			nh->nh_scope = RT_SCOPE_LINK;
@@ -636,6 +638,8 @@  static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 		if (!dev)
 			goto out;
 		dev_hold(dev);
+		if (!netif_carrier_ok(dev))
+			nh->nh_flags |= RTNH_F_LINKDOWN;
 		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
 	} else {
 		struct in_device *in_dev;
@@ -654,6 +658,8 @@  static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 		nh->nh_dev = in_dev->dev;
 		dev_hold(nh->nh_dev);
 		nh->nh_scope = RT_SCOPE_HOST;
+		if (!netif_carrier_ok(nh->nh_dev))
+			nh->nh_flags |= RTNH_F_LINKDOWN;
 		err = 0;
 	}
 out:
@@ -920,11 +926,17 @@  struct fib_info *fib_create_info(struct fib_config *cfg)
 		if (!nh->nh_dev)
 			goto failure;
 	} else {
+		int linkdown = 0;
 		change_nexthops(fi) {
 			err = fib_check_nh(cfg, fi, nexthop_nh);
 			if (err != 0)
 				goto failure;
+			if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+				linkdown++;
 		} endfor_nexthops(fi)
+		if (linkdown == fi->fib_nhs) {
+			fi->fib_flags |= RTNH_F_LINKDOWN;
+		}
 	}
 
 	if (fi->fib_prefsrc) {
@@ -1103,7 +1115,7 @@  int fib_sync_down_addr(struct net *net, __be32 local)
 	return ret;
 }
 
-int fib_sync_down_dev(struct net_device *dev, int force)
+int fib_sync_down_dev(struct net_device *dev, unsigned long event)
 {
 	int ret = 0;
 	int scope = RT_SCOPE_NOWHERE;
@@ -1112,7 +1124,8 @@  int fib_sync_down_dev(struct net_device *dev, int force)
 	struct hlist_head *head = &fib_info_devhash[hash];
 	struct fib_nh *nh;
 
-	if (force)
+	if (event == NETDEV_UNREGISTER ||
+	    event == NETDEV_DOWN)
 		scope = -1;
 
 	hlist_for_each_entry(nh, head, nh_hash) {
@@ -1129,7 +1142,15 @@  int fib_sync_down_dev(struct net_device *dev, int force)
 				dead++;
 			else if (nexthop_nh->nh_dev == dev &&
 				 nexthop_nh->nh_scope != scope) {
-				nexthop_nh->nh_flags |= RTNH_F_DEAD;
+				switch (event) {
+				case NETDEV_DOWN:
+				case NETDEV_UNREGISTER:
+					nexthop_nh->nh_flags |= RTNH_F_DEAD;
+					/* fall through */
+				case NETDEV_CHANGE:
+					nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
+					break;
+				}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 				spin_lock_bh(&fib_multipath_lock);
 				fi->fib_power -= nexthop_nh->nh_power;
@@ -1139,14 +1160,22 @@  int fib_sync_down_dev(struct net_device *dev, int force)
 				dead++;
 			}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-			if (force > 1 && nexthop_nh->nh_dev == dev) {
+			if (event == NETDEV_UNREGISTER && nexthop_nh->nh_dev == dev) {
 				dead = fi->fib_nhs;
 				break;
 			}
 #endif
 		} endfor_nexthops(fi)
 		if (dead == fi->fib_nhs) {
-			fi->fib_flags |= RTNH_F_DEAD;
+			switch (event) {
+			case NETDEV_DOWN:
+			case NETDEV_UNREGISTER:
+				fi->fib_flags |= RTNH_F_DEAD;
+				/* fall through */
+			case NETDEV_CHANGE:
+				fi->fib_flags |= RTNH_F_LINKDOWN;
+				break;
+			}
 			ret++;
 		}
 	}
@@ -1210,13 +1239,11 @@  out:
 	return;
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
 /*
  * Dead device goes up. We wake up dead nexthops.
  * It takes sense only on multipath routes.
  */
-int fib_sync_up(struct net_device *dev)
+int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 {
 	struct fib_info *prev_fi;
 	unsigned int hash;
@@ -1243,7 +1270,7 @@  int fib_sync_up(struct net_device *dev)
 		prev_fi = fi;
 		alive = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+			if (!(nexthop_nh->nh_flags & nh_flags)) {
 				alive++;
 				continue;
 			}
@@ -1254,14 +1281,18 @@  int fib_sync_up(struct net_device *dev)
 			    !__in_dev_get_rtnl(dev))
 				continue;
 			alive++;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
 			spin_lock_bh(&fib_multipath_lock);
 			nexthop_nh->nh_power = 0;
-			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
+			nexthop_nh->nh_flags &= ~nh_flags;
 			spin_unlock_bh(&fib_multipath_lock);
+#else
+			nexthop_nh->nh_flags &= ~nh_flags;
+#endif
 		} endfor_nexthops(fi)
 
 		if (alive > 0) {
-			fi->fib_flags &= ~RTNH_F_DEAD;
+			fi->fib_flags &= ~nh_flags;
 			ret++;
 		}
 	}
@@ -1269,6 +1300,8 @@  int fib_sync_up(struct net_device *dev)
 	return ret;
 }
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
 /*
  * The algorithm is suboptimal, but it provides really
  * fair weighted route distribution.