diff mbox series

[v2,net-next,4/7] ipv6: Plumb support for nexthop object in a fib6_info

Message ID 20190603040817.4825-5-dsahern@kernel.org
State Superseded
Delegated to: David Miller
Headers show
Series net: add struct nexthop to fib{6}_info | expand

Commit Message

David Ahern June 3, 2019, 4:08 a.m. UTC
From: David Ahern <dsahern@gmail.com>

Add struct nexthop and nh_list list_head to fib6_info. nh_list is the
fib6_info side of the nexthop <-> fib_info relationship. Since a fib6_info
referencing a nexthop object can not have 'sibling' entries (the old way
of doing multipath routes), the nh_list is a union with fib6_siblings.

Add f6i_list list_head to 'struct nexthop' to track fib6_info entries
using a nexthop instance. Update __remove_nexthop_fib to walk f6_list
and delete fib entries using the nexthop.

Add a few nexthop helpers for use when a nexthop is added to fib6_info:
- nexthop_fib6_nh - return first fib6_nh in a nexthop object
- fib6_info_nh_dev moved to nexthop.h and updated to use nexthop_fib6_nh
  if the fib6_info references a nexthop object
- nexthop_path_fib6_result - similar to ipv4, select a path within a
  multipath nexthop object. If the nexthop is a blackhole, set
  fib6_result type to RTN_BLACKHOLE, and set the REJECT flag

Update the fib6_info references to check for nh and take a different path
as needed:
- rt6_qualify_for_ecmp - if a fib entry uses a nexthop object it can NOT
  be coalesced with other fib entries into a multipath route
- rt6_duplicate_nexthop - use nexthop_cmp if either fib6_info references
  a nexthop
- addrconf (host routes), RA's and info entries (anything configured via
  ndisc) does not use nexthop objects
- fib6_info_destroy_rcu - put reference to nexthop object
- fib6_purge_rt - drop fib6_info from f6i_list
- fib6_select_path - update to use the new nexthop_path_fib6_result when
  fib entry uses a nexthop object
- rt6_device_match - update to catch use of nexthop object as a blackhole
  and set fib6_type and flags.
- ip6_pol_route - detect the REJECT flag getting set for blackhole nexthop
  and jump to ip6_create_rt_rcu
- ip6_route_info_create - don't add space for fib6_nh if fib entry is
  going to reference a nexthop object, take a reference to nexthop object,
  disallow use of source routing
- rt6_nlmsg_size - add space for RTA_NH_ID
- add rt6_fill_node_nexthop to add nexthop data on a dump

As with ipv4, most of the changes push existing code into the else branch
of whether the fib entry uses a nexthop object.

Update the nexthop code to walk f6i_list on a nexthop deleted to remove
fib entries referencing it.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h   |  11 ++--
 include/net/ip6_route.h |  13 +++-
 include/net/nexthop.h   |  50 ++++++++++++++++
 net/ipv4/nexthop.c      |  44 ++++++++++++++
 net/ipv6/addrconf.c     |   5 ++
 net/ipv6/ip6_fib.c      |  22 +++++--
 net/ipv6/ndisc.c        |   3 +-
 net/ipv6/route.c        | 156 +++++++++++++++++++++++++++++++++++++++++-------
 8 files changed, 268 insertions(+), 36 deletions(-)

Comments

Wei Wang June 3, 2019, 6:09 p.m. UTC | #1
On Sun, Jun 2, 2019 at 9:08 PM David Ahern <dsahern@kernel.org> wrote:
>
> From: David Ahern <dsahern@gmail.com>
>
> Add struct nexthop and nh_list list_head to fib6_info. nh_list is the
> fib6_info side of the nexthop <-> fib_info relationship. Since a fib6_info
> referencing a nexthop object can not have 'sibling' entries (the old way
> of doing multipath routes), the nh_list is a union with fib6_siblings.
>
> Add f6i_list list_head to 'struct nexthop' to track fib6_info entries
> using a nexthop instance. Update __remove_nexthop_fib to walk f6_list
> and delete fib entries using the nexthop.
>
> Add a few nexthop helpers for use when a nexthop is added to fib6_info:
> - nexthop_fib6_nh - return first fib6_nh in a nexthop object
> - fib6_info_nh_dev moved to nexthop.h and updated to use nexthop_fib6_nh
>   if the fib6_info references a nexthop object
> - nexthop_path_fib6_result - similar to ipv4, select a path within a
>   multipath nexthop object. If the nexthop is a blackhole, set
>   fib6_result type to RTN_BLACKHOLE, and set the REJECT flag
>
> Update the fib6_info references to check for nh and take a different path
> as needed:
> - rt6_qualify_for_ecmp - if a fib entry uses a nexthop object it can NOT
>   be coalesced with other fib entries into a multipath route
> - rt6_duplicate_nexthop - use nexthop_cmp if either fib6_info references
>   a nexthop
> - addrconf (host routes), RA's and info entries (anything configured via
>   ndisc) does not use nexthop objects
> - fib6_info_destroy_rcu - put reference to nexthop object
> - fib6_purge_rt - drop fib6_info from f6i_list
> - fib6_select_path - update to use the new nexthop_path_fib6_result when
>   fib entry uses a nexthop object
> - rt6_device_match - update to catch use of nexthop object as a blackhole
>   and set fib6_type and flags.
> - ip6_pol_route - detect the REJECT flag getting set for blackhole nexthop
>   and jump to ip6_create_rt_rcu
> - ip6_route_info_create - don't add space for fib6_nh if fib entry is
>   going to reference a nexthop object, take a reference to nexthop object,
>   disallow use of source routing
> - rt6_nlmsg_size - add space for RTA_NH_ID
> - add rt6_fill_node_nexthop to add nexthop data on a dump
>
> As with ipv4, most of the changes push existing code into the else branch
> of whether the fib entry uses a nexthop object.
>
> Update the nexthop code to walk f6i_list on a nexthop deleted to remove
> fib entries referencing it.
>
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
>  include/net/ip6_fib.h   |  11 ++--
>  include/net/ip6_route.h |  13 +++-
>  include/net/nexthop.h   |  50 ++++++++++++++++
>  net/ipv4/nexthop.c      |  44 ++++++++++++++
>  net/ipv6/addrconf.c     |   5 ++
>  net/ipv6/ip6_fib.c      |  22 +++++--
>  net/ipv6/ndisc.c        |   3 +-
>  net/ipv6/route.c        | 156 +++++++++++++++++++++++++++++++++++++++++-------
>  8 files changed, 268 insertions(+), 36 deletions(-)
>
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index ebe5d65f97e0..1a8acd51b277 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -146,7 +146,10 @@ struct fib6_info {
>          * destination, but not the same gateway. nsiblings is just a cache
>          * to speed up lookup.
>          */
> -       struct list_head                fib6_siblings;
> +       union {
> +               struct list_head        fib6_siblings;
> +               struct list_head        nh_list;
> +       };
>         unsigned int                    fib6_nsiblings;
>
>         refcount_t                      fib6_ref;
> @@ -170,6 +173,7 @@ struct fib6_info {
>                                         unused:3;
>
>         struct rcu_head                 rcu;
> +       struct nexthop                  *nh;
>         struct fib6_nh                  fib6_nh[0];
>  };
>
> @@ -441,11 +445,6 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
>         rcu_read_unlock();
>  }
>
> -static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
> -{
> -       return f6i->fib6_nh->fib_nh_dev;
> -}
> -
>  int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
>                  struct fib6_config *cfg, gfp_t gfp_flags,
>                  struct netlink_ext_ack *extack);
> diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
> index a6ce6ea856b9..7375a165fd98 100644
> --- a/include/net/ip6_route.h
> +++ b/include/net/ip6_route.h
> @@ -27,6 +27,7 @@ struct route_info {
>  #include <linux/ip.h>
>  #include <linux/ipv6.h>
>  #include <linux/route.h>
> +#include <net/nexthop.h>
>
>  #define RT6_LOOKUP_F_IFACE             0x00000001
>  #define RT6_LOOKUP_F_REACHABLE         0x00000002
> @@ -66,10 +67,13 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
>                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
>  }
>
> +/* fib entries using a nexthop object can not be coalesced into
> + * a multipath route
> + */
>  static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
>  {
>         /* the RTF_ADDRCONF flag filters out RA's */
> -       return !(f6i->fib6_flags & RTF_ADDRCONF) &&
> +       return !(f6i->fib6_flags & RTF_ADDRCONF) && !f6i->nh &&
>                 f6i->fib6_nh->fib_nh_gw_family;
>  }
>
> @@ -275,8 +279,13 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
>
>  static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
>  {
> -       struct fib6_nh *nha = a->fib6_nh, *nhb = b->fib6_nh;
> +       struct fib6_nh *nha, *nhb;
> +
> +       if (a->nh || b->nh)
> +               return nexthop_cmp(a->nh, b->nh);
>
> +       nha = a->fib6_nh;
> +       nhb = b->fib6_nh;
>         return nha->fib_nh_dev == nhb->fib_nh_dev &&
>                ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&
>                !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
> diff --git a/include/net/nexthop.h b/include/net/nexthop.h
> index 2912a2d7a515..aff7b2410057 100644
> --- a/include/net/nexthop.h
> +++ b/include/net/nexthop.h
> @@ -10,6 +10,7 @@
>  #define __LINUX_NEXTHOP_H
>
>  #include <linux/netdevice.h>
> +#include <linux/route.h>
>  #include <linux/types.h>
>  #include <net/ip_fib.h>
>  #include <net/ip6_fib.h>
> @@ -78,6 +79,7 @@ struct nh_group {
>  struct nexthop {
>         struct rb_node          rb_node;    /* entry on netns rbtree */
>         struct list_head        fi_list;    /* v4 entries using nh */
> +       struct list_head        f6i_list;   /* v6 entries using nh */
>         struct list_head        grp_list;   /* nh group entries using this nh */
>         struct net              *net;
>
> @@ -255,4 +257,52 @@ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
>
>         return &fi->fib_nh[nhsel];
>  }
> +
> +/*
> + * IPv6 variants
> + */
> +int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
> +                      struct netlink_ext_ack *extack);
> +
> +static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
> +{
> +       struct nh_info *nhi;
> +
> +       if (nexthop_is_multipath(nh)) {
> +               nh = nexthop_mpath_select(nh, 0);
> +               if (!nh)
> +                       return NULL;
> +       }
> +
> +       nhi = rcu_dereference_rtnl(nh->nh_info);
> +       if (nhi->family == AF_INET6)
> +               return &nhi->fib6_nh;
> +
> +       return NULL;
> +}
> +
> +static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
> +{
> +       struct fib6_nh *fib6_nh;
> +
> +       fib6_nh = f6i->nh ? nexthop_fib6_nh(f6i->nh) : f6i->fib6_nh;
> +       return fib6_nh->fib_nh_dev;
> +}
> +
> +static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
> +{
> +       struct nexthop *nh = res->f6i->nh;
> +       struct nh_info *nhi;
> +
> +       nh = nexthop_select_path(nh, hash);
> +
> +       nhi = rcu_dereference_rtnl(nh->nh_info);
> +       if (nhi->reject_nh) {
> +               res->fib6_type = RTN_BLACKHOLE;
> +               res->fib6_flags |= RTF_REJECT;
> +               res->nh = nexthop_fib6_nh(nh);
> +       } else {
> +               res->nh = &nhi->fib6_nh;
> +       }
> +}
>  #endif
> diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
> index 63cbb04f697f..5e48762b6b5f 100644
> --- a/net/ipv4/nexthop.c
> +++ b/net/ipv4/nexthop.c
> @@ -106,6 +106,7 @@ static struct nexthop *nexthop_alloc(void)
>         nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
>         if (nh) {
>                 INIT_LIST_HEAD(&nh->fi_list);
> +               INIT_LIST_HEAD(&nh->f6i_list);
>                 INIT_LIST_HEAD(&nh->grp_list);
>         }
>         return nh;
> @@ -516,6 +517,41 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
>  }
>  EXPORT_SYMBOL_GPL(nexthop_select_path);
>
> +int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
> +                      struct netlink_ext_ack *extack)
> +{
> +       struct nh_info *nhi;
> +
> +       /* fib6_src is unique to a fib6_info and limits the ability to cache
> +        * routes in fib6_nh within a nexthop that is potentially shared
> +        * across multiple fib entries. If the config wants to use source
> +        * routing it can not use nexthop objects. mlxsw also does not allow
> +        * fib6_src on routes.
> +        */
> +       if (!ipv6_addr_any(&cfg->fc_src)) {
> +               NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
> +               return -EINVAL;
> +       }
> +
> +       if (nh->is_group) {
> +               struct nh_group *nhg;
> +
> +               nhg = rtnl_dereference(nh->nh_grp);
> +               if (nhg->has_v4)
> +                       goto no_v4_nh;
> +       } else {
> +               nhi = rtnl_dereference(nh->nh_info);
> +               if (nhi->family == AF_INET)
> +                       goto no_v4_nh;
> +       }
> +
> +       return 0;
> +no_v4_nh:
> +       NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
> +       return -EINVAL;
> +}
> +EXPORT_SYMBOL_GPL(fib6_check_nexthop);
> +
>  static int nexthop_check_scope(struct nexthop *nh, u8 scope,
>                                struct netlink_ext_ack *extack)
>  {
> @@ -658,6 +694,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
>
>  static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
>  {
> +       struct fib6_info *f6i, *tmp;
>         bool do_flush = false;
>         struct fib_info *fi;
>
> @@ -667,6 +704,13 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
>         }
>         if (do_flush)
>                 fib_flush(net);
> +
> +       /* ip6_del_rt removes the entry from this list hence the _safe */
> +       list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
> +               /* __ip6_del_rt does a release, so do a hold here */
> +               fib6_info_hold(f6i);

Do we need fib6_info_hold_safe() here?

>
> +               ipv6_stub->ip6_del_rt(net, f6i);
> +       }
>  }
>
>  static void __remove_nexthop(struct net *net, struct nexthop *nh,
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index 6b673d4f5ca9..7549e779335d 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -2421,6 +2421,10 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
>                 goto out;
>
>         for_each_fib6_node_rt_rcu(fn) {
> +               /* prefix routes only use builtin fib6_nh */
> +               if (rt->nh)
> +                       continue;
> +
>                 if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
>                         continue;
>                 if (no_gw && rt->fib6_nh->fib_nh_gw_family)
> @@ -6354,6 +6358,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
>         list_for_each_entry(ifa, &idev->addr_list, if_list) {
>                 spin_lock(&ifa->lock);
>                 if (ifa->rt) {
> +                       /* host routes only use builtin fib6_nh */
>                         struct fib6_nh *nh = ifa->rt->fib6_nh;
>                         int cpu;
>
> diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> index cdfb8500ccae..02feda73a98e 100644
> --- a/net/ipv6/ip6_fib.c
> +++ b/net/ipv6/ip6_fib.c
> @@ -159,6 +159,7 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
>         if (!f6i)
>                 return NULL;
>
> +       /* fib6_siblings is a union with nh_list, so this initializes both */
>         INIT_LIST_HEAD(&f6i->fib6_siblings);
>         refcount_set(&f6i->fib6_ref, 1);
>
> @@ -171,7 +172,11 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
>
>         WARN_ON(f6i->fib6_node);
>
> -       fib6_nh_release(f6i->fib6_nh);
> +       if (f6i->nh)
> +               nexthop_put(f6i->nh);
> +       else
> +               fib6_nh_release(f6i->fib6_nh);
> +
>         ip_fib_metrics_put(f6i->fib6_metrics);
>         kfree(f6i);
>  }
> @@ -927,6 +932,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
>
>         fib6_drop_pcpu_from(rt, table);
>
> +       if (rt->nh && !list_empty(&rt->nh_list))
> +               list_del_init(&rt->nh_list);
> +
>         if (refcount_read(&rt->fib6_ref) != 1) {
>                 /* This route is used as dummy address holder in some split
>                  * nodes. It is not leaked, but it still holds other resources,
> @@ -1334,6 +1342,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
>
>         err = fib6_add_rt2node(fn, rt, info, extack);
>         if (!err) {
> +               if (rt->nh)
> +                       list_add(&rt->nh_list, &rt->nh->f6i_list);
>                 __fib6_update_sernum_upto_root(rt, sernum);
>                 fib6_start_gc(info->nl_net, rt);
>         }
> @@ -2295,9 +2305,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
>  {
>         struct fib6_info *rt = v;
>         struct ipv6_route_iter *iter = seq->private;
> +       struct fib6_nh *fib6_nh = rt->fib6_nh;
>         unsigned int flags = rt->fib6_flags;
>         const struct net_device *dev;
>
> +       if (rt->nh)
> +               fib6_nh = nexthop_fib6_nh(rt->nh);
> +
>         seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
>
>  #ifdef CONFIG_IPV6_SUBTREES
> @@ -2305,14 +2319,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
>  #else
>         seq_puts(seq, "00000000000000000000000000000000 00 ");
>  #endif
> -       if (rt->fib6_nh->fib_nh_gw_family) {
> +       if (fib6_nh->fib_nh_gw_family) {
>                 flags |= RTF_GATEWAY;
> -               seq_printf(seq, "%pi6", &rt->fib6_nh->fib_nh_gw6);
> +               seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
>         } else {
>                 seq_puts(seq, "00000000000000000000000000000000");
>         }
>
> -       dev = rt->fib6_nh->fib_nh_dev;
> +       dev = fib6_nh->fib_nh_dev;
>         seq_printf(seq, " %08x %08x %08x %08x %8s\n",
>                    rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
>                    flags, dev ? dev->name : "");
> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
> index f874dde1ee85..6e3c51109c83 100644
> --- a/net/ipv6/ndisc.c
> +++ b/net/ipv6/ndisc.c
> @@ -1289,9 +1289,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
>             !in6_dev->cnf.accept_ra_rtr_pref)
>                 pref = ICMPV6_ROUTER_PREF_MEDIUM;
>  #endif
> -
> +       /* routes added from RAs do not use nexthop objects */
>         rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
> -
>         if (rt) {
>                 neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
>                                          rt->fib6_nh->fib_nh_dev, NULL,
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index fada5a13bcb2..51cb5cb027ae 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -432,15 +432,21 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
>         struct fib6_info *sibling, *next_sibling;
>         struct fib6_info *match = res->f6i;
>
> -       if (!match->fib6_nsiblings || have_oif_match)
> +       if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
>                 goto out;

So you mentioned fib6_nsiblings and nexthop is mutually exclusive. Is
it enforced from the configuration?

>
>
>         /* We might have already computed the hash for ICMPv6 errors. In such
>          * case it will always be non-zero. Otherwise now is the time to do it.
>          */
> -       if (!fl6->mp_hash)
> +       if (!fl6->mp_hash &&
> +           (!match->nh || nexthop_is_multipath(match->nh)))
>                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
>
> +       if (unlikely(match->nh)) {
> +               nexthop_path_fib6_result(res, fl6->mp_hash);
> +               return;
> +       }
> +
>         if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
>                 goto out;
>
> @@ -496,7 +502,13 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
>         struct fib6_nh *nh;
>
>         if (!oif && ipv6_addr_any(saddr)) {
> -               nh = f6i->fib6_nh;
> +               if (unlikely(f6i->nh)) {
> +                       nh = nexthop_fib6_nh(f6i->nh);
> +                       if (nexthop_is_blackhole(f6i->nh))
> +                               goto out_blackhole;
> +               } else {
> +                       nh = f6i->fib6_nh;
> +               }
>                 if (!(nh->fib_nh_flags & RTNH_F_DEAD))
>                         goto out;
>         }
> @@ -515,7 +527,14 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
>                 goto out;
>         }
>
> -       nh = f6i->fib6_nh;
> +       if (unlikely(f6i->nh)) {
> +               nh = nexthop_fib6_nh(f6i->nh);
> +               if (nexthop_is_blackhole(f6i->nh))
> +                       goto out_blackhole;
> +       } else {
> +               nh = f6i->fib6_nh;
> +       }
> +
>         if (nh->fib_nh_flags & RTNH_F_DEAD) {
>                 res->f6i = net->ipv6.fib6_null_entry;
>                 nh = res->f6i->fib6_nh;
> @@ -524,6 +543,12 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
>         res->nh = nh;
>         res->fib6_type = res->f6i->fib6_type;
>         res->fib6_flags = res->f6i->fib6_flags;
> +       return;
> +
> +out_blackhole:
> +       res->fib6_flags |= RTF_REJECT;
> +       res->fib6_type = RTN_BLACKHOLE;
> +       res->nh = nh;
>  }
>
>  #ifdef CONFIG_IPV6_ROUTER_PREF
> @@ -1117,6 +1142,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
>                 rt = net->ipv6.ip6_null_entry;
>                 dst_hold(&rt->dst);
>                 goto out;
> +       } else if (res.fib6_flags & RTF_REJECT) {
> +               goto do_create;
>
>         }
>
>         fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
> @@ -1128,6 +1155,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
>                 if (ip6_hold_safe(net, &rt))
>                         dst_use_noref(&rt->dst, jiffies);
>         } else {
> +do_create:
>                 rt = ip6_create_rt_rcu(&res);
>         }
>
> @@ -1982,6 +2010,14 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
>                 rcu_read_unlock();
>                 dst_hold(&rt->dst);
>                 return rt;
> +       } else if (res.fib6_flags & RTF_REJECT) {
> +               rt = ip6_create_rt_rcu(&res);
> +               rcu_read_unlock();
> +               if (!rt) {
> +                       rt = net->ipv6.ip6_null_entry;
> +                       dst_hold(&rt->dst);
> +               }
> +               return rt;
>         }
>
Why do we need to call ip6_create_rt_rcu() to create a dst cache? Can
we directly return ip6_null_entry here? This route is anyway meant to
drop the packet. Same goes for the change in ip6_pol_route_lookup().

And for my education, how does this new nexthop logic interact with
the pcpu_rt cache and the exception table? Those 2 are currently
stored in struct fib6_nh. They are shared with fib6_siblings under the
same fib6_info. Are they also shared with nexthop for the same
fib6_info?
I don't see much changes around that area. So I assume they work as is?

>
>         fib6_select_path(net, &res, fl6, oif, false, skb, strict);
> @@ -3217,7 +3253,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
>  {
>         struct net *net = cfg->fc_nlinfo.nl_net;
>         struct fib6_info *rt = NULL;
> +       struct nexthop *nh = NULL;
>         struct fib6_table *table;
> +       struct fib6_nh *fib6_nh;
>         int err = -EINVAL;
>         int addr_type;
>
> @@ -3270,7 +3308,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
>                 goto out;
>
>         err = -ENOMEM;
> -       rt = fib6_info_alloc(gfp_flags, true);
> +       rt = fib6_info_alloc(gfp_flags, !nh);
>         if (!rt)
>                 goto out;
>
> @@ -3310,19 +3348,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
>         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
>         rt->fib6_src.plen = cfg->fc_src_len;
>  #endif
> -       err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
> -       if (err)
> -               goto out;
> +       if (nh) {
> +               if (!nexthop_get(nh)) {
> +                       NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
> +                       goto out;
> +               }
> +               if (rt->fib6_src.plen) {
> +                       NL_SET_ERR_MSG(extack, "Nexthops can not be used wtih source routing");
> +                       goto out;
> +               }
> +               rt->nh = nh;
> +               fib6_nh = nexthop_fib6_nh(rt->nh);
> +       } else {
> +               err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
> +               if (err)
> +                       goto out;
>
> -       /* We cannot add true routes via loopback here,
> -        * they would result in kernel looping; promote them to reject routes
> -        */
> -       addr_type = ipv6_addr_type(&cfg->fc_dst);
> -       if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type))
> -               rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
> +               fib6_nh = rt->fib6_nh;
> +
> +               /* We cannot add true routes via loopback here, they would
> +                * result in kernel looping; promote them to reject routes
> +                */
> +               addr_type = ipv6_addr_type(&cfg->fc_dst);
> +               if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
> +                                  addr_type))
> +                       rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
> +       }
>
>         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
> -               struct net_device *dev = fib6_info_nh_dev(rt);
> +               struct net_device *dev = fib6_nh->fib_nh_dev;
>
>                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
>                         NL_SET_ERR_MSG(extack, "Invalid source address");
> @@ -3678,6 +3732,9 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
>                 goto out;
>
>         for_each_fib6_node_rt_rcu(fn) {
> +               /* these routes do not use nexthops */
> +               if (rt->nh)
> +                       continue;
>                 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
>                         continue;
>                 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
> @@ -3741,8 +3798,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
>
>         rcu_read_lock();
>         for_each_fib6_node_rt_rcu(&table->tb6_root) {
> -               struct fib6_nh *nh = rt->fib6_nh;
> +               struct fib6_nh *nh;
> +
> +               /* RA routes do not use nexthops */
> +               if (rt->nh)
> +                       continue;
>
> +               nh = rt->fib6_nh;
>                 if (dev == nh->fib_nh_dev &&
>                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
>                     ipv6_addr_equal(&nh->fib_nh_gw6, addr))
> @@ -3993,7 +4055,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
>         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
>         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
>
> -       if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
> +       if (!rt->nh &&
> +           ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
>             rt != net->ipv6.fib6_null_entry &&
>             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
>                 spin_lock_bh(&rt6_exception_lock);
> @@ -4021,8 +4084,13 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
>  static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
>  {
>         struct in6_addr *gateway = (struct in6_addr *)arg;
> -       struct fib6_nh *nh = rt->fib6_nh;
> +       struct fib6_nh *nh;
>
> +       /* RA routes do not use nexthops */
> +       if (rt->nh)
> +               return 0;
> +
> +       nh = rt->fib6_nh;
>         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
>             nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
>                 return -1;
> @@ -4069,6 +4137,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
>         return NULL;
>  }
>
> +/* only called for fib entries with builtin fib6_nh */
>  static bool rt6_is_dead(const struct fib6_info *rt)
>  {
>         if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
> @@ -4147,7 +4216,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
>         const struct arg_netdev_event *arg = p_arg;
>         struct net *net = dev_net(arg->dev);
>
> -       if (rt != net->ipv6.fib6_null_entry &&
> +       if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
>             rt->fib6_nh->fib_nh_dev == arg->dev) {
>                 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
>                 fib6_update_sernum_upto_root(net, rt);
> @@ -4172,6 +4241,7 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
>         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
>  }
>
> +/* only called for fib entries with inline fib6_nh */
>  static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
>                                    const struct net_device *dev)
>  {
> @@ -4232,7 +4302,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
>         const struct net_device *dev = arg->dev;
>         struct net *net = dev_net(dev);
>
> -       if (rt == net->ipv6.fib6_null_entry)
> +       if (rt == net->ipv6.fib6_null_entry || rt->nh)
>                 return 0;
>
>         switch (arg->event) {
> @@ -4786,6 +4856,9 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
>  {
>         int nexthop_len = 0;
>
> +       if (rt->nh)
> +               nexthop_len += nla_total_size(4); /* RTA_NH_ID */
> +
>         if (rt->fib6_nsiblings) {
>                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
>                             + NLA_ALIGN(sizeof(struct rtnexthop))
> @@ -4812,6 +4885,35 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
>                + nexthop_len;
>  }
>
> +static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
> +                                unsigned char *flags)
> +{
> +       if (nexthop_is_multipath(nh)) {
> +               struct nlattr *mp;
> +
> +               mp = nla_nest_start(skb, RTA_MULTIPATH);
> +               if (!mp)
> +                       goto nla_put_failure;
> +
> +               if (nexthop_mpath_fill_node(skb, nh))
> +                       goto nla_put_failure;
> +
> +               nla_nest_end(skb, mp);
> +       } else {
> +               struct fib6_nh *fib6_nh;
> +
> +               fib6_nh = nexthop_fib6_nh(nh);
> +               if (fib_nexthop_info(skb, &fib6_nh->nh_common,
> +                                    flags, false) < 0)
> +                       goto nla_put_failure;
> +       }
> +
> +       return 0;
> +
> +nla_put_failure:
> +       return -EMSGSIZE;
> +}
> +
>  static int rt6_fill_node(struct net *net, struct sk_buff *skb,
>                          struct fib6_info *rt, struct dst_entry *dst,
>                          struct in6_addr *dest, struct in6_addr *src,
> @@ -4821,6 +4923,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
>         struct rt6_info *rt6 = (struct rt6_info *)dst;
>         struct rt6key *rt6_dst, *rt6_src;
>         u32 *pmetrics, table, rt6_flags;
> +       unsigned char nh_flags = 0;
>         struct nlmsghdr *nlh;
>         struct rtmsg *rtm;
>         long expires = 0;
> @@ -4940,9 +5043,18 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
>                 }
>
>                 nla_nest_end(skb, mp);
> -       } else {
> -               unsigned char nh_flags = 0;
> +       } else if (rt->nh) {
> +               if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
> +                       goto nla_put_failure;
> +
> +               if (nexthop_is_blackhole(rt->nh))
> +                       rtm->rtm_type = RTN_BLACKHOLE;
>
> +               if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
> +                       goto nla_put_failure;
> +
> +               rtm->rtm_flags |= nh_flags;
> +       } else {
>                 if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
>                                      &nh_flags, false) < 0)
>                         goto nla_put_failure;
> --
> 2.11.0
>
David Ahern June 3, 2019, 6:37 p.m. UTC | #2
On 6/3/19 12:09 PM, Wei Wang wrote:
>> @@ -667,6 +704,13 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
>>         }
>>         if (do_flush)
>>                 fib_flush(net);
>> +
>> +       /* ip6_del_rt removes the entry from this list hence the _safe */
>> +       list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
>> +               /* __ip6_del_rt does a release, so do a hold here */
>> +               fib6_info_hold(f6i);
> Do we need fib6_info_hold_safe() here?
> 

I do not think so.

If it is on the f6i_list, then fib6_purge_rt has not been called.
fib6_purge_rt and this function are both called with rtnl held, so there
is no race with the removal from the list.
David Ahern June 3, 2019, 8:42 p.m. UTC | #3
On 6/3/19 12:09 PM, Wei Wang wrote:
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index fada5a13bcb2..51cb5cb027ae 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -432,15 +432,21 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
>>         struct fib6_info *sibling, *next_sibling;
>>         struct fib6_info *match = res->f6i;
>>
>> -       if (!match->fib6_nsiblings || have_oif_match)
>> +       if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
>>                 goto out;
> 
> So you mentioned fib6_nsiblings and nexthop is mutually exclusive. Is
> it enforced from the configuration?

It is enforced by the patch that wires up RTA_NH_ID for IPv6.

>> @@ -1982,6 +2010,14 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
>>                 rcu_read_unlock();
>>                 dst_hold(&rt->dst);
>>                 return rt;
>> +       } else if (res.fib6_flags & RTF_REJECT) {
>> +               rt = ip6_create_rt_rcu(&res);
>> +               rcu_read_unlock();
>> +               if (!rt) {
>> +                       rt = net->ipv6.ip6_null_entry;
>> +                       dst_hold(&rt->dst);
>> +               }
>> +               return rt;
>>         }
>>
> Why do we need to call ip6_create_rt_rcu() to create a dst cache? Can
> we directly return ip6_null_entry here? This route is anyway meant to
> drop the packet. Same goes for the change in ip6_pol_route_lookup().

This is to mimic what happens if you added a route like this:
   ip -6 ro add blackhole 2001:db8:99::/64

except now the blackhole target is contained in the nexthop spec.


> 
> And for my education, how does this new nexthop logic interact with
> the pcpu_rt cache and the exception table? Those 2 are currently
> stored in struct fib6_nh. They are shared with fib6_siblings under the
> same fib6_info. Are they also shared with nexthop for the same
> fib6_info?

With nexthop objects IPv6 can work very similar to IPv4. Multiple IPv4
fib entries (fib_alias) can reference the same fib_info where the
fib_info contains an array of fib_nh and the cached routes and
exceptions are stored in the fib_nh.

The one IPv6 attribute that breaks the model is source routing which is
a function of the prefix. For that reason you can not use nexthop
objects with fib entries using source routing. See the note in this
patch in fib6_check_nexthop

> I don't see much changes around that area. So I assume they work as is?

Prior patch sets moved the pcpu and exception caches from fib6_info to
fib6_nh.
Wei Wang June 3, 2019, 9:58 p.m. UTC | #4
On Mon, Jun 3, 2019 at 1:42 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 6/3/19 12:09 PM, Wei Wang wrote:
> >> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> >> index fada5a13bcb2..51cb5cb027ae 100644
> >> --- a/net/ipv6/route.c
> >> +++ b/net/ipv6/route.c
> >> @@ -432,15 +432,21 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
> >>         struct fib6_info *sibling, *next_sibling;
> >>         struct fib6_info *match = res->f6i;
> >>
> >> -       if (!match->fib6_nsiblings || have_oif_match)
> >> +       if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
> >>                 goto out;
> >
> > So you mentioned fib6_nsiblings and nexthop is mutually exclusive. Is
> > it enforced from the configuration?
>
> It is enforced by the patch that wires up RTA_NH_ID for IPv6.
>
> >> @@ -1982,6 +2010,14 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
> >>                 rcu_read_unlock();
> >>                 dst_hold(&rt->dst);
> >>                 return rt;
> >> +       } else if (res.fib6_flags & RTF_REJECT) {
> >> +               rt = ip6_create_rt_rcu(&res);
> >> +               rcu_read_unlock();
> >> +               if (!rt) {
> >> +                       rt = net->ipv6.ip6_null_entry;
> >> +                       dst_hold(&rt->dst);
> >> +               }
> >> +               return rt;
> >>         }
> >>
> > Why do we need to call ip6_create_rt_rcu() to create a dst cache? Can
> > we directly return ip6_null_entry here? This route is anyway meant to
> > drop the packet. Same goes for the change in ip6_pol_route_lookup().
>
> This is to mimic what happens if you added a route like this:
>    ip -6 ro add blackhole 2001:db8:99::/64
>
> except now the blackhole target is contained in the nexthop spec.
>
>
Hmm... I am still a bit concerned with the ip6_create_rt_rcu() call.
If we have a blackholed nexthop, the lookup code here always tries to
create an rt cache entry for every lookup.
Maybe we could reuse the pcpu cache logic for this? So we only create
new dst cache on the CPU if there is no cache created before.

> >
> > And for my education, how does this new nexthop logic interact with
> > the pcpu_rt cache and the exception table? Those 2 are currently
> > stored in struct fib6_nh. They are shared with fib6_siblings under the
> > same fib6_info. Are they also shared with nexthop for the same
> > fib6_info?
>
> With nexthop objects IPv6 can work very similar to IPv4. Multiple IPv4
> fib entries (fib_alias) can reference the same fib_info where the
> fib_info contains an array of fib_nh and the cached routes and
> exceptions are stored in the fib_nh.
>
> The one IPv6 attribute that breaks the model is source routing which is
> a function of the prefix. For that reason you can not use nexthop
> objects with fib entries using source routing. See the note in this
> patch in fib6_check_nexthop
>
> > I don't see much changes around that area. So I assume they work as is?
>
> Prior patch sets moved the pcpu and exception caches from fib6_info to
> fib6_nh.

Understood. Basically, for every nexthop object, it will have its own
fib6_nh which contains pcpu_rt and exception table.
And we only use the built-in fib6_nh in struct fib6_info when nexthop
is not used.
And if src addr routing is used, then nexthop object will not be used.
David Ahern June 3, 2019, 10:35 p.m. UTC | #5
On 6/3/19 3:58 PM, Wei Wang wrote:
> Hmm... I am still a bit concerned with the ip6_create_rt_rcu() call.
> If we have a blackholed nexthop, the lookup code here always tries to
> create an rt cache entry for every lookup.
> Maybe we could reuse the pcpu cache logic for this? So we only create
> new dst cache on the CPU if there is no cache created before.

I'll take a look.

Long term, I would like to see IPv6 separate FIB lookups from dst's -
like IPv4 does. In that case reject routes would not use a dst_entry;
rather the fib lookups return an error code.
Wei Wang June 3, 2019, 11:05 p.m. UTC | #6
On Mon, Jun 3, 2019 at 3:35 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 6/3/19 3:58 PM, Wei Wang wrote:
> > Hmm... I am still a bit concerned with the ip6_create_rt_rcu() call.
> > If we have a blackholed nexthop, the lookup code here always tries to
> > create an rt cache entry for every lookup.
> > Maybe we could reuse the pcpu cache logic for this? So we only create
> > new dst cache on the CPU if there is no cache created before.
>
> I'll take a look.
>
> Long term, I would like to see IPv6 separate FIB lookups from dst's -
> like IPv4 does. In that case reject routes would not use a dst_entry;
> rather the fib lookups return an error code.
Yes. Agree. That will be even better.
David Ahern June 3, 2019, 11:18 p.m. UTC | #7
On 6/3/19 5:05 PM, Wei Wang wrote:
> On Mon, Jun 3, 2019 at 3:35 PM David Ahern <dsahern@gmail.com> wrote:
>>
>> On 6/3/19 3:58 PM, Wei Wang wrote:
>>> Hmm... I am still a bit concerned with the ip6_create_rt_rcu() call.
>>> If we have a blackholed nexthop, the lookup code here always tries to
>>> create an rt cache entry for every lookup.
>>> Maybe we could reuse the pcpu cache logic for this? So we only create
>>> new dst cache on the CPU if there is no cache created before.
>>
>> I'll take a look.
>>

BTW, I am only updating ip6_pol_route to use pcpu routes for blackhole
nexthops.

ip6_pol_route_lookup will continue as is. That function does not use
pcpu routes and will stay as is.
Wei Wang June 3, 2019, 11:30 p.m. UTC | #8
On Mon, Jun 3, 2019 at 4:18 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 6/3/19 5:05 PM, Wei Wang wrote:
> > On Mon, Jun 3, 2019 at 3:35 PM David Ahern <dsahern@gmail.com> wrote:
> >>
> >> On 6/3/19 3:58 PM, Wei Wang wrote:
> >>> Hmm... I am still a bit concerned with the ip6_create_rt_rcu() call.
> >>> If we have a blackholed nexthop, the lookup code here always tries to
> >>> create an rt cache entry for every lookup.
> >>> Maybe we could reuse the pcpu cache logic for this? So we only create
> >>> new dst cache on the CPU if there is no cache created before.
> >>
> >> I'll take a look.
> >>
>
> BTW, I am only updating ip6_pol_route to use pcpu routes for blackhole
> nexthops.
>
> ip6_pol_route_lookup will continue as is. That function does not use
> pcpu routes and will stay as is.
>
That sounds reasonable to me. ip6_pol_route() seems to be what we care
about the most as all incoming/outgoing data path uses that.
Thanks David.
Martin KaFai Lau June 4, 2019, 12:58 a.m. UTC | #9
On Mon, Jun 03, 2019 at 05:18:11PM -0600, David Ahern wrote:
> On 6/3/19 5:05 PM, Wei Wang wrote:
> > On Mon, Jun 3, 2019 at 3:35 PM David Ahern <dsahern@gmail.com> wrote:
> >>
> >> On 6/3/19 3:58 PM, Wei Wang wrote:
> >>> Hmm... I am still a bit concerned with the ip6_create_rt_rcu() call.
> >>> If we have a blackholed nexthop, the lookup code here always tries to
> >>> create an rt cache entry for every lookup.
> >>> Maybe we could reuse the pcpu cache logic for this? So we only create
> >>> new dst cache on the CPU if there is no cache created before.
> >>
> >> I'll take a look.
> >>
> 
> BTW, I am only updating ip6_pol_route to use pcpu routes for blackhole
> nexthops.
> 
> ip6_pol_route_lookup will continue as is. That function does not use
> pcpu routes and will stay as is.
> 
I have concern on calling ip6_create_rt_rcu() in general which seems
to trace back to this commit
dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")

This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
In particular, how to react to NETDEV_UNREGISTER/DOWN like
the rt6_uncached_list_flush_dev() does and calls dev_put()?

The existing callers seem to do dst_release() immediately without
caching it, but still concerning.
David Ahern June 4, 2019, 1:36 a.m. UTC | #10
On 6/3/19 6:58 PM, Martin Lau wrote:
> I have concern on calling ip6_create_rt_rcu() in general which seems
> to trace back to this commit
> dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")
> 
> This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
> In particular, how to react to NETDEV_UNREGISTER/DOWN like
> the rt6_uncached_list_flush_dev() does and calls dev_put()?
> 
> The existing callers seem to do dst_release() immediately without
> caching it, but still concerning.

those are the callers that don't care about the dst_entry, but are
forced to deal with it. Removing the tie between fib lookups an
dst_entry is again the right solution.
Martin KaFai Lau June 4, 2019, 5:29 a.m. UTC | #11
On Mon, Jun 03, 2019 at 07:36:06PM -0600, David Ahern wrote:
> On 6/3/19 6:58 PM, Martin Lau wrote:
> > I have concern on calling ip6_create_rt_rcu() in general which seems
> > to trace back to this commit
> > dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")
> > 
> > This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
> > In particular, how to react to NETDEV_UNREGISTER/DOWN like
> > the rt6_uncached_list_flush_dev() does and calls dev_put()?
> > 
> > The existing callers seem to do dst_release() immediately without
> > caching it, but still concerning.
> 
> those are the callers that don't care about the dst_entry, but are
> forced to deal with it. Removing the tie between fib lookups an
> dst_entry is again the right solution.
Great to know that there will be a solution.  It would be great
if there is patch (or repo) to show how that may look like on
those rt6_lookup() callers.

Before that,
although it seems fine now (__ip6_route_redirect() is
harder to confirm since rt is passed around but it
seems to be ok),
instead of risking for "unregister_netdevice: waiting for eth0 to become free"
in case some future patch is caching this rt,
why pcpu_rt cannot be used in all occasions? and also
avoid re-creating the same rt.
David Ahern June 4, 2019, 8:17 p.m. UTC | #12
On 6/3/19 11:29 PM, Martin Lau wrote:
> On Mon, Jun 03, 2019 at 07:36:06PM -0600, David Ahern wrote:
>> On 6/3/19 6:58 PM, Martin Lau wrote:
>>> I have concern on calling ip6_create_rt_rcu() in general which seems
>>> to trace back to this commit
>>> dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")
>>>
>>> This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
>>> In particular, how to react to NETDEV_UNREGISTER/DOWN like
>>> the rt6_uncached_list_flush_dev() does and calls dev_put()?
>>>
>>> The existing callers seem to do dst_release() immediately without
>>> caching it, but still concerning.
>>
>> those are the callers that don't care about the dst_entry, but are
>> forced to deal with it. Removing the tie between fib lookups an
>> dst_entry is again the right solution.
> Great to know that there will be a solution.  It would be great
> if there is patch (or repo) to show how that may look like on
> those rt6_lookup() callers.

Not 'will be', 'there is' a solution now. Someone just needs to do the
conversions and devise the tests for the impacted users.
Martin KaFai Lau June 4, 2019, 9:06 p.m. UTC | #13
On Tue, Jun 04, 2019 at 02:17:28PM -0600, David Ahern wrote:
> On 6/3/19 11:29 PM, Martin Lau wrote:
> > On Mon, Jun 03, 2019 at 07:36:06PM -0600, David Ahern wrote:
> >> On 6/3/19 6:58 PM, Martin Lau wrote:
> >>> I have concern on calling ip6_create_rt_rcu() in general which seems
> >>> to trace back to this commit
> >>> dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")
> >>>
> >>> This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
> >>> In particular, how to react to NETDEV_UNREGISTER/DOWN like
> >>> the rt6_uncached_list_flush_dev() does and calls dev_put()?
> >>>
> >>> The existing callers seem to do dst_release() immediately without
> >>> caching it, but still concerning.
> >>
> >> those are the callers that don't care about the dst_entry, but are
> >> forced to deal with it. Removing the tie between fib lookups an
> >> dst_entry is again the right solution.
> > Great to know that there will be a solution.  It would be great
> > if there is patch (or repo) to show how that may look like on
> > those rt6_lookup() callers.
> 
> Not 'will be', 'there is' a solution now. Someone just needs to do the
> conversions and devise the tests for the impacted users.
I don't think everyone will convert to the new nexthop solution
immediately.

How about ensuring the existing usage stays solid first?
>> Before that,
>> although it seems fine now (__ip6_route_redirect() is
>> harder to confirm since rt is passed around but it
>> seems to be ok),
>> instead of risking for "unregister_netdevice: waiting for eth0 to become free"
>> in case some future patch is caching this rt,
>> why pcpu_rt cannot be used in all occasions? and also
>> avoid re-creating the same rt.
David Ahern June 4, 2019, 9:13 p.m. UTC | #14
On 6/4/19 3:06 PM, Martin Lau wrote:
> On Tue, Jun 04, 2019 at 02:17:28PM -0600, David Ahern wrote:
>> On 6/3/19 11:29 PM, Martin Lau wrote:
>>> On Mon, Jun 03, 2019 at 07:36:06PM -0600, David Ahern wrote:
>>>> On 6/3/19 6:58 PM, Martin Lau wrote:
>>>>> I have concern on calling ip6_create_rt_rcu() in general which seems
>>>>> to trace back to this commit
>>>>> dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")
>>>>>
>>>>> This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
>>>>> In particular, how to react to NETDEV_UNREGISTER/DOWN like
>>>>> the rt6_uncached_list_flush_dev() does and calls dev_put()?
>>>>>
>>>>> The existing callers seem to do dst_release() immediately without
>>>>> caching it, but still concerning.
>>>>
>>>> those are the callers that don't care about the dst_entry, but are
>>>> forced to deal with it. Removing the tie between fib lookups an
>>>> dst_entry is again the right solution.
>>> Great to know that there will be a solution.  It would be great
>>> if there is patch (or repo) to show how that may look like on
>>> those rt6_lookup() callers.
>>
>> Not 'will be', 'there is' a solution now. Someone just needs to do the
>> conversions and devise the tests for the impacted users.
> I don't think everyone will convert to the new nexthop solution
> immediately.
> 
> How about ensuring the existing usage stays solid first?

Use of nexthop objects has nothing to do with separating fib lookups
from dst_entries, but with the addition of fib6_result it Just Works.

Wei converted ipv6 to use exception caches instead of adding them to the
FIB.

I converted ipv6 to use separate data structures for fib entries, added
direct fib6 lookup functions and added fib6_result. See the
net/core/filter.c.

The stage is set for converting users.

For example, ip6_nh_lookup_table does not care about the dst entry, only
the fib entry. This converts it:

static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
                               const struct in6_addr *gw_addr, u32 tbid,
                               int flags, struct fib6_result *res)
{
        struct flowi6 fl6 = {
                .flowi6_oif = cfg->fc_ifindex,
                .daddr = *gw_addr,
                .saddr = cfg->fc_prefsrc,
        };
        struct fib6_table *table;
        struct rt6_info *rt;

        table = fib6_get_table(net, tbid);
        if (!table)
                return -EINVAL;

        if (!ipv6_addr_any(&cfg->fc_prefsrc))
                flags |= RT6_LOOKUP_F_HAS_SADDR;

        flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;

        fib6_table_lookup(net, table, cfg->fc_ifindex, fl6, res, flags);
        if (res.f6i == net->ipv6.fib6_null_entry)
                return -ENETUNREACH;

        fib6_select_path(net, &res, fl6, oif, false, NULL, flags);

        return 0;
}
Wei Wang June 4, 2019, 9:36 p.m. UTC | #15
On Tue, Jun 4, 2019 at 2:13 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 6/4/19 3:06 PM, Martin Lau wrote:
> > On Tue, Jun 04, 2019 at 02:17:28PM -0600, David Ahern wrote:
> >> On 6/3/19 11:29 PM, Martin Lau wrote:
> >>> On Mon, Jun 03, 2019 at 07:36:06PM -0600, David Ahern wrote:
> >>>> On 6/3/19 6:58 PM, Martin Lau wrote:
> >>>>> I have concern on calling ip6_create_rt_rcu() in general which seems
> >>>>> to trace back to this commit
> >>>>> dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")
> >>>>>
> >>>>> This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
> >>>>> In particular, how to react to NETDEV_UNREGISTER/DOWN like
> >>>>> the rt6_uncached_list_flush_dev() does and calls dev_put()?
> >>>>>
> >>>>> The existing callers seem to do dst_release() immediately without
> >>>>> caching it, but still concerning.
> >>>>
> >>>> those are the callers that don't care about the dst_entry, but are
> >>>> forced to deal with it. Removing the tie between fib lookups an
> >>>> dst_entry is again the right solution.
> >>> Great to know that there will be a solution.  It would be great
> >>> if there is patch (or repo) to show how that may look like on
> >>> those rt6_lookup() callers.
> >>
> >> Not 'will be', 'there is' a solution now. Someone just needs to do the
> >> conversions and devise the tests for the impacted users.
> > I don't think everyone will convert to the new nexthop solution
> > immediately.
> >
> > How about ensuring the existing usage stays solid first?
>
> Use of nexthop objects has nothing to do with separating fib lookups
> from dst_entries, but with the addition of fib6_result it Just Works.
>
> Wei converted ipv6 to use exception caches instead of adding them to the
> FIB.
>
> I converted ipv6 to use separate data structures for fib entries, added
> direct fib6 lookup functions and added fib6_result. See the
> net/core/filter.c.
>
> The stage is set for converting users.
>
> For example, ip6_nh_lookup_table does not care about the dst entry, only
> the fib entry. This converts it:
>
> static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
>                                const struct in6_addr *gw_addr, u32 tbid,
>                                int flags, struct fib6_result *res)
> {
>         struct flowi6 fl6 = {
>                 .flowi6_oif = cfg->fc_ifindex,
>                 .daddr = *gw_addr,
>                 .saddr = cfg->fc_prefsrc,
>         };
>         struct fib6_table *table;
>         struct rt6_info *rt;
>
>         table = fib6_get_table(net, tbid);
>         if (!table)
>                 return -EINVAL;
>
>         if (!ipv6_addr_any(&cfg->fc_prefsrc))
>                 flags |= RT6_LOOKUP_F_HAS_SADDR;
>
>         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
>
>         fib6_table_lookup(net, table, cfg->fc_ifindex, fl6, res, flags);
>         if (res.f6i == net->ipv6.fib6_null_entry)
>                 return -ENETUNREACH;
>
>         fib6_select_path(net, &res, fl6, oif, false, NULL, flags);
>
>         return 0;
> }

I do agree with Martin that ip6_create_rt_rcu() seems to be dangerous
as the dst cache created by this func does not get tracked anywhere
and it is up to the user to not cache it for too long.
But I think David, what you are suggesting is:
instead of trying to convert ip6_create_rt_rcu() to use the pcpu_dst
logic, completely get rid of the calling to ip6_create_rt_rcu(), and
directly return f6i in those cases to the caller. Is that correct?
Martin KaFai Lau June 4, 2019, 9:53 p.m. UTC | #16
On Tue, Jun 04, 2019 at 03:13:38PM -0600, David Ahern wrote:
> On 6/4/19 3:06 PM, Martin Lau wrote:
> > On Tue, Jun 04, 2019 at 02:17:28PM -0600, David Ahern wrote:
> >> On 6/3/19 11:29 PM, Martin Lau wrote:
> >>> On Mon, Jun 03, 2019 at 07:36:06PM -0600, David Ahern wrote:
> >>>> On 6/3/19 6:58 PM, Martin Lau wrote:
> >>>>> I have concern on calling ip6_create_rt_rcu() in general which seems
> >>>>> to trace back to this commit
> >>>>> dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")
> >>>>>
> >>>>> This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
> >>>>> In particular, how to react to NETDEV_UNREGISTER/DOWN like
> >>>>> the rt6_uncached_list_flush_dev() does and calls dev_put()?
> >>>>>
> >>>>> The existing callers seem to do dst_release() immediately without
> >>>>> caching it, but still concerning.
> >>>>
> >>>> those are the callers that don't care about the dst_entry, but are
> >>>> forced to deal with it. Removing the tie between fib lookups an
> >>>> dst_entry is again the right solution.
> >>> Great to know that there will be a solution.  It would be great
> >>> if there is patch (or repo) to show how that may look like on
> >>> those rt6_lookup() callers.
> >>
> >> Not 'will be', 'there is' a solution now. Someone just needs to do the
> >> conversions and devise the tests for the impacted users.
> > I don't think everyone will convert to the new nexthop solution
> > immediately.
> > 
> > How about ensuring the existing usage stays solid first?
> 
> Use of nexthop objects has nothing to do with separating fib lookups
> from dst_entries, but with the addition of fib6_result it Just Works.
> 
> Wei converted ipv6 to use exception caches instead of adding them to the
> FIB.
> 
> I converted ipv6 to use separate data structures for fib entries, added
> direct fib6 lookup functions and added fib6_result. See the
> net/core/filter.c.
> 
> The stage is set for converting users.
> 
> For example, ip6_nh_lookup_table does not care about the dst entry, only
> the fib entry. This converts it:
> 
> static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
>                                const struct in6_addr *gw_addr, u32 tbid,
>                                int flags, struct fib6_result *res)
> {
>         struct flowi6 fl6 = {
>                 .flowi6_oif = cfg->fc_ifindex,
>                 .daddr = *gw_addr,
>                 .saddr = cfg->fc_prefsrc,
>         };
>         struct fib6_table *table;
>         struct rt6_info *rt;
> 
>         table = fib6_get_table(net, tbid);
>         if (!table)
>                 return -EINVAL;
> 
>         if (!ipv6_addr_any(&cfg->fc_prefsrc))
>                 flags |= RT6_LOOKUP_F_HAS_SADDR;
> 
>         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
> 
>         fib6_table_lookup(net, table, cfg->fc_ifindex, fl6, res, flags);
>         if (res.f6i == net->ipv6.fib6_null_entry)
>                 return -ENETUNREACH;
> 
>         fib6_select_path(net, &res, fl6, oif, false, NULL, flags);
> 
>         return 0;
> }
Thanks for the example.  This patch/example is what I am looking for.
These changes are still "will be" done.

AFAICT, there are more than 10 rt6_lookup() usages which may end up calling
ip6_create_rt_rcu() and return an untracked rt.
It is very difficult to audit the future changes on them to ensure they will
not cache them till the above suggested changes to be done or
completely remove ip6_create_rt_rcu() for now.
David Ahern June 4, 2019, 11:30 p.m. UTC | #17
On 6/4/19 3:36 PM, Wei Wang wrote:
> instead of trying to convert ip6_create_rt_rcu() to use the pcpu_dst
> logic, completely get rid of the calling to ip6_create_rt_rcu(), and
> directly return f6i in those cases to the caller. Is that correct?

yes
Martin KaFai Lau June 5, 2019, 12:39 a.m. UTC | #18
On Tue, Jun 04, 2019 at 02:36:27PM -0700, Wei Wang wrote:
> On Tue, Jun 4, 2019 at 2:13 PM David Ahern <dsahern@gmail.com> wrote:
> >
> > On 6/4/19 3:06 PM, Martin Lau wrote:
> > > On Tue, Jun 04, 2019 at 02:17:28PM -0600, David Ahern wrote:
> > >> On 6/3/19 11:29 PM, Martin Lau wrote:
> > >>> On Mon, Jun 03, 2019 at 07:36:06PM -0600, David Ahern wrote:
> > >>>> On 6/3/19 6:58 PM, Martin Lau wrote:
> > >>>>> I have concern on calling ip6_create_rt_rcu() in general which seems
> > >>>>> to trace back to this commit
> > >>>>> dec9b0e295f6 ("net/ipv6: Add rt6_info create function for ip6_pol_route_lookup")
> > >>>>>
> > >>>>> This rt is not tracked in pcpu_rt, rt6_uncached_list or exception bucket.
> > >>>>> In particular, how to react to NETDEV_UNREGISTER/DOWN like
> > >>>>> the rt6_uncached_list_flush_dev() does and calls dev_put()?
> > >>>>>
> > >>>>> The existing callers seem to do dst_release() immediately without
> > >>>>> caching it, but still concerning.
> > >>>>
> > >>>> those are the callers that don't care about the dst_entry, but are
> > >>>> forced to deal with it. Removing the tie between fib lookups an
> > >>>> dst_entry is again the right solution.
> > >>> Great to know that there will be a solution.  It would be great
> > >>> if there is patch (or repo) to show how that may look like on
> > >>> those rt6_lookup() callers.
> > >>
> > >> Not 'will be', 'there is' a solution now. Someone just needs to do the
> > >> conversions and devise the tests for the impacted users.
> > > I don't think everyone will convert to the new nexthop solution
> > > immediately.
> > >
> > > How about ensuring the existing usage stays solid first?
> >
> > Use of nexthop objects has nothing to do with separating fib lookups
> > from dst_entries, but with the addition of fib6_result it Just Works.
> >
> > Wei converted ipv6 to use exception caches instead of adding them to the
> > FIB.
> >
> > I converted ipv6 to use separate data structures for fib entries, added
> > direct fib6 lookup functions and added fib6_result. See the
> > net/core/filter.c.
> >
> > The stage is set for converting users.
> >
> > For example, ip6_nh_lookup_table does not care about the dst entry, only
> > the fib entry. This converts it:
> >
> > static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
> >                                const struct in6_addr *gw_addr, u32 tbid,
> >                                int flags, struct fib6_result *res)
> > {
> >         struct flowi6 fl6 = {
> >                 .flowi6_oif = cfg->fc_ifindex,
> >                 .daddr = *gw_addr,
> >                 .saddr = cfg->fc_prefsrc,
> >         };
> >         struct fib6_table *table;
> >         struct rt6_info *rt;
> >
> >         table = fib6_get_table(net, tbid);
> >         if (!table)
> >                 return -EINVAL;
> >
> >         if (!ipv6_addr_any(&cfg->fc_prefsrc))
> >                 flags |= RT6_LOOKUP_F_HAS_SADDR;
> >
> >         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
> >
> >         fib6_table_lookup(net, table, cfg->fc_ifindex, fl6, res, flags);
> >         if (res.f6i == net->ipv6.fib6_null_entry)
> >                 return -ENETUNREACH;
> >
> >         fib6_select_path(net, &res, fl6, oif, false, NULL, flags);
> >
> >         return 0;
> > }
> 
> I do agree with Martin that ip6_create_rt_rcu() seems to be dangerous
> as the dst cache created by this func does not get tracked anywhere
> and it is up to the user to not cache it for too long.
IMO, ip6_create_rt_rcu(), which returns untracked rt, was a mistake
and removing it has been overdue.  Tracking down the unregister dev
bug is not easy.

> But I think David, what you are suggesting is:
> instead of trying to convert ip6_create_rt_rcu() to use the pcpu_dst
> logic, completely get rid of the calling to ip6_create_rt_rcu(), and
> directly return f6i in those cases to the caller. Is that correct?
I am fine with either of these two ways to remove ip6_create_rt_rcu().
Further depending on ip6_create_rt_rcu() in this patch even in
ip6_pol_route_lookup() is arguably neither of these two ways...
David Ahern June 5, 2019, 2:05 a.m. UTC | #19
On 6/4/19 6:39 PM, Martin Lau wrote:
> IMO, ip6_create_rt_rcu(), which returns untracked rt, was a mistake
> and removing it has been overdue.  Tracking down the unregister dev
> bug is not easy.

I must be missing something because I don't have the foggiest idea why
you are barking up this tree.

If code calls a function that returns a dst_entry with a refcount taken,
that code is responsible for releasing it. Using a pcpu cached dst
versus a new one in no way tells you who took the dst and bumped the
refcnt on the netdev. Either way the dst refcount is bumped. Tracking
netdev refcnt is the only way to methodically figure it out.

What am I overlooking here?
Martin KaFai Lau June 5, 2019, 6:01 a.m. UTC | #20
On Tue, Jun 04, 2019 at 08:05:58PM -0600, David Ahern wrote:
> On 6/4/19 6:39 PM, Martin Lau wrote:
> > IMO, ip6_create_rt_rcu(), which returns untracked rt, was a mistake
> > and removing it has been overdue.  Tracking down the unregister dev
> > bug is not easy.
> 
> I must be missing something because I don't have the foggiest idea why
> you are barking up this tree.
> 
> If code calls a function that returns a dst_entry with a refcount taken,
> that code is responsible for releasing it.
The code is responsible but there is no control on when.
That code can cache it for a long time.  May be re-look at the dev_put() in
this recent bug fix to begin with?
f5b51fe804ec ("ipv6: route: purge exception on removal")

and also the current rt6_uncached_list + rt6_uncached_list_flush_dev()

> Using a pcpu cached dst
> versus a new one in no way tells you who took the dst and bumped the
> refcnt on the netdev. Either way the dst refcount is bumped. Tracking
> netdev refcnt is the only way to methodically figure it out.
> 
> What am I overlooking here?
diff mbox series

Patch

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index ebe5d65f97e0..1a8acd51b277 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -146,7 +146,10 @@  struct fib6_info {
 	 * destination, but not the same gateway. nsiblings is just a cache
 	 * to speed up lookup.
 	 */
-	struct list_head		fib6_siblings;
+	union {
+		struct list_head	fib6_siblings;
+		struct list_head	nh_list;
+	};
 	unsigned int			fib6_nsiblings;
 
 	refcount_t			fib6_ref;
@@ -170,6 +173,7 @@  struct fib6_info {
 					unused:3;
 
 	struct rcu_head			rcu;
+	struct nexthop			*nh;
 	struct fib6_nh			fib6_nh[0];
 };
 
@@ -441,11 +445,6 @@  void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
 	rcu_read_unlock();
 }
 
-static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
-{
-	return f6i->fib6_nh->fib_nh_dev;
-}
-
 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 		 struct fib6_config *cfg, gfp_t gfp_flags,
 		 struct netlink_ext_ack *extack);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index a6ce6ea856b9..7375a165fd98 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -27,6 +27,7 @@  struct route_info {
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/route.h>
+#include <net/nexthop.h>
 
 #define RT6_LOOKUP_F_IFACE		0x00000001
 #define RT6_LOOKUP_F_REACHABLE		0x00000002
@@ -66,10 +67,13 @@  static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+/* fib entries using a nexthop object can not be coalesced into
+ * a multipath route
+ */
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
 	/* the RTF_ADDRCONF flag filters out RA's */
-	return !(f6i->fib6_flags & RTF_ADDRCONF) &&
+	return !(f6i->fib6_flags & RTF_ADDRCONF) && !f6i->nh &&
 		f6i->fib6_nh->fib_nh_gw_family;
 }
 
@@ -275,8 +279,13 @@  static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 
 static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
-	struct fib6_nh *nha = a->fib6_nh, *nhb = b->fib6_nh;
+	struct fib6_nh *nha, *nhb;
+
+	if (a->nh || b->nh)
+		return nexthop_cmp(a->nh, b->nh);
 
+	nha = a->fib6_nh;
+	nhb = b->fib6_nh;
 	return nha->fib_nh_dev == nhb->fib_nh_dev &&
 	       ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&
 	       !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 2912a2d7a515..aff7b2410057 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -10,6 +10,7 @@ 
 #define __LINUX_NEXTHOP_H
 
 #include <linux/netdevice.h>
+#include <linux/route.h>
 #include <linux/types.h>
 #include <net/ip_fib.h>
 #include <net/ip6_fib.h>
@@ -78,6 +79,7 @@  struct nh_group {
 struct nexthop {
 	struct rb_node		rb_node;    /* entry on netns rbtree */
 	struct list_head	fi_list;    /* v4 entries using nh */
+	struct list_head	f6i_list;   /* v6 entries using nh */
 	struct list_head	grp_list;   /* nh group entries using this nh */
 	struct net		*net;
 
@@ -255,4 +257,52 @@  static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
 
 	return &fi->fib_nh[nhsel];
 }
+
+/*
+ * IPv6 variants
+ */
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+		       struct netlink_ext_ack *extack);
+
+static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
+{
+	struct nh_info *nhi;
+
+	if (nexthop_is_multipath(nh)) {
+		nh = nexthop_mpath_select(nh, 0);
+		if (!nh)
+			return NULL;
+	}
+
+	nhi = rcu_dereference_rtnl(nh->nh_info);
+	if (nhi->family == AF_INET6)
+		return &nhi->fib6_nh;
+
+	return NULL;
+}
+
+static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
+{
+	struct fib6_nh *fib6_nh;
+
+	fib6_nh = f6i->nh ? nexthop_fib6_nh(f6i->nh) : f6i->fib6_nh;
+	return fib6_nh->fib_nh_dev;
+}
+
+static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
+{
+	struct nexthop *nh = res->f6i->nh;
+	struct nh_info *nhi;
+
+	nh = nexthop_select_path(nh, hash);
+
+	nhi = rcu_dereference_rtnl(nh->nh_info);
+	if (nhi->reject_nh) {
+		res->fib6_type = RTN_BLACKHOLE;
+		res->fib6_flags |= RTF_REJECT;
+		res->nh = nexthop_fib6_nh(nh);
+	} else {
+		res->nh = &nhi->fib6_nh;
+	}
+}
 #endif
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 63cbb04f697f..5e48762b6b5f 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -106,6 +106,7 @@  static struct nexthop *nexthop_alloc(void)
 	nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 	if (nh) {
 		INIT_LIST_HEAD(&nh->fi_list);
+		INIT_LIST_HEAD(&nh->f6i_list);
 		INIT_LIST_HEAD(&nh->grp_list);
 	}
 	return nh;
@@ -516,6 +517,41 @@  struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
 }
 EXPORT_SYMBOL_GPL(nexthop_select_path);
 
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+		       struct netlink_ext_ack *extack)
+{
+	struct nh_info *nhi;
+
+	/* fib6_src is unique to a fib6_info and limits the ability to cache
+	 * routes in fib6_nh within a nexthop that is potentially shared
+	 * across multiple fib entries. If the config wants to use source
+	 * routing it can not use nexthop objects. mlxsw also does not allow
+	 * fib6_src on routes.
+	 */
+	if (!ipv6_addr_any(&cfg->fc_src)) {
+		NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
+		return -EINVAL;
+	}
+
+	if (nh->is_group) {
+		struct nh_group *nhg;
+
+		nhg = rtnl_dereference(nh->nh_grp);
+		if (nhg->has_v4)
+			goto no_v4_nh;
+	} else {
+		nhi = rtnl_dereference(nh->nh_info);
+		if (nhi->family == AF_INET)
+			goto no_v4_nh;
+	}
+
+	return 0;
+no_v4_nh:
+	NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(fib6_check_nexthop);
+
 static int nexthop_check_scope(struct nexthop *nh, u8 scope,
 			       struct netlink_ext_ack *extack)
 {
@@ -658,6 +694,7 @@  static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 
 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 {
+	struct fib6_info *f6i, *tmp;
 	bool do_flush = false;
 	struct fib_info *fi;
 
@@ -667,6 +704,13 @@  static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 	}
 	if (do_flush)
 		fib_flush(net);
+
+	/* ip6_del_rt removes the entry from this list hence the _safe */
+	list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
+		/* __ip6_del_rt does a release, so do a hold here */
+		fib6_info_hold(f6i);
+		ipv6_stub->ip6_del_rt(net, f6i);
+	}
 }
 
 static void __remove_nexthop(struct net *net, struct nexthop *nh,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6b673d4f5ca9..7549e779335d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2421,6 +2421,10 @@  static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
+		/* prefix routes only use builtin fib6_nh */
+		if (rt->nh)
+			continue;
+
 		if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
 			continue;
 		if (no_gw && rt->fib6_nh->fib_nh_gw_family)
@@ -6354,6 +6358,7 @@  void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
 	list_for_each_entry(ifa, &idev->addr_list, if_list) {
 		spin_lock(&ifa->lock);
 		if (ifa->rt) {
+			/* host routes only use builtin fib6_nh */
 			struct fib6_nh *nh = ifa->rt->fib6_nh;
 			int cpu;
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index cdfb8500ccae..02feda73a98e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -159,6 +159,7 @@  struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
 	if (!f6i)
 		return NULL;
 
+	/* fib6_siblings is a union with nh_list, so this initializes both */
 	INIT_LIST_HEAD(&f6i->fib6_siblings);
 	refcount_set(&f6i->fib6_ref, 1);
 
@@ -171,7 +172,11 @@  void fib6_info_destroy_rcu(struct rcu_head *head)
 
 	WARN_ON(f6i->fib6_node);
 
-	fib6_nh_release(f6i->fib6_nh);
+	if (f6i->nh)
+		nexthop_put(f6i->nh);
+	else
+		fib6_nh_release(f6i->fib6_nh);
+
 	ip_fib_metrics_put(f6i->fib6_metrics);
 	kfree(f6i);
 }
@@ -927,6 +932,9 @@  static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 
 	fib6_drop_pcpu_from(rt, table);
 
+	if (rt->nh && !list_empty(&rt->nh_list))
+		list_del_init(&rt->nh_list);
+
 	if (refcount_read(&rt->fib6_ref) != 1) {
 		/* This route is used as dummy address holder in some split
 		 * nodes. It is not leaked, but it still holds other resources,
@@ -1334,6 +1342,8 @@  int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 
 	err = fib6_add_rt2node(fn, rt, info, extack);
 	if (!err) {
+		if (rt->nh)
+			list_add(&rt->nh_list, &rt->nh->f6i_list);
 		__fib6_update_sernum_upto_root(rt, sernum);
 		fib6_start_gc(info->nl_net, rt);
 	}
@@ -2295,9 +2305,13 @@  static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 {
 	struct fib6_info *rt = v;
 	struct ipv6_route_iter *iter = seq->private;
+	struct fib6_nh *fib6_nh = rt->fib6_nh;
 	unsigned int flags = rt->fib6_flags;
 	const struct net_device *dev;
 
+	if (rt->nh)
+		fib6_nh = nexthop_fib6_nh(rt->nh);
+
 	seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -2305,14 +2319,14 @@  static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 #else
 	seq_puts(seq, "00000000000000000000000000000000 00 ");
 #endif
-	if (rt->fib6_nh->fib_nh_gw_family) {
+	if (fib6_nh->fib_nh_gw_family) {
 		flags |= RTF_GATEWAY;
-		seq_printf(seq, "%pi6", &rt->fib6_nh->fib_nh_gw6);
+		seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
 	} else {
 		seq_puts(seq, "00000000000000000000000000000000");
 	}
 
-	dev = rt->fib6_nh->fib_nh_dev;
+	dev = fib6_nh->fib_nh_dev;
 	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
 		   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
 		   flags, dev ? dev->name : "");
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f874dde1ee85..6e3c51109c83 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1289,9 +1289,8 @@  static void ndisc_router_discovery(struct sk_buff *skb)
 	    !in6_dev->cnf.accept_ra_rtr_pref)
 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
 #endif
-
+	/* routes added from RAs do not use nexthop objects */
 	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
-
 	if (rt) {
 		neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
 					 rt->fib6_nh->fib_nh_dev, NULL,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fada5a13bcb2..51cb5cb027ae 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -432,15 +432,21 @@  void fib6_select_path(const struct net *net, struct fib6_result *res,
 	struct fib6_info *sibling, *next_sibling;
 	struct fib6_info *match = res->f6i;
 
-	if (!match->fib6_nsiblings || have_oif_match)
+	if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
 		goto out;
 
 	/* We might have already computed the hash for ICMPv6 errors. In such
 	 * case it will always be non-zero. Otherwise now is the time to do it.
 	 */
-	if (!fl6->mp_hash)
+	if (!fl6->mp_hash &&
+	    (!match->nh || nexthop_is_multipath(match->nh)))
 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 
+	if (unlikely(match->nh)) {
+		nexthop_path_fib6_result(res, fl6->mp_hash);
+		return;
+	}
+
 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
 		goto out;
 
@@ -496,7 +502,13 @@  static void rt6_device_match(struct net *net, struct fib6_result *res,
 	struct fib6_nh *nh;
 
 	if (!oif && ipv6_addr_any(saddr)) {
-		nh = f6i->fib6_nh;
+		if (unlikely(f6i->nh)) {
+			nh = nexthop_fib6_nh(f6i->nh);
+			if (nexthop_is_blackhole(f6i->nh))
+				goto out_blackhole;
+		} else {
+			nh = f6i->fib6_nh;
+		}
 		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
 			goto out;
 	}
@@ -515,7 +527,14 @@  static void rt6_device_match(struct net *net, struct fib6_result *res,
 		goto out;
 	}
 
-	nh = f6i->fib6_nh;
+	if (unlikely(f6i->nh)) {
+		nh = nexthop_fib6_nh(f6i->nh);
+		if (nexthop_is_blackhole(f6i->nh))
+			goto out_blackhole;
+	} else {
+		nh = f6i->fib6_nh;
+	}
+
 	if (nh->fib_nh_flags & RTNH_F_DEAD) {
 		res->f6i = net->ipv6.fib6_null_entry;
 		nh = res->f6i->fib6_nh;
@@ -524,6 +543,12 @@  static void rt6_device_match(struct net *net, struct fib6_result *res,
 	res->nh = nh;
 	res->fib6_type = res->f6i->fib6_type;
 	res->fib6_flags = res->f6i->fib6_flags;
+	return;
+
+out_blackhole:
+	res->fib6_flags |= RTF_REJECT;
+	res->fib6_type = RTN_BLACKHOLE;
+	res->nh = nh;
 }
 
 #ifdef CONFIG_IPV6_ROUTER_PREF
@@ -1117,6 +1142,8 @@  static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		rt = net->ipv6.ip6_null_entry;
 		dst_hold(&rt->dst);
 		goto out;
+	} else if (res.fib6_flags & RTF_REJECT) {
+		goto do_create;
 	}
 
 	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
@@ -1128,6 +1155,7 @@  static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		if (ip6_hold_safe(net, &rt))
 			dst_use_noref(&rt->dst, jiffies);
 	} else {
+do_create:
 		rt = ip6_create_rt_rcu(&res);
 	}
 
@@ -1982,6 +2010,14 @@  struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
 		return rt;
+	} else if (res.fib6_flags & RTF_REJECT) {
+		rt = ip6_create_rt_rcu(&res);
+		rcu_read_unlock();
+		if (!rt) {
+			rt = net->ipv6.ip6_null_entry;
+			dst_hold(&rt->dst);
+		}
+		return rt;
 	}
 
 	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
@@ -3217,7 +3253,9 @@  static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 {
 	struct net *net = cfg->fc_nlinfo.nl_net;
 	struct fib6_info *rt = NULL;
+	struct nexthop *nh = NULL;
 	struct fib6_table *table;
+	struct fib6_nh *fib6_nh;
 	int err = -EINVAL;
 	int addr_type;
 
@@ -3270,7 +3308,7 @@  static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 
 	err = -ENOMEM;
-	rt = fib6_info_alloc(gfp_flags, true);
+	rt = fib6_info_alloc(gfp_flags, !nh);
 	if (!rt)
 		goto out;
 
@@ -3310,19 +3348,35 @@  static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
 	rt->fib6_src.plen = cfg->fc_src_len;
 #endif
-	err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
-	if (err)
-		goto out;
+	if (nh) {
+		if (!nexthop_get(nh)) {
+			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+			goto out;
+		}
+		if (rt->fib6_src.plen) {
+			NL_SET_ERR_MSG(extack, "Nexthops can not be used wtih source routing");
+			goto out;
+		}
+		rt->nh = nh;
+		fib6_nh = nexthop_fib6_nh(rt->nh);
+	} else {
+		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
+		if (err)
+			goto out;
 
-	/* We cannot add true routes via loopback here,
-	 * they would result in kernel looping; promote them to reject routes
-	 */
-	addr_type = ipv6_addr_type(&cfg->fc_dst);
-	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type))
-		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+		fib6_nh = rt->fib6_nh;
+
+		/* We cannot add true routes via loopback here, they would
+		 * result in kernel looping; promote them to reject routes
+		 */
+		addr_type = ipv6_addr_type(&cfg->fc_dst);
+		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
+				   addr_type))
+			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+	}
 
 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
-		struct net_device *dev = fib6_info_nh_dev(rt);
+		struct net_device *dev = fib6_nh->fib_nh_dev;
 
 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
 			NL_SET_ERR_MSG(extack, "Invalid source address");
@@ -3678,6 +3732,9 @@  static struct fib6_info *rt6_get_route_info(struct net *net,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
+		/* these routes do not use nexthops */
+		if (rt->nh)
+			continue;
 		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
 			continue;
 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
@@ -3741,8 +3798,13 @@  struct fib6_info *rt6_get_dflt_router(struct net *net,
 
 	rcu_read_lock();
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
-		struct fib6_nh *nh = rt->fib6_nh;
+		struct fib6_nh *nh;
+
+		/* RA routes do not use nexthops */
+		if (rt->nh)
+			continue;
 
+		nh = rt->fib6_nh;
 		if (dev == nh->fib_nh_dev &&
 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
@@ -3993,7 +4055,8 @@  static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
 
-	if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
+	if (!rt->nh &&
+	    ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
 	    rt != net->ipv6.fib6_null_entry &&
 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
 		spin_lock_bh(&rt6_exception_lock);
@@ -4021,8 +4084,13 @@  void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
 {
 	struct in6_addr *gateway = (struct in6_addr *)arg;
-	struct fib6_nh *nh = rt->fib6_nh;
+	struct fib6_nh *nh;
 
+	/* RA routes do not use nexthops */
+	if (rt->nh)
+		return 0;
+
+	nh = rt->fib6_nh;
 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
 	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
 		return -1;
@@ -4069,6 +4137,7 @@  static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 	return NULL;
 }
 
+/* only called for fib entries with builtin fib6_nh */
 static bool rt6_is_dead(const struct fib6_info *rt)
 {
 	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
@@ -4147,7 +4216,7 @@  static int fib6_ifup(struct fib6_info *rt, void *p_arg)
 	const struct arg_netdev_event *arg = p_arg;
 	struct net *net = dev_net(arg->dev);
 
-	if (rt != net->ipv6.fib6_null_entry &&
+	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
 	    rt->fib6_nh->fib_nh_dev == arg->dev) {
 		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(net, rt);
@@ -4172,6 +4241,7 @@  void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
 }
 
+/* only called for fib entries with inline fib6_nh */
 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
 				   const struct net_device *dev)
 {
@@ -4232,7 +4302,7 @@  static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 	const struct net_device *dev = arg->dev;
 	struct net *net = dev_net(dev);
 
-	if (rt == net->ipv6.fib6_null_entry)
+	if (rt == net->ipv6.fib6_null_entry || rt->nh)
 		return 0;
 
 	switch (arg->event) {
@@ -4786,6 +4856,9 @@  static size_t rt6_nlmsg_size(struct fib6_info *rt)
 {
 	int nexthop_len = 0;
 
+	if (rt->nh)
+		nexthop_len += nla_total_size(4); /* RTA_NH_ID */
+
 	if (rt->fib6_nsiblings) {
 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
 			    + NLA_ALIGN(sizeof(struct rtnexthop))
@@ -4812,6 +4885,35 @@  static size_t rt6_nlmsg_size(struct fib6_info *rt)
 	       + nexthop_len;
 }
 
+static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
+				 unsigned char *flags)
+{
+	if (nexthop_is_multipath(nh)) {
+		struct nlattr *mp;
+
+		mp = nla_nest_start(skb, RTA_MULTIPATH);
+		if (!mp)
+			goto nla_put_failure;
+
+		if (nexthop_mpath_fill_node(skb, nh))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, mp);
+	} else {
+		struct fib6_nh *fib6_nh;
+
+		fib6_nh = nexthop_fib6_nh(nh);
+		if (fib_nexthop_info(skb, &fib6_nh->nh_common,
+				     flags, false) < 0)
+			goto nla_put_failure;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			 struct fib6_info *rt, struct dst_entry *dst,
 			 struct in6_addr *dest, struct in6_addr *src,
@@ -4821,6 +4923,7 @@  static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	struct rt6_info *rt6 = (struct rt6_info *)dst;
 	struct rt6key *rt6_dst, *rt6_src;
 	u32 *pmetrics, table, rt6_flags;
+	unsigned char nh_flags = 0;
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 	long expires = 0;
@@ -4940,9 +5043,18 @@  static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 		}
 
 		nla_nest_end(skb, mp);
-	} else {
-		unsigned char nh_flags = 0;
+	} else if (rt->nh) {
+		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
+			goto nla_put_failure;
+
+		if (nexthop_is_blackhole(rt->nh))
+			rtm->rtm_type = RTN_BLACKHOLE;
 
+		if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+			goto nla_put_failure;
+
+		rtm->rtm_flags |= nh_flags;
+	} else {
 		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
 				     &nh_flags, false) < 0)
 			goto nla_put_failure;