diff mbox series

[ovs-dev,v1,1/6] netlink: Support accessing namespaces.

Message ID 673527edfc89645d393f6669460c775e45a98391.1729784574.git.felix.huettner@stackit.cloud
State Superseded
Delegated to: Eelco Chaudron
Headers show
Series OVS Patches for OVN Fabric Integration | expand

Checks

Context Check Description
ovsrobot/apply-robot warning apply and check: warning
ovsrobot/github-robot-_Build_and_Test fail github build: failed

Commit Message

Felix Huettner Oct. 24, 2024, 3:45 p.m. UTC
In the future we need to access routing table entries in network
namespaces.
As netlink has no easy way to get a socket in a network namespace we
copy the logic of frr where we:
1. enter the target netns
2. open a socket
3. jump back to our original netns

the socket will keep working between these jumps.

Signed-off-by: Felix Huettner <felix.huettner@stackit.cloud>
---
 lib/dpif-netlink-rtnl.c |   6 +--
 lib/dpif-netlink.c      |  22 ++++----
 lib/netdev-linux.c      |   6 +--
 lib/netlink-conntrack.c |  20 ++++----
 lib/netlink-socket.c    | 110 ++++++++++++++++++++++++++++++++--------
 lib/netlink-socket.h    |   7 +--
 lib/route-table.c       |   6 +--
 lib/tc.c                |   8 +--
 8 files changed, 127 insertions(+), 58 deletions(-)

Comments

Eelco Chaudron Nov. 14, 2024, 12:26 p.m. UTC | #1
On 24 Oct 2024, at 17:45, Felix Huettner via dev wrote:

> In the future we need to access routing table entries in network
> namespaces.
> As netlink has no easy way to get a socket in a network namespace we
> copy the logic of frr where we:
> 1. enter the target netns
> 2. open a socket
> 3. jump back to our original netns
>
> the socket will keep working between these jumps.

Hi Felix,

Thanks for the patches. I did review the series, but did not do any actual testing. It would be good to add some test cases, and add NEWS section.

On this patch; Given the number of changes and the fact that netns is ultimately used only in three instances, I’d recommend adding a nl_ns_transact() and nl_ns_dump_start() instead.

Some additional comments below.

Eelco

> Signed-off-by: Felix Huettner <felix.huettner@stackit.cloud>
> ---
>  lib/dpif-netlink-rtnl.c |   6 +--
>  lib/dpif-netlink.c      |  22 ++++----
>  lib/netdev-linux.c      |   6 +--
>  lib/netlink-conntrack.c |  20 ++++----
>  lib/netlink-socket.c    | 110 ++++++++++++++++++++++++++++++++--------
>  lib/netlink-socket.h    |   7 +--
>  lib/route-table.c       |   6 +--
>  lib/tc.c                |   8 +--
>  8 files changed, 127 insertions(+), 58 deletions(-)
>
> diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c
> index f7035333e..2df905d92 100644
> --- a/lib/dpif-netlink-rtnl.c
> +++ b/lib/dpif-netlink-rtnl.c
> @@ -158,7 +158,7 @@ rtnl_transact(uint32_t type, uint32_t flags, const char *name,
>      ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
>      nl_msg_put_string(&request, IFLA_IFNAME, name);
>
> -    err = nl_transact(NETLINK_ROUTE, &request, reply);
> +    err = nl_transact(NULL, NETLINK_ROUTE, &request, reply);
>      ofpbuf_uninit(&request);
>
>      return err;
> @@ -342,7 +342,7 @@ rtnl_set_mtu(const char *name, uint32_t mtu, struct ofpbuf *request)
>      nl_msg_put_string(request, IFLA_IFNAME, name);
>      nl_msg_put_u32(request, IFLA_MTU, mtu);
>
> -    return nl_transact(NETLINK_ROUTE, request, NULL);
> +    return nl_transact(NULL, NETLINK_ROUTE, request, NULL);
>  }
>
>  static int
> @@ -425,7 +425,7 @@ dpif_netlink_rtnl_create(const struct netdev_tunnel_config *tnl_cfg,
>      nl_msg_end_nested(&request, infodata_off);
>      nl_msg_end_nested(&request, linkinfo_off);
>
> -    err = nl_transact(NETLINK_ROUTE, &request, NULL);
> +    err = nl_transact(NULL, NETLINK_ROUTE, &request, NULL);
>      if (!err && (type == OVS_VPORT_TYPE_GRE ||
>                   type == OVS_VPORT_TYPE_IP6GRE)) {
>          /* Work around a bug in kernel GRE driver, which ignores IFLA_MTU in
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 84e2bd8ea..3e4e03697 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -1348,7 +1348,7 @@ dpif_netlink_port_dump_start__(const struct dpif_netlink *dpif,
>
>      buf = ofpbuf_new(1024);
>      dpif_netlink_vport_to_ofpbuf(&request, buf);
> -    nl_dump_start(dump, NETLINK_GENERIC, buf);
> +    nl_dump_start(NULL, dump, NETLINK_GENERIC, buf);
>      ofpbuf_delete(buf);
>  }
>
> @@ -1682,7 +1682,7 @@ dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse,
>
>          buf = ofpbuf_new(1024);
>          dpif_netlink_flow_to_ofpbuf(&request, buf);
> -        nl_dump_start(&dump->nl_dump, NETLINK_GENERIC, buf);
> +        nl_dump_start(NULL, &dump->nl_dump, NETLINK_GENERIC, buf);
>          ofpbuf_delete(buf);
>      }
>      atomic_init(&dump->status, 0);
> @@ -2141,7 +2141,7 @@ dpif_netlink_operate__(struct dpif_netlink *dpif,
>      for (i = 0; i < n_ops; i++) {
>          txnsp[i] = &auxes[i].txn;
>      }
> -    nl_transact_multiple(NETLINK_GENERIC, txnsp, n_ops);
> +    nl_transact_multiple(NULL, NETLINK_GENERIC, txnsp, n_ops);
>
>      for (i = 0; i < n_ops; i++) {
>          struct op_auxdata *aux = &auxes[i];
> @@ -3391,7 +3391,7 @@ dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED,
>      }
>      nl_msg_end_nested(request, opt_offset);
>
> -    int err = nl_transact(NETLINK_GENERIC, request, NULL);
> +    int err = nl_transact(NULL, NETLINK_GENERIC, request, NULL);
>      ofpbuf_delete(request);
>      return err;
>  }
> @@ -3476,7 +3476,7 @@ dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED,
>      }
>
>      struct ofpbuf *reply;
> -    int err = nl_transact(NETLINK_GENERIC, request, &reply);
> +    int err = nl_transact(NULL, NETLINK_GENERIC, request, &reply);
>      if (err) {
>          goto out;
>      }
> @@ -3520,7 +3520,7 @@ dpif_netlink_ct_del_limits(struct dpif *dpif OVS_UNUSED,
>          nl_msg_end_nested(request, opt_offset);
>      }
>
> -    int err = nl_transact(NETLINK_GENERIC, request, NULL);
> +    int err = nl_transact(NULL, NETLINK_GENERIC, request, NULL);
>
>      ofpbuf_delete(request);
>      return err;
> @@ -4066,7 +4066,7 @@ dpif_netlink_meter_transact(struct ofpbuf *request, struct ofpbuf **replyp,
>                              const struct nl_policy *reply_policy,
>                              struct nlattr **a, size_t size_a)
>  {
> -    int error = nl_transact(NETLINK_GENERIC, request, replyp);
> +    int error = nl_transact(NULL, NETLINK_GENERIC, request, replyp);
>      ofpbuf_uninit(request);
>
>      if (error) {
> @@ -4818,7 +4818,7 @@ dpif_netlink_vport_transact(const struct dpif_netlink_vport *request,
>
>      request_buf = ofpbuf_new(1024);
>      dpif_netlink_vport_to_ofpbuf(request, request_buf);
> -    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
> +    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
>      ofpbuf_delete(request_buf);
>
>      if (reply) {
> @@ -4969,7 +4969,7 @@ dpif_netlink_dp_dump_start(struct nl_dump *dump)
>
>      buf = ofpbuf_new(1024);
>      dpif_netlink_dp_to_ofpbuf(&request, buf);
> -    nl_dump_start(dump, NETLINK_GENERIC, buf);
> +    nl_dump_start(NULL, dump, NETLINK_GENERIC, buf);
>      ofpbuf_delete(buf);
>  }
>
> @@ -4990,7 +4990,7 @@ dpif_netlink_dp_transact(const struct dpif_netlink_dp *request,
>
>      request_buf = ofpbuf_new(1024);
>      dpif_netlink_dp_to_ofpbuf(request, request_buf);
> -    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
> +    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
>      ofpbuf_delete(request_buf);
>
>      if (reply) {
> @@ -5223,7 +5223,7 @@ dpif_netlink_flow_transact(struct dpif_netlink_flow *request,
>
>      request_buf = ofpbuf_new(1024);
>      dpif_netlink_flow_to_ofpbuf(request, request_buf);
> -    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
> +    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
>      ofpbuf_delete(request_buf);
>
>      if (reply) {
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 0cd0850a3..2353ee1a2 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -3358,7 +3358,7 @@ start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
>          return false;
>      }
>      tcmsg->tcm_parent = 0;
> -    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
> +    nl_dump_start(NULL, &state->dump, NETLINK_ROUTE, &request);
>      ofpbuf_uninit(&request);
>
>      ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
> @@ -6722,7 +6722,7 @@ get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
>                          RTM_GETLINK, NLM_F_REQUEST);
>      ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
>      nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
> -    error = nl_transact(NETLINK_ROUTE, &request, &reply);
> +    error = nl_transact(NULL, NETLINK_ROUTE, &request, &reply);
>      ofpbuf_uninit(&request);
>      if (error) {
>          return error;
> @@ -6858,7 +6858,7 @@ netdev_linux_update_via_netlink(struct netdev_linux *netdev)
>
>      nl_msg_put_u32(&request, IFLA_EXT_MASK, RTEXT_FILTER_SKIP_STATS);
>
> -    error = nl_transact(NETLINK_ROUTE, &request, &reply);
> +    error = nl_transact(NULL, NETLINK_ROUTE, &request, &reply);
>      ofpbuf_uninit(&request);
>      if (error) {
>          ofpbuf_delete(reply);
> diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c
> index 0b3a8adf5..79ca17ec9 100644
> --- a/lib/netlink-conntrack.c
> +++ b/lib/netlink-conntrack.c
> @@ -144,7 +144,7 @@ nl_ct_dump_start(struct nl_ct_dump_state **statep, const uint16_t *zone,
>      if (zone) {
>          nl_msg_put_be16(&state->buf, CTA_ZONE, htons(*zone));
>      }
> -    nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf);
> +    nl_dump_start(NULL, &state->dump, NETLINK_NETFILTER, &state->buf);
>      ofpbuf_clear(&state->buf);
>
>      /* Buckets to store connections are not used. */
> @@ -236,7 +236,7 @@ nl_ct_flush(void)
>      nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
>                          IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
>
> -    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> +    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
>      ofpbuf_uninit(&buf);
>
>      /* Expectations are flushed automatically, because they do not
> @@ -260,7 +260,7 @@ nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple, uint16_t zone)
>          err = EOPNOTSUPP;
>          goto out;
>      }
> -    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> +    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
>  out:
>      ofpbuf_uninit(&buf);
>      return err;
> @@ -278,7 +278,7 @@ nl_ct_flush_zone_with_cta_zone(uint16_t flush_zone)
>                          IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
>      nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone));
>
> -    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> +    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
>      ofpbuf_uninit(&buf);
>
>      return err;
> @@ -340,7 +340,7 @@ nl_ct_flush_zone(uint16_t flush_zone)
>      nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
>                          IPCTNL_MSG_CT_GET, NLM_F_REQUEST);
>      nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone));
> -    nl_dump_start(&dump, NETLINK_NETFILTER, &buf);
> +    nl_dump_start(NULL, &dump, NETLINK_NETFILTER, &buf);
>      ofpbuf_clear(&buf);
>
>      for (;;) {
> @@ -374,7 +374,7 @@ nl_ct_flush_zone(uint16_t flush_zone)
>                            attrs[CTA_TUPLE_ORIG]->nla_len - NLA_HDRLEN);
>          nl_msg_put_unspec(&delete, CTA_ID, attrs[CTA_ID] + 1,
>                            attrs[CTA_ID]->nla_len - NLA_HDRLEN);
> -        nl_transact(NETLINK_NETFILTER, &delete, NULL);
> +        nl_transact(NULL, NETLINK_NETFILTER, &delete, NULL);
>          ofpbuf_clear(&delete);
>      }
>
> @@ -1101,7 +1101,7 @@ nl_ct_set_timeout_policy(const struct nl_ct_timeout_policy *nl_tp)
>      }
>      nl_msg_end_nested(&buf, offset);
>
> -    int err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> +    int err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
>      ofpbuf_uninit(&buf);
>      return err;
>  }
> @@ -1116,7 +1116,7 @@ nl_ct_get_timeout_policy(const char *tp_name,
>      nl_msg_put_nfgenmsg(&request, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK_TIMEOUT,
>                          IPCTNL_MSG_TIMEOUT_GET, NLM_F_REQUEST | NLM_F_ACK);
>      nl_msg_put_string(&request, CTA_TIMEOUT_NAME, tp_name);
> -    int err = nl_transact(NETLINK_NETFILTER, &request, &reply);
> +    int err = nl_transact(NULL, NETLINK_NETFILTER, &request, &reply);
>      if (err) {
>          goto out;
>      }
> @@ -1139,7 +1139,7 @@ nl_ct_del_timeout_policy(const char *tp_name)
>                          IPCTNL_MSG_TIMEOUT_DELETE, NLM_F_REQUEST | NLM_F_ACK);
>
>      nl_msg_put_string(&buf, CTA_TIMEOUT_NAME, tp_name);
> -    int err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> +    int err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
>      ofpbuf_uninit(&buf);
>      return err;
>  }
> @@ -1162,7 +1162,7 @@ nl_ct_timeout_policy_dump_start(
>                          IPCTNL_MSG_TIMEOUT_GET,
>                          NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP);
>
> -    nl_dump_start(&state->dump, NETLINK_NETFILTER, &request);
> +    nl_dump_start(NULL, &state->dump, NETLINK_NETFILTER, &request);
>      ofpbuf_uninit(&request);
>      ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
>      return 0;
> diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
> index 976ed15e8..801b4badd 100644
> --- a/lib/netlink-socket.c
> +++ b/lib/netlink-socket.c
> @@ -17,6 +17,7 @@
>  #include <config.h>
>  #include "netlink-socket.h"
>  #include <errno.h>
> +#include <fcntl.h>
>  #include <inttypes.h>
>  #include <stdlib.h>
>  #include <sys/socket.h>
> @@ -25,6 +26,7 @@
>  #include <unistd.h>
>  #include "coverage.h"
>  #include "openvswitch/dynamic-string.h"
> +#include "openvswitch/shash.h"
>  #include "hash.h"
>  #include "openvswitch/hmap.h"
>  #include "netlink.h"
> @@ -94,6 +96,7 @@ struct nl_sock {
>      uint32_t pid;
>      int protocol;
>      unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF). */
> +    char *netns;
>  };
>
>  /* Compile-time limit on iovecs, so that we can allocate a maximum-size array
> @@ -106,7 +109,7 @@ struct nl_sock {
>   * Initialized by nl_sock_create(). */
>  static int max_iovs;
>
> -static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
> +static int nl_pool_alloc(const char *, int protocol, struct nl_sock **sockp);
>  static void nl_pool_release(struct nl_sock *);
>
>  /* Creates a new netlink socket for the given netlink 'protocol'
> @@ -145,6 +148,7 @@ nl_sock_create(int protocol, struct nl_sock **sockp)
>
>      *sockp = NULL;
>      sock = xmalloc(sizeof *sock);
> +    sock->netns = NULL;
>
>  #ifdef _WIN32
>      sock->overlapped.hEvent = NULL;
> @@ -294,6 +298,7 @@ nl_sock_destroy(struct nl_sock *sock)
>  #else
>          close(sock->fd);
>  #endif
> +        free(sock->netns);
>          free(sock);
>      }
>  }
> @@ -1152,6 +1157,9 @@ nl_sock_drain(struct nl_sock *sock)
>   * Netlink socket created with the given 'protocol', and initializes 'dump' to
>   * reflect the state of the operation.
>   *
> + * The dump runs in the specified 'netns'. If 'netns' is NULL the current will
> + * be used.
> + *
>   * 'request' must contain a Netlink message.  Before sending the message,
>   * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
>   * set to the Netlink socket's pid.  NLM_F_DUMP and NLM_F_ACK will be set in
> @@ -1165,13 +1173,14 @@ nl_sock_drain(struct nl_sock *sock)
>   * The caller must eventually destroy 'request'.
>   */
>  void
> -nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
> +nl_dump_start(const char *netns, struct nl_dump *dump, int protocol,
> +              const struct ofpbuf *request)
>  {
>      nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
>
>      ovs_mutex_init(&dump->mutex);
>      ovs_mutex_lock(&dump->mutex);
> -    dump->status = nl_pool_alloc(protocol, &dump->sock);
> +    dump->status = nl_pool_alloc(netns, protocol, &dump->sock);
>      if (!dump->status) {
>          dump->status = nl_sock_send__(dump->sock, request,
>                                        nl_sock_allocate_seq(dump->sock, 1),
> @@ -1725,39 +1734,95 @@ struct nl_pool {
>      int n;
>  };
>
> +struct nl_ns_pool {
> +    struct nl_pool pools[MAX_LINKS];
> +};
> +
>  static struct ovs_mutex pool_mutex = OVS_MUTEX_INITIALIZER;
> -static struct nl_pool pools[MAX_LINKS] OVS_GUARDED_BY(pool_mutex);
> +static struct shash pools OVS_GUARDED_BY(pool_mutex) =
> +                                SHASH_INITIALIZER(&pools);
>
>  static int
> -nl_pool_alloc(int protocol, struct nl_sock **sockp)
> +nl_sock_ns_create(const char *netns, int protocol, struct nl_sock **sockp) {
> +    int ret, ns_fd, ns_default_fd, err;
> +    if (netns) {

How will this all work on BSD and Windows? Guess we need a proper return EOPNOTSUPP or something?

> +        ns_default_fd = open("/proc/self/ns/net", O_RDONLY);
> +        if (ns_default_fd < 0) {
> +            VLOG_ERR("something wrong when opening self net fd, %d\n",
> +                     errno);
> +            return -errno;
> +        }
> +        char *netns_path = xasprintf("/var/run/netns/%s", netns);
> +        ns_fd = open(netns_path, O_RDONLY);
> +        free(netns_path);
> +        if (ns_fd < 0) {
> +            VLOG_ERR("something wrong when opening other net fd, %d\n",
> +                     errno);

Should we include netns_path in the error log message?

Should we close ns_default_fd ?

> +            return -errno;
> +        }
> +        err = setns(ns_fd, CLONE_NEWNET);
> +        if (err < 0) {
> +            VLOG_ABORT("something wrong during setns to target, %d\n",
> +                       errno);

And abort is crashing OVS/OVN, are we sure we want to do this? What if someone decides to remove a netns, it might crash OVS.

> +        }
> +        close(ns_fd);
> +    }
> +    ret = nl_sock_create(protocol, sockp);
> +    if (netns) {
> +        err = setns(ns_default_fd, CLONE_NEWNET);
> +        if (err < 0) {
> +            VLOG_ABORT("something wrong during setns to home, %d\n",
> +                       errno);
> +        }
> +        close(ns_default_fd);
> +        if (*sockp) {
> +            (*sockp)->netns = xstrdup(netns);
> +        }
> +    }
> +    return ret;
> +}
> +
> +static int
> +nl_pool_alloc(const char *netns, int protocol, struct nl_sock **sockp)
>  {
>      struct nl_sock *sock = NULL;
>      struct nl_pool *pool;
>
> -    ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(pools));
> +    ovs_assert(protocol >= 0 && protocol < MAX_LINKS);

Maybe ARRAY_SIZE(ns_pools->pools)

>
>      ovs_mutex_lock(&pool_mutex);
> -    pool = &pools[protocol];
> -    if (pool->n > 0) {
> -        sock = pool->socks[--pool->n];
> +    const char * netns_name = netns ? netns : "";

Can you move this to the top of the function, with the other definitions?

> +
> +    struct nl_ns_pool *ns_pools = shash_find_data(&pools, netns_name);
> +    if (ns_pools) {
> +        pool = &ns_pools->pools[protocol];
> +        if (pool->n > 0) {
> +            sock = pool->socks[--pool->n];
> +        }
> +
> +        if (sock) {
> +            ovs_mutex_unlock(&pool_mutex);
> +            *sockp = sock;
> +            return 0;
> +        }
>      }
>      ovs_mutex_unlock(&pool_mutex);

Can we re-arrange the function a bit so we only have a single unlock()?
Something like (not even compile tested):

static int
nl_pool_alloc(const char *netns, int protocol, struct nl_sock **sockp)
{
    const char * netns_name = netns ? netns : "";
    struct nl_ns_pool *ns_pools;
    struct nl_sock *sock = NULL;
    struct nl_pool *pool;

    ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(ns_pools->pools));

    ovs_mutex_lock(&pool_mutex);
    ns_pools = shash_find_data(&pools, netns_name);
    if (ns_pools) {
        pool = &ns_pools->pools[protocol];
        if (pool->n > 0) {
            sock = pool->socks[--pool->n];
        }
    }
    ovs_mutex_unlock(&pool_mutex);

    if (sock) {
        *sockp = sock;
        return 0;
    }
    return nl_sock_ns_create(netns, protocol, sockp);
}

> -
> -    if (sock) {
> -        *sockp = sock;
> -        return 0;
> -    } else {
> -        return nl_sock_create(protocol, sockp);
> -    }
> +    return nl_sock_ns_create(netns, protocol, sockp);
>  }
>
>  static void
>  nl_pool_release(struct nl_sock *sock)
>  {
>      if (sock) {
> -        struct nl_pool *pool = &pools[sock->protocol];
> +        const char * netns_name = sock->netns ? sock->netns : "";
>
>          ovs_mutex_lock(&pool_mutex);
> +        struct nl_ns_pool *ns_pools = shash_find_data(&pools, netns_name);
> +        if (!ns_pools) {
> +            ns_pools = xzalloc(sizeof(*ns_pools));
> +            shash_add(&pools, netns_name, ns_pools);
> +        }
> +        struct nl_pool *pool = &ns_pools->pools[sock->protocol];
>          if (pool->n < ARRAY_SIZE(pool->socks)) {
>              pool->socks[pool->n++] = sock;
>              sock = NULL;
> @@ -1772,6 +1837,9 @@ nl_pool_release(struct nl_sock *sock)
>   * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.  If
>   * successful, returns 0.  On failure, returns a positive errno value.
>   *
> + * The 'request' is send in the specified netns. If 'netns' is NULL the current
> + * namespace is used.
> + *
>   * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
>   * reply, which the caller is responsible for freeing with ofpbuf_delete(), and
>   * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the kernel's
> @@ -1810,13 +1878,13 @@ nl_pool_release(struct nl_sock *sock)
>   *         needs to be idempotent.
>   */
>  int
> -nl_transact(int protocol, const struct ofpbuf *request,
> +nl_transact(const char *netns, int protocol, const struct ofpbuf *request,
>              struct ofpbuf **replyp)
>  {
>      struct nl_sock *sock;
>      int error;
>
> -    error = nl_pool_alloc(protocol, &sock);
> +    error = nl_pool_alloc(netns, protocol, &sock);
>      if (error) {
>          if (replyp) {
>              *replyp = NULL;
> @@ -1851,13 +1919,13 @@ nl_transact(int protocol, const struct ofpbuf *request,
>   * nl_transact() for some caveats.
>   */
>  void
> -nl_transact_multiple(int protocol,
> +nl_transact_multiple(const char *netns, int protocol,
>                       struct nl_transaction **transactions, size_t n)
>  {
>      struct nl_sock *sock;
>      int error;
>
> -    error = nl_pool_alloc(protocol, &sock);
> +    error = nl_pool_alloc(netns, protocol, &sock);
>      if (!error) {
>          nl_sock_transact_multiple(sock, transactions, n);
>          nl_pool_release(sock);
> diff --git a/lib/netlink-socket.h b/lib/netlink-socket.h
> index 7852ad052..e4d645a62 100644
> --- a/lib/netlink-socket.h
> +++ b/lib/netlink-socket.h
> @@ -251,9 +251,10 @@ struct nl_transaction {
>  };
>
>  /* Transactions without an allocated socket. */
> -int nl_transact(int protocol, const struct ofpbuf *request,
> +int nl_transact(const char *netns, int protocol, const struct ofpbuf *request,
>                  struct ofpbuf **replyp);
> -void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n);
> +void nl_transact_multiple(const char *netns, int protocol,
> +                          struct nl_transaction **, size_t n);
>
>  /* Table dumping. */
>  #define NL_DUMP_BUFSIZE         4096
> @@ -270,7 +271,7 @@ struct nl_dump {
>      struct ovs_mutex mutex;     /* Protects 'status', synchronizes recv(). */
>  };
>
> -void nl_dump_start(struct nl_dump *, int protocol,
> +void nl_dump_start(const char *netns, struct nl_dump *dump, int protocol,
>                     const struct ofpbuf *request);
>  bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf);
>  int nl_dump_done(struct nl_dump *);
> diff --git a/lib/route-table.c b/lib/route-table.c
> index c6cb21394..d5d98585e 100644
> --- a/lib/route-table.c
> +++ b/lib/route-table.c
> @@ -156,7 +156,7 @@ route_table_wait(void)
>  }
>
>  static bool
> -route_table_dump_one_table(unsigned char id)
> +route_table_dump_one_table(const char *netns, unsigned char id)
>  {
>      uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
>      struct ofpbuf request, reply, buf;
> @@ -172,7 +172,7 @@ route_table_dump_one_table(unsigned char id)
>      rq_msg->rtm_family = AF_UNSPEC;
>      rq_msg->rtm_table = id;
>
> -    nl_dump_start(&dump, NETLINK_ROUTE, &request);
> +    nl_dump_start(netns, &dump, NETLINK_ROUTE, &request);
>      ofpbuf_uninit(&request);
>
>      ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
> @@ -212,7 +212,7 @@ route_table_reset(void)
>      COVERAGE_INC(route_table_dump);
>
>      for (size_t i = 0; i < ARRAY_SIZE(tables); i++) {
> -        if (!route_table_dump_one_table(tables[i])) {
> +        if (!route_table_dump_one_table(NULL, tables[i])) {
>              /* Got unfiltered reply, no need to dump further. */
>              break;
>          }
> diff --git a/lib/tc.c b/lib/tc.c
> index e55ba3b1b..56f4ca7a0 100644
> --- a/lib/tc.c
> +++ b/lib/tc.c
> @@ -244,7 +244,7 @@ static void request_from_tcf_id(struct tcf_id *id, uint16_t eth_type,
>  int
>  tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
>  {
> -    int error = nl_transact(NETLINK_ROUTE, request, replyp);
> +    int error = nl_transact(NULL, NETLINK_ROUTE, request, replyp);
>      ofpbuf_uninit(request);
>      return error;
>  }
> @@ -2349,7 +2349,7 @@ tc_dump_flower_start(struct tcf_id *id, struct nl_dump *dump, bool terse)
>          nl_msg_put_unspec(&request, TCA_DUMP_FLAGS, &dump_flags,
>                            sizeof dump_flags);
>      }
> -    nl_dump_start(dump, NETLINK_ROUTE, &request);
> +    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
>      ofpbuf_uninit(&request);
>
>      return 0;
> @@ -2361,7 +2361,7 @@ tc_dump_tc_chain_start(struct tcf_id *id, struct nl_dump *dump)
>      struct ofpbuf request;
>
>      request_from_tcf_id(id, 0, RTM_GETCHAIN, NLM_F_DUMP, &request);
> -    nl_dump_start(dump, NETLINK_ROUTE, &request);
> +    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
>      ofpbuf_uninit(&request);
>
>      return 0;
> @@ -2380,7 +2380,7 @@ tc_dump_tc_action_start(char *name, struct nl_dump *dump)
>      nl_msg_end_nested(&request, offset);
>      nl_msg_end_nested(&request, root_offset);
>
> -    nl_dump_start(dump, NETLINK_ROUTE, &request);
> +    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
>      ofpbuf_uninit(&request);
>
>      return 0;
> --
Felix Huettner Nov. 19, 2024, 8:24 a.m. UTC | #2
On Thu, Nov 14, 2024 at 01:26:25PM +0100, Eelco Chaudron wrote:
> On 24 Oct 2024, at 17:45, Felix Huettner via dev wrote:
> 
> > In the future we need to access routing table entries in network
> > namespaces.
> > As netlink has no easy way to get a socket in a network namespace we
> > copy the logic of frr where we:
> > 1. enter the target netns
> > 2. open a socket
> > 3. jump back to our original netns
> >
> > the socket will keep working between these jumps.
> 
> Hi Felix,
> 

Hi Eelco,

thanks for the review.

> Thanks for the patches. I did review the series, but did not do any actual testing. It would be good to add some test cases, and add NEWS section.

I have added tests and the news entry and will share them as part of v2.

> 
> On this patch; Given the number of changes and the fact that netns is ultimately used only in three instances, I’d recommend adding a nl_ns_transact() and nl_ns_dump_start() instead.

I adapted it and it is a lot nicer now, thanks.

> 
> Some additional comments below.
> 
> Eelco
> 
> > Signed-off-by: Felix Huettner <felix.huettner@stackit.cloud>
> > ---
> >  lib/dpif-netlink-rtnl.c |   6 +--
> >  lib/dpif-netlink.c      |  22 ++++----
> >  lib/netdev-linux.c      |   6 +--
> >  lib/netlink-conntrack.c |  20 ++++----
> >  lib/netlink-socket.c    | 110 ++++++++++++++++++++++++++++++++--------
> >  lib/netlink-socket.h    |   7 +--
> >  lib/route-table.c       |   6 +--
> >  lib/tc.c                |   8 +--
> >  8 files changed, 127 insertions(+), 58 deletions(-)
> >
> > diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c
> > index f7035333e..2df905d92 100644
> > --- a/lib/dpif-netlink-rtnl.c
> > +++ b/lib/dpif-netlink-rtnl.c
> > @@ -158,7 +158,7 @@ rtnl_transact(uint32_t type, uint32_t flags, const char *name,
> >      ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
> >      nl_msg_put_string(&request, IFLA_IFNAME, name);
> >
> > -    err = nl_transact(NETLINK_ROUTE, &request, reply);
> > +    err = nl_transact(NULL, NETLINK_ROUTE, &request, reply);
> >      ofpbuf_uninit(&request);
> >
> >      return err;
> > @@ -342,7 +342,7 @@ rtnl_set_mtu(const char *name, uint32_t mtu, struct ofpbuf *request)
> >      nl_msg_put_string(request, IFLA_IFNAME, name);
> >      nl_msg_put_u32(request, IFLA_MTU, mtu);
> >
> > -    return nl_transact(NETLINK_ROUTE, request, NULL);
> > +    return nl_transact(NULL, NETLINK_ROUTE, request, NULL);
> >  }
> >
> >  static int
> > @@ -425,7 +425,7 @@ dpif_netlink_rtnl_create(const struct netdev_tunnel_config *tnl_cfg,
> >      nl_msg_end_nested(&request, infodata_off);
> >      nl_msg_end_nested(&request, linkinfo_off);
> >
> > -    err = nl_transact(NETLINK_ROUTE, &request, NULL);
> > +    err = nl_transact(NULL, NETLINK_ROUTE, &request, NULL);
> >      if (!err && (type == OVS_VPORT_TYPE_GRE ||
> >                   type == OVS_VPORT_TYPE_IP6GRE)) {
> >          /* Work around a bug in kernel GRE driver, which ignores IFLA_MTU in
> > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> > index 84e2bd8ea..3e4e03697 100644
> > --- a/lib/dpif-netlink.c
> > +++ b/lib/dpif-netlink.c
> > @@ -1348,7 +1348,7 @@ dpif_netlink_port_dump_start__(const struct dpif_netlink *dpif,
> >
> >      buf = ofpbuf_new(1024);
> >      dpif_netlink_vport_to_ofpbuf(&request, buf);
> > -    nl_dump_start(dump, NETLINK_GENERIC, buf);
> > +    nl_dump_start(NULL, dump, NETLINK_GENERIC, buf);
> >      ofpbuf_delete(buf);
> >  }
> >
> > @@ -1682,7 +1682,7 @@ dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse,
> >
> >          buf = ofpbuf_new(1024);
> >          dpif_netlink_flow_to_ofpbuf(&request, buf);
> > -        nl_dump_start(&dump->nl_dump, NETLINK_GENERIC, buf);
> > +        nl_dump_start(NULL, &dump->nl_dump, NETLINK_GENERIC, buf);
> >          ofpbuf_delete(buf);
> >      }
> >      atomic_init(&dump->status, 0);
> > @@ -2141,7 +2141,7 @@ dpif_netlink_operate__(struct dpif_netlink *dpif,
> >      for (i = 0; i < n_ops; i++) {
> >          txnsp[i] = &auxes[i].txn;
> >      }
> > -    nl_transact_multiple(NETLINK_GENERIC, txnsp, n_ops);
> > +    nl_transact_multiple(NULL, NETLINK_GENERIC, txnsp, n_ops);
> >
> >      for (i = 0; i < n_ops; i++) {
> >          struct op_auxdata *aux = &auxes[i];
> > @@ -3391,7 +3391,7 @@ dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED,
> >      }
> >      nl_msg_end_nested(request, opt_offset);
> >
> > -    int err = nl_transact(NETLINK_GENERIC, request, NULL);
> > +    int err = nl_transact(NULL, NETLINK_GENERIC, request, NULL);
> >      ofpbuf_delete(request);
> >      return err;
> >  }
> > @@ -3476,7 +3476,7 @@ dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED,
> >      }
> >
> >      struct ofpbuf *reply;
> > -    int err = nl_transact(NETLINK_GENERIC, request, &reply);
> > +    int err = nl_transact(NULL, NETLINK_GENERIC, request, &reply);
> >      if (err) {
> >          goto out;
> >      }
> > @@ -3520,7 +3520,7 @@ dpif_netlink_ct_del_limits(struct dpif *dpif OVS_UNUSED,
> >          nl_msg_end_nested(request, opt_offset);
> >      }
> >
> > -    int err = nl_transact(NETLINK_GENERIC, request, NULL);
> > +    int err = nl_transact(NULL, NETLINK_GENERIC, request, NULL);
> >
> >      ofpbuf_delete(request);
> >      return err;
> > @@ -4066,7 +4066,7 @@ dpif_netlink_meter_transact(struct ofpbuf *request, struct ofpbuf **replyp,
> >                              const struct nl_policy *reply_policy,
> >                              struct nlattr **a, size_t size_a)
> >  {
> > -    int error = nl_transact(NETLINK_GENERIC, request, replyp);
> > +    int error = nl_transact(NULL, NETLINK_GENERIC, request, replyp);
> >      ofpbuf_uninit(request);
> >
> >      if (error) {
> > @@ -4818,7 +4818,7 @@ dpif_netlink_vport_transact(const struct dpif_netlink_vport *request,
> >
> >      request_buf = ofpbuf_new(1024);
> >      dpif_netlink_vport_to_ofpbuf(request, request_buf);
> > -    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
> > +    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
> >      ofpbuf_delete(request_buf);
> >
> >      if (reply) {
> > @@ -4969,7 +4969,7 @@ dpif_netlink_dp_dump_start(struct nl_dump *dump)
> >
> >      buf = ofpbuf_new(1024);
> >      dpif_netlink_dp_to_ofpbuf(&request, buf);
> > -    nl_dump_start(dump, NETLINK_GENERIC, buf);
> > +    nl_dump_start(NULL, dump, NETLINK_GENERIC, buf);
> >      ofpbuf_delete(buf);
> >  }
> >
> > @@ -4990,7 +4990,7 @@ dpif_netlink_dp_transact(const struct dpif_netlink_dp *request,
> >
> >      request_buf = ofpbuf_new(1024);
> >      dpif_netlink_dp_to_ofpbuf(request, request_buf);
> > -    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
> > +    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
> >      ofpbuf_delete(request_buf);
> >
> >      if (reply) {
> > @@ -5223,7 +5223,7 @@ dpif_netlink_flow_transact(struct dpif_netlink_flow *request,
> >
> >      request_buf = ofpbuf_new(1024);
> >      dpif_netlink_flow_to_ofpbuf(request, request_buf);
> > -    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
> > +    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
> >      ofpbuf_delete(request_buf);
> >
> >      if (reply) {
> > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> > index 0cd0850a3..2353ee1a2 100644
> > --- a/lib/netdev-linux.c
> > +++ b/lib/netdev-linux.c
> > @@ -3358,7 +3358,7 @@ start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
> >          return false;
> >      }
> >      tcmsg->tcm_parent = 0;
> > -    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
> > +    nl_dump_start(NULL, &state->dump, NETLINK_ROUTE, &request);
> >      ofpbuf_uninit(&request);
> >
> >      ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
> > @@ -6722,7 +6722,7 @@ get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
> >                          RTM_GETLINK, NLM_F_REQUEST);
> >      ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
> >      nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
> > -    error = nl_transact(NETLINK_ROUTE, &request, &reply);
> > +    error = nl_transact(NULL, NETLINK_ROUTE, &request, &reply);
> >      ofpbuf_uninit(&request);
> >      if (error) {
> >          return error;
> > @@ -6858,7 +6858,7 @@ netdev_linux_update_via_netlink(struct netdev_linux *netdev)
> >
> >      nl_msg_put_u32(&request, IFLA_EXT_MASK, RTEXT_FILTER_SKIP_STATS);
> >
> > -    error = nl_transact(NETLINK_ROUTE, &request, &reply);
> > +    error = nl_transact(NULL, NETLINK_ROUTE, &request, &reply);
> >      ofpbuf_uninit(&request);
> >      if (error) {
> >          ofpbuf_delete(reply);
> > diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c
> > index 0b3a8adf5..79ca17ec9 100644
> > --- a/lib/netlink-conntrack.c
> > +++ b/lib/netlink-conntrack.c
> > @@ -144,7 +144,7 @@ nl_ct_dump_start(struct nl_ct_dump_state **statep, const uint16_t *zone,
> >      if (zone) {
> >          nl_msg_put_be16(&state->buf, CTA_ZONE, htons(*zone));
> >      }
> > -    nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf);
> > +    nl_dump_start(NULL, &state->dump, NETLINK_NETFILTER, &state->buf);
> >      ofpbuf_clear(&state->buf);
> >
> >      /* Buckets to store connections are not used. */
> > @@ -236,7 +236,7 @@ nl_ct_flush(void)
> >      nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
> >                          IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
> >
> > -    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> > +    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
> >      ofpbuf_uninit(&buf);
> >
> >      /* Expectations are flushed automatically, because they do not
> > @@ -260,7 +260,7 @@ nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple, uint16_t zone)
> >          err = EOPNOTSUPP;
> >          goto out;
> >      }
> > -    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> > +    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
> >  out:
> >      ofpbuf_uninit(&buf);
> >      return err;
> > @@ -278,7 +278,7 @@ nl_ct_flush_zone_with_cta_zone(uint16_t flush_zone)
> >                          IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
> >      nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone));
> >
> > -    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> > +    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
> >      ofpbuf_uninit(&buf);
> >
> >      return err;
> > @@ -340,7 +340,7 @@ nl_ct_flush_zone(uint16_t flush_zone)
> >      nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
> >                          IPCTNL_MSG_CT_GET, NLM_F_REQUEST);
> >      nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone));
> > -    nl_dump_start(&dump, NETLINK_NETFILTER, &buf);
> > +    nl_dump_start(NULL, &dump, NETLINK_NETFILTER, &buf);
> >      ofpbuf_clear(&buf);
> >
> >      for (;;) {
> > @@ -374,7 +374,7 @@ nl_ct_flush_zone(uint16_t flush_zone)
> >                            attrs[CTA_TUPLE_ORIG]->nla_len - NLA_HDRLEN);
> >          nl_msg_put_unspec(&delete, CTA_ID, attrs[CTA_ID] + 1,
> >                            attrs[CTA_ID]->nla_len - NLA_HDRLEN);
> > -        nl_transact(NETLINK_NETFILTER, &delete, NULL);
> > +        nl_transact(NULL, NETLINK_NETFILTER, &delete, NULL);
> >          ofpbuf_clear(&delete);
> >      }
> >
> > @@ -1101,7 +1101,7 @@ nl_ct_set_timeout_policy(const struct nl_ct_timeout_policy *nl_tp)
> >      }
> >      nl_msg_end_nested(&buf, offset);
> >
> > -    int err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> > +    int err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
> >      ofpbuf_uninit(&buf);
> >      return err;
> >  }
> > @@ -1116,7 +1116,7 @@ nl_ct_get_timeout_policy(const char *tp_name,
> >      nl_msg_put_nfgenmsg(&request, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK_TIMEOUT,
> >                          IPCTNL_MSG_TIMEOUT_GET, NLM_F_REQUEST | NLM_F_ACK);
> >      nl_msg_put_string(&request, CTA_TIMEOUT_NAME, tp_name);
> > -    int err = nl_transact(NETLINK_NETFILTER, &request, &reply);
> > +    int err = nl_transact(NULL, NETLINK_NETFILTER, &request, &reply);
> >      if (err) {
> >          goto out;
> >      }
> > @@ -1139,7 +1139,7 @@ nl_ct_del_timeout_policy(const char *tp_name)
> >                          IPCTNL_MSG_TIMEOUT_DELETE, NLM_F_REQUEST | NLM_F_ACK);
> >
> >      nl_msg_put_string(&buf, CTA_TIMEOUT_NAME, tp_name);
> > -    int err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> > +    int err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
> >      ofpbuf_uninit(&buf);
> >      return err;
> >  }
> > @@ -1162,7 +1162,7 @@ nl_ct_timeout_policy_dump_start(
> >                          IPCTNL_MSG_TIMEOUT_GET,
> >                          NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP);
> >
> > -    nl_dump_start(&state->dump, NETLINK_NETFILTER, &request);
> > +    nl_dump_start(NULL, &state->dump, NETLINK_NETFILTER, &request);
> >      ofpbuf_uninit(&request);
> >      ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
> >      return 0;
> > diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
> > index 976ed15e8..801b4badd 100644
> > --- a/lib/netlink-socket.c
> > +++ b/lib/netlink-socket.c
> > @@ -17,6 +17,7 @@
> >  #include <config.h>
> >  #include "netlink-socket.h"
> >  #include <errno.h>
> > +#include <fcntl.h>
> >  #include <inttypes.h>
> >  #include <stdlib.h>
> >  #include <sys/socket.h>
> > @@ -25,6 +26,7 @@
> >  #include <unistd.h>
> >  #include "coverage.h"
> >  #include "openvswitch/dynamic-string.h"
> > +#include "openvswitch/shash.h"
> >  #include "hash.h"
> >  #include "openvswitch/hmap.h"
> >  #include "netlink.h"
> > @@ -94,6 +96,7 @@ struct nl_sock {
> >      uint32_t pid;
> >      int protocol;
> >      unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF). */
> > +    char *netns;
> >  };
> >
> >  /* Compile-time limit on iovecs, so that we can allocate a maximum-size array
> > @@ -106,7 +109,7 @@ struct nl_sock {
> >   * Initialized by nl_sock_create(). */
> >  static int max_iovs;
> >
> > -static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
> > +static int nl_pool_alloc(const char *, int protocol, struct nl_sock **sockp);
> >  static void nl_pool_release(struct nl_sock *);
> >
> >  /* Creates a new netlink socket for the given netlink 'protocol'
> > @@ -145,6 +148,7 @@ nl_sock_create(int protocol, struct nl_sock **sockp)
> >
> >      *sockp = NULL;
> >      sock = xmalloc(sizeof *sock);
> > +    sock->netns = NULL;
> >
> >  #ifdef _WIN32
> >      sock->overlapped.hEvent = NULL;
> > @@ -294,6 +298,7 @@ nl_sock_destroy(struct nl_sock *sock)
> >  #else
> >          close(sock->fd);
> >  #endif
> > +        free(sock->netns);
> >          free(sock);
> >      }
> >  }
> > @@ -1152,6 +1157,9 @@ nl_sock_drain(struct nl_sock *sock)
> >   * Netlink socket created with the given 'protocol', and initializes 'dump' to
> >   * reflect the state of the operation.
> >   *
> > + * The dump runs in the specified 'netns'. If 'netns' is NULL the current will
> > + * be used.
> > + *
> >   * 'request' must contain a Netlink message.  Before sending the message,
> >   * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
> >   * set to the Netlink socket's pid.  NLM_F_DUMP and NLM_F_ACK will be set in
> > @@ -1165,13 +1173,14 @@ nl_sock_drain(struct nl_sock *sock)
> >   * The caller must eventually destroy 'request'.
> >   */
> >  void
> > -nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
> > +nl_dump_start(const char *netns, struct nl_dump *dump, int protocol,
> > +              const struct ofpbuf *request)
> >  {
> >      nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
> >
> >      ovs_mutex_init(&dump->mutex);
> >      ovs_mutex_lock(&dump->mutex);
> > -    dump->status = nl_pool_alloc(protocol, &dump->sock);
> > +    dump->status = nl_pool_alloc(netns, protocol, &dump->sock);
> >      if (!dump->status) {
> >          dump->status = nl_sock_send__(dump->sock, request,
> >                                        nl_sock_allocate_seq(dump->sock, 1),
> > @@ -1725,39 +1734,95 @@ struct nl_pool {
> >      int n;
> >  };
> >
> > +struct nl_ns_pool {
> > +    struct nl_pool pools[MAX_LINKS];
> > +};
> > +
> >  static struct ovs_mutex pool_mutex = OVS_MUTEX_INITIALIZER;
> > -static struct nl_pool pools[MAX_LINKS] OVS_GUARDED_BY(pool_mutex);
> > +static struct shash pools OVS_GUARDED_BY(pool_mutex) =
> > +                                SHASH_INITIALIZER(&pools);
> >
> >  static int
> > -nl_pool_alloc(int protocol, struct nl_sock **sockp)
> > +nl_sock_ns_create(const char *netns, int protocol, struct nl_sock **sockp) {
> > +    int ret, ns_fd, ns_default_fd, err;
> > +    if (netns) {
> 
> How will this all work on BSD and Windows? Guess we need a proper return EOPNOTSUPP or something?

I added something for that.

> 
> > +        ns_default_fd = open("/proc/self/ns/net", O_RDONLY);
> > +        if (ns_default_fd < 0) {
> > +            VLOG_ERR("something wrong when opening self net fd, %d\n",
> > +                     errno);
> > +            return -errno;
> > +        }
> > +        char *netns_path = xasprintf("/var/run/netns/%s", netns);
> > +        ns_fd = open(netns_path, O_RDONLY);
> > +        free(netns_path);
> > +        if (ns_fd < 0) {
> > +            VLOG_ERR("something wrong when opening other net fd, %d\n",
> > +                     errno);
> 
> Should we include netns_path in the error log message?

Yes, added it.

> 
> Should we close ns_default_fd ?

It is closed below after nl_sock_create.
Basically it is our FD to get back to the original namespace.

> 
> > +            return -errno;
> > +        }
> > +        err = setns(ns_fd, CLONE_NEWNET);
> > +        if (err < 0) {
> > +            VLOG_ABORT("something wrong during setns to target, %d\n",
> > +                       errno);
> 
> And abort is crashing OVS/OVN, are we sure we want to do this? What if someone decides to remove a netns, it might crash OVS.

Ok so i just checked the kernel sources.
When calling setns if there is any error returned we are still in the
original namespace. Which means that here we can still savely get out of
this situation by returning an error.

However in the setns call below we will need to abort. In that case we
would be stuck in the netns we where temporarily in and have lost our
chance to jump back.
This should not actually ever happen, since we know that the namespace
is valid when we create it above.

> 
> > +        }
> > +        close(ns_fd);
> > +    }
> > +    ret = nl_sock_create(protocol, sockp);
> > +    if (netns) {
> > +        err = setns(ns_default_fd, CLONE_NEWNET);
> > +        if (err < 0) {
> > +            VLOG_ABORT("something wrong during setns to home, %d\n",
> > +                       errno);
> > +        }
> > +        close(ns_default_fd);
> > +        if (*sockp) {
> > +            (*sockp)->netns = xstrdup(netns);
> > +        }
> > +    }
> > +    return ret;
> > +}
> > +
> > +static int
> > +nl_pool_alloc(const char *netns, int protocol, struct nl_sock **sockp)
> >  {
> >      struct nl_sock *sock = NULL;
> >      struct nl_pool *pool;
> >
> > -    ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(pools));
> > +    ovs_assert(protocol >= 0 && protocol < MAX_LINKS);
> 
> Maybe ARRAY_SIZE(ns_pools->pools)

Thanks fixed.

> 
> >
> >      ovs_mutex_lock(&pool_mutex);
> > -    pool = &pools[protocol];
> > -    if (pool->n > 0) {
> > -        sock = pool->socks[--pool->n];
> > +    const char * netns_name = netns ? netns : "";
> 
> Can you move this to the top of the function, with the other definitions?

Done

> 
> > +
> > +    struct nl_ns_pool *ns_pools = shash_find_data(&pools, netns_name);
> > +    if (ns_pools) {
> > +        pool = &ns_pools->pools[protocol];
> > +        if (pool->n > 0) {
> > +            sock = pool->socks[--pool->n];
> > +        }
> > +
> > +        if (sock) {
> > +            ovs_mutex_unlock(&pool_mutex);
> > +            *sockp = sock;
> > +            return 0;
> > +        }
> >      }
> >      ovs_mutex_unlock(&pool_mutex);
> 
> Can we re-arrange the function a bit so we only have a single unlock()?
> Something like (not even compile tested):
> 
> static int
> nl_pool_alloc(const char *netns, int protocol, struct nl_sock **sockp)
> {
>     const char * netns_name = netns ? netns : "";
>     struct nl_ns_pool *ns_pools;
>     struct nl_sock *sock = NULL;
>     struct nl_pool *pool;
> 
>     ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(ns_pools->pools));
> 
>     ovs_mutex_lock(&pool_mutex);
>     ns_pools = shash_find_data(&pools, netns_name);
>     if (ns_pools) {
>         pool = &ns_pools->pools[protocol];
>         if (pool->n > 0) {
>             sock = pool->socks[--pool->n];
>         }
>     }
>     ovs_mutex_unlock(&pool_mutex);
> 
>     if (sock) {
>         *sockp = sock;
>         return 0;
>     }
>     return nl_sock_ns_create(netns, protocol, sockp);
> }

Thanks a lot. I'll take that.

> 
> > -
> > -    if (sock) {
> > -        *sockp = sock;
> > -        return 0;
> > -    } else {
> > -        return nl_sock_create(protocol, sockp);
> > -    }
> > +    return nl_sock_ns_create(netns, protocol, sockp);
> >  }
> >
> >  static void
> >  nl_pool_release(struct nl_sock *sock)
> >  {
> >      if (sock) {
> > -        struct nl_pool *pool = &pools[sock->protocol];
> > +        const char * netns_name = sock->netns ? sock->netns : "";
> >
> >          ovs_mutex_lock(&pool_mutex);
> > +        struct nl_ns_pool *ns_pools = shash_find_data(&pools, netns_name);
> > +        if (!ns_pools) {
> > +            ns_pools = xzalloc(sizeof(*ns_pools));
> > +            shash_add(&pools, netns_name, ns_pools);
> > +        }
> > +        struct nl_pool *pool = &ns_pools->pools[sock->protocol];
> >          if (pool->n < ARRAY_SIZE(pool->socks)) {
> >              pool->socks[pool->n++] = sock;
> >              sock = NULL;
> > @@ -1772,6 +1837,9 @@ nl_pool_release(struct nl_sock *sock)
> >   * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.  If
> >   * successful, returns 0.  On failure, returns a positive errno value.
> >   *
> > + * The 'request' is send in the specified netns. If 'netns' is NULL the current
> > + * namespace is used.
> > + *
> >   * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
> >   * reply, which the caller is responsible for freeing with ofpbuf_delete(), and
> >   * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the kernel's
> > @@ -1810,13 +1878,13 @@ nl_pool_release(struct nl_sock *sock)
> >   *         needs to be idempotent.
> >   */
> >  int
> > -nl_transact(int protocol, const struct ofpbuf *request,
> > +nl_transact(const char *netns, int protocol, const struct ofpbuf *request,
> >              struct ofpbuf **replyp)
> >  {
> >      struct nl_sock *sock;
> >      int error;
> >
> > -    error = nl_pool_alloc(protocol, &sock);
> > +    error = nl_pool_alloc(netns, protocol, &sock);
> >      if (error) {
> >          if (replyp) {
> >              *replyp = NULL;
> > @@ -1851,13 +1919,13 @@ nl_transact(int protocol, const struct ofpbuf *request,
> >   * nl_transact() for some caveats.
> >   */
> >  void
> > -nl_transact_multiple(int protocol,
> > +nl_transact_multiple(const char *netns, int protocol,
> >                       struct nl_transaction **transactions, size_t n)
> >  {
> >      struct nl_sock *sock;
> >      int error;
> >
> > -    error = nl_pool_alloc(protocol, &sock);
> > +    error = nl_pool_alloc(netns, protocol, &sock);
> >      if (!error) {
> >          nl_sock_transact_multiple(sock, transactions, n);
> >          nl_pool_release(sock);
> > diff --git a/lib/netlink-socket.h b/lib/netlink-socket.h
> > index 7852ad052..e4d645a62 100644
> > --- a/lib/netlink-socket.h
> > +++ b/lib/netlink-socket.h
> > @@ -251,9 +251,10 @@ struct nl_transaction {
> >  };
> >
> >  /* Transactions without an allocated socket. */
> > -int nl_transact(int protocol, const struct ofpbuf *request,
> > +int nl_transact(const char *netns, int protocol, const struct ofpbuf *request,
> >                  struct ofpbuf **replyp);
> > -void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n);
> > +void nl_transact_multiple(const char *netns, int protocol,
> > +                          struct nl_transaction **, size_t n);
> >
> >  /* Table dumping. */
> >  #define NL_DUMP_BUFSIZE         4096
> > @@ -270,7 +271,7 @@ struct nl_dump {
> >      struct ovs_mutex mutex;     /* Protects 'status', synchronizes recv(). */
> >  };
> >
> > -void nl_dump_start(struct nl_dump *, int protocol,
> > +void nl_dump_start(const char *netns, struct nl_dump *dump, int protocol,
> >                     const struct ofpbuf *request);
> >  bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf);
> >  int nl_dump_done(struct nl_dump *);
> > diff --git a/lib/route-table.c b/lib/route-table.c
> > index c6cb21394..d5d98585e 100644
> > --- a/lib/route-table.c
> > +++ b/lib/route-table.c
> > @@ -156,7 +156,7 @@ route_table_wait(void)
> >  }
> >
> >  static bool
> > -route_table_dump_one_table(unsigned char id)
> > +route_table_dump_one_table(const char *netns, unsigned char id)
> >  {
> >      uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
> >      struct ofpbuf request, reply, buf;
> > @@ -172,7 +172,7 @@ route_table_dump_one_table(unsigned char id)
> >      rq_msg->rtm_family = AF_UNSPEC;
> >      rq_msg->rtm_table = id;
> >
> > -    nl_dump_start(&dump, NETLINK_ROUTE, &request);
> > +    nl_dump_start(netns, &dump, NETLINK_ROUTE, &request);
> >      ofpbuf_uninit(&request);
> >
> >      ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
> > @@ -212,7 +212,7 @@ route_table_reset(void)
> >      COVERAGE_INC(route_table_dump);
> >
> >      for (size_t i = 0; i < ARRAY_SIZE(tables); i++) {
> > -        if (!route_table_dump_one_table(tables[i])) {
> > +        if (!route_table_dump_one_table(NULL, tables[i])) {
> >              /* Got unfiltered reply, no need to dump further. */
> >              break;
> >          }
> > diff --git a/lib/tc.c b/lib/tc.c
> > index e55ba3b1b..56f4ca7a0 100644
> > --- a/lib/tc.c
> > +++ b/lib/tc.c
> > @@ -244,7 +244,7 @@ static void request_from_tcf_id(struct tcf_id *id, uint16_t eth_type,
> >  int
> >  tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
> >  {
> > -    int error = nl_transact(NETLINK_ROUTE, request, replyp);
> > +    int error = nl_transact(NULL, NETLINK_ROUTE, request, replyp);
> >      ofpbuf_uninit(request);
> >      return error;
> >  }
> > @@ -2349,7 +2349,7 @@ tc_dump_flower_start(struct tcf_id *id, struct nl_dump *dump, bool terse)
> >          nl_msg_put_unspec(&request, TCA_DUMP_FLAGS, &dump_flags,
> >                            sizeof dump_flags);
> >      }
> > -    nl_dump_start(dump, NETLINK_ROUTE, &request);
> > +    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
> >      ofpbuf_uninit(&request);
> >
> >      return 0;
> > @@ -2361,7 +2361,7 @@ tc_dump_tc_chain_start(struct tcf_id *id, struct nl_dump *dump)
> >      struct ofpbuf request;
> >
> >      request_from_tcf_id(id, 0, RTM_GETCHAIN, NLM_F_DUMP, &request);
> > -    nl_dump_start(dump, NETLINK_ROUTE, &request);
> > +    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
> >      ofpbuf_uninit(&request);
> >
> >      return 0;
> > @@ -2380,7 +2380,7 @@ tc_dump_tc_action_start(char *name, struct nl_dump *dump)
> >      nl_msg_end_nested(&request, offset);
> >      nl_msg_end_nested(&request, root_offset);
> >
> > -    nl_dump_start(dump, NETLINK_ROUTE, &request);
> > +    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
> >      ofpbuf_uninit(&request);
> >
> >      return 0;
> > -- 
>
diff mbox series

Patch

diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c
index f7035333e..2df905d92 100644
--- a/lib/dpif-netlink-rtnl.c
+++ b/lib/dpif-netlink-rtnl.c
@@ -158,7 +158,7 @@  rtnl_transact(uint32_t type, uint32_t flags, const char *name,
     ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
     nl_msg_put_string(&request, IFLA_IFNAME, name);
 
-    err = nl_transact(NETLINK_ROUTE, &request, reply);
+    err = nl_transact(NULL, NETLINK_ROUTE, &request, reply);
     ofpbuf_uninit(&request);
 
     return err;
@@ -342,7 +342,7 @@  rtnl_set_mtu(const char *name, uint32_t mtu, struct ofpbuf *request)
     nl_msg_put_string(request, IFLA_IFNAME, name);
     nl_msg_put_u32(request, IFLA_MTU, mtu);
 
-    return nl_transact(NETLINK_ROUTE, request, NULL);
+    return nl_transact(NULL, NETLINK_ROUTE, request, NULL);
 }
 
 static int
@@ -425,7 +425,7 @@  dpif_netlink_rtnl_create(const struct netdev_tunnel_config *tnl_cfg,
     nl_msg_end_nested(&request, infodata_off);
     nl_msg_end_nested(&request, linkinfo_off);
 
-    err = nl_transact(NETLINK_ROUTE, &request, NULL);
+    err = nl_transact(NULL, NETLINK_ROUTE, &request, NULL);
     if (!err && (type == OVS_VPORT_TYPE_GRE ||
                  type == OVS_VPORT_TYPE_IP6GRE)) {
         /* Work around a bug in kernel GRE driver, which ignores IFLA_MTU in
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 84e2bd8ea..3e4e03697 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -1348,7 +1348,7 @@  dpif_netlink_port_dump_start__(const struct dpif_netlink *dpif,
 
     buf = ofpbuf_new(1024);
     dpif_netlink_vport_to_ofpbuf(&request, buf);
-    nl_dump_start(dump, NETLINK_GENERIC, buf);
+    nl_dump_start(NULL, dump, NETLINK_GENERIC, buf);
     ofpbuf_delete(buf);
 }
 
@@ -1682,7 +1682,7 @@  dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse,
 
         buf = ofpbuf_new(1024);
         dpif_netlink_flow_to_ofpbuf(&request, buf);
-        nl_dump_start(&dump->nl_dump, NETLINK_GENERIC, buf);
+        nl_dump_start(NULL, &dump->nl_dump, NETLINK_GENERIC, buf);
         ofpbuf_delete(buf);
     }
     atomic_init(&dump->status, 0);
@@ -2141,7 +2141,7 @@  dpif_netlink_operate__(struct dpif_netlink *dpif,
     for (i = 0; i < n_ops; i++) {
         txnsp[i] = &auxes[i].txn;
     }
-    nl_transact_multiple(NETLINK_GENERIC, txnsp, n_ops);
+    nl_transact_multiple(NULL, NETLINK_GENERIC, txnsp, n_ops);
 
     for (i = 0; i < n_ops; i++) {
         struct op_auxdata *aux = &auxes[i];
@@ -3391,7 +3391,7 @@  dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED,
     }
     nl_msg_end_nested(request, opt_offset);
 
-    int err = nl_transact(NETLINK_GENERIC, request, NULL);
+    int err = nl_transact(NULL, NETLINK_GENERIC, request, NULL);
     ofpbuf_delete(request);
     return err;
 }
@@ -3476,7 +3476,7 @@  dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED,
     }
 
     struct ofpbuf *reply;
-    int err = nl_transact(NETLINK_GENERIC, request, &reply);
+    int err = nl_transact(NULL, NETLINK_GENERIC, request, &reply);
     if (err) {
         goto out;
     }
@@ -3520,7 +3520,7 @@  dpif_netlink_ct_del_limits(struct dpif *dpif OVS_UNUSED,
         nl_msg_end_nested(request, opt_offset);
     }
 
-    int err = nl_transact(NETLINK_GENERIC, request, NULL);
+    int err = nl_transact(NULL, NETLINK_GENERIC, request, NULL);
 
     ofpbuf_delete(request);
     return err;
@@ -4066,7 +4066,7 @@  dpif_netlink_meter_transact(struct ofpbuf *request, struct ofpbuf **replyp,
                             const struct nl_policy *reply_policy,
                             struct nlattr **a, size_t size_a)
 {
-    int error = nl_transact(NETLINK_GENERIC, request, replyp);
+    int error = nl_transact(NULL, NETLINK_GENERIC, request, replyp);
     ofpbuf_uninit(request);
 
     if (error) {
@@ -4818,7 +4818,7 @@  dpif_netlink_vport_transact(const struct dpif_netlink_vport *request,
 
     request_buf = ofpbuf_new(1024);
     dpif_netlink_vport_to_ofpbuf(request, request_buf);
-    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
+    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
     ofpbuf_delete(request_buf);
 
     if (reply) {
@@ -4969,7 +4969,7 @@  dpif_netlink_dp_dump_start(struct nl_dump *dump)
 
     buf = ofpbuf_new(1024);
     dpif_netlink_dp_to_ofpbuf(&request, buf);
-    nl_dump_start(dump, NETLINK_GENERIC, buf);
+    nl_dump_start(NULL, dump, NETLINK_GENERIC, buf);
     ofpbuf_delete(buf);
 }
 
@@ -4990,7 +4990,7 @@  dpif_netlink_dp_transact(const struct dpif_netlink_dp *request,
 
     request_buf = ofpbuf_new(1024);
     dpif_netlink_dp_to_ofpbuf(request, request_buf);
-    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
+    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
     ofpbuf_delete(request_buf);
 
     if (reply) {
@@ -5223,7 +5223,7 @@  dpif_netlink_flow_transact(struct dpif_netlink_flow *request,
 
     request_buf = ofpbuf_new(1024);
     dpif_netlink_flow_to_ofpbuf(request, request_buf);
-    error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
+    error = nl_transact(NULL, NETLINK_GENERIC, request_buf, bufp);
     ofpbuf_delete(request_buf);
 
     if (reply) {
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 0cd0850a3..2353ee1a2 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -3358,7 +3358,7 @@  start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
         return false;
     }
     tcmsg->tcm_parent = 0;
-    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
+    nl_dump_start(NULL, &state->dump, NETLINK_ROUTE, &request);
     ofpbuf_uninit(&request);
 
     ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
@@ -6722,7 +6722,7 @@  get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
                         RTM_GETLINK, NLM_F_REQUEST);
     ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
     nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
-    error = nl_transact(NETLINK_ROUTE, &request, &reply);
+    error = nl_transact(NULL, NETLINK_ROUTE, &request, &reply);
     ofpbuf_uninit(&request);
     if (error) {
         return error;
@@ -6858,7 +6858,7 @@  netdev_linux_update_via_netlink(struct netdev_linux *netdev)
 
     nl_msg_put_u32(&request, IFLA_EXT_MASK, RTEXT_FILTER_SKIP_STATS);
 
-    error = nl_transact(NETLINK_ROUTE, &request, &reply);
+    error = nl_transact(NULL, NETLINK_ROUTE, &request, &reply);
     ofpbuf_uninit(&request);
     if (error) {
         ofpbuf_delete(reply);
diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c
index 0b3a8adf5..79ca17ec9 100644
--- a/lib/netlink-conntrack.c
+++ b/lib/netlink-conntrack.c
@@ -144,7 +144,7 @@  nl_ct_dump_start(struct nl_ct_dump_state **statep, const uint16_t *zone,
     if (zone) {
         nl_msg_put_be16(&state->buf, CTA_ZONE, htons(*zone));
     }
-    nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf);
+    nl_dump_start(NULL, &state->dump, NETLINK_NETFILTER, &state->buf);
     ofpbuf_clear(&state->buf);
 
     /* Buckets to store connections are not used. */
@@ -236,7 +236,7 @@  nl_ct_flush(void)
     nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
                         IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
 
-    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
+    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
     ofpbuf_uninit(&buf);
 
     /* Expectations are flushed automatically, because they do not
@@ -260,7 +260,7 @@  nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple, uint16_t zone)
         err = EOPNOTSUPP;
         goto out;
     }
-    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
+    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
 out:
     ofpbuf_uninit(&buf);
     return err;
@@ -278,7 +278,7 @@  nl_ct_flush_zone_with_cta_zone(uint16_t flush_zone)
                         IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
     nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone));
 
-    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
+    err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
     ofpbuf_uninit(&buf);
 
     return err;
@@ -340,7 +340,7 @@  nl_ct_flush_zone(uint16_t flush_zone)
     nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
                         IPCTNL_MSG_CT_GET, NLM_F_REQUEST);
     nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone));
-    nl_dump_start(&dump, NETLINK_NETFILTER, &buf);
+    nl_dump_start(NULL, &dump, NETLINK_NETFILTER, &buf);
     ofpbuf_clear(&buf);
 
     for (;;) {
@@ -374,7 +374,7 @@  nl_ct_flush_zone(uint16_t flush_zone)
                           attrs[CTA_TUPLE_ORIG]->nla_len - NLA_HDRLEN);
         nl_msg_put_unspec(&delete, CTA_ID, attrs[CTA_ID] + 1,
                           attrs[CTA_ID]->nla_len - NLA_HDRLEN);
-        nl_transact(NETLINK_NETFILTER, &delete, NULL);
+        nl_transact(NULL, NETLINK_NETFILTER, &delete, NULL);
         ofpbuf_clear(&delete);
     }
 
@@ -1101,7 +1101,7 @@  nl_ct_set_timeout_policy(const struct nl_ct_timeout_policy *nl_tp)
     }
     nl_msg_end_nested(&buf, offset);
 
-    int err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
+    int err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
     ofpbuf_uninit(&buf);
     return err;
 }
@@ -1116,7 +1116,7 @@  nl_ct_get_timeout_policy(const char *tp_name,
     nl_msg_put_nfgenmsg(&request, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK_TIMEOUT,
                         IPCTNL_MSG_TIMEOUT_GET, NLM_F_REQUEST | NLM_F_ACK);
     nl_msg_put_string(&request, CTA_TIMEOUT_NAME, tp_name);
-    int err = nl_transact(NETLINK_NETFILTER, &request, &reply);
+    int err = nl_transact(NULL, NETLINK_NETFILTER, &request, &reply);
     if (err) {
         goto out;
     }
@@ -1139,7 +1139,7 @@  nl_ct_del_timeout_policy(const char *tp_name)
                         IPCTNL_MSG_TIMEOUT_DELETE, NLM_F_REQUEST | NLM_F_ACK);
 
     nl_msg_put_string(&buf, CTA_TIMEOUT_NAME, tp_name);
-    int err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
+    int err = nl_transact(NULL, NETLINK_NETFILTER, &buf, NULL);
     ofpbuf_uninit(&buf);
     return err;
 }
@@ -1162,7 +1162,7 @@  nl_ct_timeout_policy_dump_start(
                         IPCTNL_MSG_TIMEOUT_GET,
                         NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP);
 
-    nl_dump_start(&state->dump, NETLINK_NETFILTER, &request);
+    nl_dump_start(NULL, &state->dump, NETLINK_NETFILTER, &request);
     ofpbuf_uninit(&request);
     ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
     return 0;
diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
index 976ed15e8..801b4badd 100644
--- a/lib/netlink-socket.c
+++ b/lib/netlink-socket.c
@@ -17,6 +17,7 @@ 
 #include <config.h>
 #include "netlink-socket.h"
 #include <errno.h>
+#include <fcntl.h>
 #include <inttypes.h>
 #include <stdlib.h>
 #include <sys/socket.h>
@@ -25,6 +26,7 @@ 
 #include <unistd.h>
 #include "coverage.h"
 #include "openvswitch/dynamic-string.h"
+#include "openvswitch/shash.h"
 #include "hash.h"
 #include "openvswitch/hmap.h"
 #include "netlink.h"
@@ -94,6 +96,7 @@  struct nl_sock {
     uint32_t pid;
     int protocol;
     unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF). */
+    char *netns;
 };
 
 /* Compile-time limit on iovecs, so that we can allocate a maximum-size array
@@ -106,7 +109,7 @@  struct nl_sock {
  * Initialized by nl_sock_create(). */
 static int max_iovs;
 
-static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
+static int nl_pool_alloc(const char *, int protocol, struct nl_sock **sockp);
 static void nl_pool_release(struct nl_sock *);
 
 /* Creates a new netlink socket for the given netlink 'protocol'
@@ -145,6 +148,7 @@  nl_sock_create(int protocol, struct nl_sock **sockp)
 
     *sockp = NULL;
     sock = xmalloc(sizeof *sock);
+    sock->netns = NULL;
 
 #ifdef _WIN32
     sock->overlapped.hEvent = NULL;
@@ -294,6 +298,7 @@  nl_sock_destroy(struct nl_sock *sock)
 #else
         close(sock->fd);
 #endif
+        free(sock->netns);
         free(sock);
     }
 }
@@ -1152,6 +1157,9 @@  nl_sock_drain(struct nl_sock *sock)
  * Netlink socket created with the given 'protocol', and initializes 'dump' to
  * reflect the state of the operation.
  *
+ * The dump runs in the specified 'netns'. If 'netns' is NULL the current will
+ * be used.
+ *
  * 'request' must contain a Netlink message.  Before sending the message,
  * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
  * set to the Netlink socket's pid.  NLM_F_DUMP and NLM_F_ACK will be set in
@@ -1165,13 +1173,14 @@  nl_sock_drain(struct nl_sock *sock)
  * The caller must eventually destroy 'request'.
  */
 void
-nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
+nl_dump_start(const char *netns, struct nl_dump *dump, int protocol,
+              const struct ofpbuf *request)
 {
     nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
 
     ovs_mutex_init(&dump->mutex);
     ovs_mutex_lock(&dump->mutex);
-    dump->status = nl_pool_alloc(protocol, &dump->sock);
+    dump->status = nl_pool_alloc(netns, protocol, &dump->sock);
     if (!dump->status) {
         dump->status = nl_sock_send__(dump->sock, request,
                                       nl_sock_allocate_seq(dump->sock, 1),
@@ -1725,39 +1734,95 @@  struct nl_pool {
     int n;
 };
 
+struct nl_ns_pool {
+    struct nl_pool pools[MAX_LINKS];
+};
+
 static struct ovs_mutex pool_mutex = OVS_MUTEX_INITIALIZER;
-static struct nl_pool pools[MAX_LINKS] OVS_GUARDED_BY(pool_mutex);
+static struct shash pools OVS_GUARDED_BY(pool_mutex) =
+                                SHASH_INITIALIZER(&pools);
 
 static int
-nl_pool_alloc(int protocol, struct nl_sock **sockp)
+nl_sock_ns_create(const char *netns, int protocol, struct nl_sock **sockp) {
+    int ret, ns_fd, ns_default_fd, err;
+    if (netns) {
+        ns_default_fd = open("/proc/self/ns/net", O_RDONLY);
+        if (ns_default_fd < 0) {
+            VLOG_ERR("something wrong when opening self net fd, %d\n",
+                     errno);
+            return -errno;
+        }
+        char *netns_path = xasprintf("/var/run/netns/%s", netns);
+        ns_fd = open(netns_path, O_RDONLY);
+        free(netns_path);
+        if (ns_fd < 0) {
+            VLOG_ERR("something wrong when opening other net fd, %d\n",
+                     errno);
+            return -errno;
+        }
+        err = setns(ns_fd, CLONE_NEWNET);
+        if (err < 0) {
+            VLOG_ABORT("something wrong during setns to target, %d\n",
+                       errno);
+        }
+        close(ns_fd);
+    }
+    ret = nl_sock_create(protocol, sockp);
+    if (netns) {
+        err = setns(ns_default_fd, CLONE_NEWNET);
+        if (err < 0) {
+            VLOG_ABORT("something wrong during setns to home, %d\n",
+                       errno);
+        }
+        close(ns_default_fd);
+        if (*sockp) {
+            (*sockp)->netns = xstrdup(netns);
+        }
+    }
+    return ret;
+}
+
+static int
+nl_pool_alloc(const char *netns, int protocol, struct nl_sock **sockp)
 {
     struct nl_sock *sock = NULL;
     struct nl_pool *pool;
 
-    ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(pools));
+    ovs_assert(protocol >= 0 && protocol < MAX_LINKS);
 
     ovs_mutex_lock(&pool_mutex);
-    pool = &pools[protocol];
-    if (pool->n > 0) {
-        sock = pool->socks[--pool->n];
+    const char * netns_name = netns ? netns : "";
+
+    struct nl_ns_pool *ns_pools = shash_find_data(&pools, netns_name);
+    if (ns_pools) {
+        pool = &ns_pools->pools[protocol];
+        if (pool->n > 0) {
+            sock = pool->socks[--pool->n];
+        }
+
+        if (sock) {
+            ovs_mutex_unlock(&pool_mutex);
+            *sockp = sock;
+            return 0;
+        }
     }
     ovs_mutex_unlock(&pool_mutex);
-
-    if (sock) {
-        *sockp = sock;
-        return 0;
-    } else {
-        return nl_sock_create(protocol, sockp);
-    }
+    return nl_sock_ns_create(netns, protocol, sockp);
 }
 
 static void
 nl_pool_release(struct nl_sock *sock)
 {
     if (sock) {
-        struct nl_pool *pool = &pools[sock->protocol];
+        const char * netns_name = sock->netns ? sock->netns : "";
 
         ovs_mutex_lock(&pool_mutex);
+        struct nl_ns_pool *ns_pools = shash_find_data(&pools, netns_name);
+        if (!ns_pools) {
+            ns_pools = xzalloc(sizeof(*ns_pools));
+            shash_add(&pools, netns_name, ns_pools);
+        }
+        struct nl_pool *pool = &ns_pools->pools[sock->protocol];
         if (pool->n < ARRAY_SIZE(pool->socks)) {
             pool->socks[pool->n++] = sock;
             sock = NULL;
@@ -1772,6 +1837,9 @@  nl_pool_release(struct nl_sock *sock)
  * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.  If
  * successful, returns 0.  On failure, returns a positive errno value.
  *
+ * The 'request' is send in the specified netns. If 'netns' is NULL the current
+ * namespace is used.
+ *
  * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
  * reply, which the caller is responsible for freeing with ofpbuf_delete(), and
  * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the kernel's
@@ -1810,13 +1878,13 @@  nl_pool_release(struct nl_sock *sock)
  *         needs to be idempotent.
  */
 int
-nl_transact(int protocol, const struct ofpbuf *request,
+nl_transact(const char *netns, int protocol, const struct ofpbuf *request,
             struct ofpbuf **replyp)
 {
     struct nl_sock *sock;
     int error;
 
-    error = nl_pool_alloc(protocol, &sock);
+    error = nl_pool_alloc(netns, protocol, &sock);
     if (error) {
         if (replyp) {
             *replyp = NULL;
@@ -1851,13 +1919,13 @@  nl_transact(int protocol, const struct ofpbuf *request,
  * nl_transact() for some caveats.
  */
 void
-nl_transact_multiple(int protocol,
+nl_transact_multiple(const char *netns, int protocol,
                      struct nl_transaction **transactions, size_t n)
 {
     struct nl_sock *sock;
     int error;
 
-    error = nl_pool_alloc(protocol, &sock);
+    error = nl_pool_alloc(netns, protocol, &sock);
     if (!error) {
         nl_sock_transact_multiple(sock, transactions, n);
         nl_pool_release(sock);
diff --git a/lib/netlink-socket.h b/lib/netlink-socket.h
index 7852ad052..e4d645a62 100644
--- a/lib/netlink-socket.h
+++ b/lib/netlink-socket.h
@@ -251,9 +251,10 @@  struct nl_transaction {
 };
 
 /* Transactions without an allocated socket. */
-int nl_transact(int protocol, const struct ofpbuf *request,
+int nl_transact(const char *netns, int protocol, const struct ofpbuf *request,
                 struct ofpbuf **replyp);
-void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n);
+void nl_transact_multiple(const char *netns, int protocol,
+                          struct nl_transaction **, size_t n);
 
 /* Table dumping. */
 #define NL_DUMP_BUFSIZE         4096
@@ -270,7 +271,7 @@  struct nl_dump {
     struct ovs_mutex mutex;     /* Protects 'status', synchronizes recv(). */
 };
 
-void nl_dump_start(struct nl_dump *, int protocol,
+void nl_dump_start(const char *netns, struct nl_dump *dump, int protocol,
                    const struct ofpbuf *request);
 bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf);
 int nl_dump_done(struct nl_dump *);
diff --git a/lib/route-table.c b/lib/route-table.c
index c6cb21394..d5d98585e 100644
--- a/lib/route-table.c
+++ b/lib/route-table.c
@@ -156,7 +156,7 @@  route_table_wait(void)
 }
 
 static bool
-route_table_dump_one_table(unsigned char id)
+route_table_dump_one_table(const char *netns, unsigned char id)
 {
     uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
     struct ofpbuf request, reply, buf;
@@ -172,7 +172,7 @@  route_table_dump_one_table(unsigned char id)
     rq_msg->rtm_family = AF_UNSPEC;
     rq_msg->rtm_table = id;
 
-    nl_dump_start(&dump, NETLINK_ROUTE, &request);
+    nl_dump_start(netns, &dump, NETLINK_ROUTE, &request);
     ofpbuf_uninit(&request);
 
     ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
@@ -212,7 +212,7 @@  route_table_reset(void)
     COVERAGE_INC(route_table_dump);
 
     for (size_t i = 0; i < ARRAY_SIZE(tables); i++) {
-        if (!route_table_dump_one_table(tables[i])) {
+        if (!route_table_dump_one_table(NULL, tables[i])) {
             /* Got unfiltered reply, no need to dump further. */
             break;
         }
diff --git a/lib/tc.c b/lib/tc.c
index e55ba3b1b..56f4ca7a0 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -244,7 +244,7 @@  static void request_from_tcf_id(struct tcf_id *id, uint16_t eth_type,
 int
 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
 {
-    int error = nl_transact(NETLINK_ROUTE, request, replyp);
+    int error = nl_transact(NULL, NETLINK_ROUTE, request, replyp);
     ofpbuf_uninit(request);
     return error;
 }
@@ -2349,7 +2349,7 @@  tc_dump_flower_start(struct tcf_id *id, struct nl_dump *dump, bool terse)
         nl_msg_put_unspec(&request, TCA_DUMP_FLAGS, &dump_flags,
                           sizeof dump_flags);
     }
-    nl_dump_start(dump, NETLINK_ROUTE, &request);
+    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
     ofpbuf_uninit(&request);
 
     return 0;
@@ -2361,7 +2361,7 @@  tc_dump_tc_chain_start(struct tcf_id *id, struct nl_dump *dump)
     struct ofpbuf request;
 
     request_from_tcf_id(id, 0, RTM_GETCHAIN, NLM_F_DUMP, &request);
-    nl_dump_start(dump, NETLINK_ROUTE, &request);
+    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
     ofpbuf_uninit(&request);
 
     return 0;
@@ -2380,7 +2380,7 @@  tc_dump_tc_action_start(char *name, struct nl_dump *dump)
     nl_msg_end_nested(&request, offset);
     nl_msg_end_nested(&request, root_offset);
 
-    nl_dump_start(dump, NETLINK_ROUTE, &request);
+    nl_dump_start(NULL, dump, NETLINK_ROUTE, &request);
     ofpbuf_uninit(&request);
 
     return 0;