Message ID | 20191217222754.86810-2-yihung.wei@gmail.com |
---|---|
State | Superseded |
Headers | show |
Series | [ovs-dev,v7,1/2] netdev-linux: Detect numa node id. | expand |
On 17 Dec 2019, at 23:27, Yi-Hung Wei wrote: > Currently, the AF_XDP socket (XSK) related memory are allocated by > main > thread in the main thread's NUMA domain. With the patch that detects > netdev-linux's NUMA node id, the PMD thread of AF_XDP port will be run > on > the AF_XDP netdev's NUMA domain. If the net device's NUMA domain > is different from the main thread's NUMA domain, we will have two > cross-NUMA memory accesses (netdev <-> memory, memory <-> CPU). > > This patch addresses the aforementioned issue by allocating > the memory in the net device's NUMA domain. Did not try the patch yet, but it looks good. One quick comment below… > Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com> > --- > Documentation/intro/install/afxdp.rst | 2 +- > acinclude.m4 | 5 ++++- > include/sparse/automake.mk | 1 + > include/sparse/numa.h | 27 > +++++++++++++++++++++++++++ > lib/netdev-afxdp.c | 21 ++++++++++++++++++--- > 5 files changed, 51 insertions(+), 5 deletions(-) > create mode 100644 include/sparse/numa.h > > diff --git a/Documentation/intro/install/afxdp.rst > b/Documentation/intro/install/afxdp.rst > index 7b0736c96114..c4685fa7ebac 100644 > --- a/Documentation/intro/install/afxdp.rst > +++ b/Documentation/intro/install/afxdp.rst > @@ -164,7 +164,7 @@ If a test case fails, check the log at:: > > Setup AF_XDP netdev > ------------------- > -Before running OVS with AF_XDP, make sure the libbpf and libelf are > +Before running OVS with AF_XDP, make sure the libbpf, libelf, and > libnuma are > set-up right:: > > ldd vswitchd/ovs-vswitchd > diff --git a/acinclude.m4 b/acinclude.m4 > index 542637ac8cb8..73ed11d701aa 100644 > --- a/acinclude.m4 > +++ b/acinclude.m4 > @@ -286,9 +286,12 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ > AC_CHECK_FUNCS([pthread_spin_lock], [], > [AC_MSG_ERROR([unable to find pthread_spin_lock for AF_XDP > support])]) > > + AC_CHECK_LIB(numa, numa_alloc_onnode, [], > + [AC_MSG_ERROR([unable to find libnuma for AF_XDP support])]) > + > AC_DEFINE([HAVE_AF_XDP], [1], > [Define to 1 if AF_XDP support is available and > enabled.]) > - LIBBPF_LDADD=" -lbpf -lelf" > + LIBBPF_LDADD=" -lbpf -lelf -lnuma" > AC_SUBST([LIBBPF_LDADD]) > > AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [ > diff --git a/include/sparse/automake.mk b/include/sparse/automake.mk > index 073631e8c082..974ad3fe55f7 100644 > --- a/include/sparse/automake.mk > +++ b/include/sparse/automake.mk > @@ -5,6 +5,7 @@ noinst_HEADERS += \ > include/sparse/bits/floatn.h \ > include/sparse/assert.h \ > include/sparse/math.h \ > + include/sparse/numa.h \ > include/sparse/netinet/in.h \ > include/sparse/netinet/ip6.h \ > include/sparse/netpacket/packet.h \ > diff --git a/include/sparse/numa.h b/include/sparse/numa.h > new file mode 100644 > index 000000000000..3691a0eaf729 > --- /dev/null > +++ b/include/sparse/numa.h > @@ -0,0 +1,27 @@ > +/* > + * Copyright (c) 2019 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. > + */ > + > +#ifndef __CHECKER__ > +#error "Use this header only with sparse. It is not a correct > implementation." > +#endif > + > +/* Avoid sparse warning: non-ANSI function declaration of function" > */ > +#define numa_get_membind_compat() numa_get_membind_compat(void) > +#define numa_get_interleave_mask_compat() > numa_get_interleave_mask_compat(void) > +#define numa_get_run_node_mask_compat() > numa_get_run_node_mask_compat(void) > + > +/* Get actual <numa.h> definitions for us to annotate and build on. > */ > +#include_next<numa.h> > diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c > index 4c1f9c68270a..2d6f739b4b67 100644 > --- a/lib/netdev-afxdp.c > +++ b/lib/netdev-afxdp.c > @@ -26,6 +26,7 @@ > #include <linux/rtnetlink.h> > #include <linux/if_xdp.h> > #include <net/if.h> > +#include <numa.h> > #include <poll.h> > #include <stdlib.h> > #include <sys/resource.h> > @@ -469,7 +470,7 @@ xsk_configure_all(struct netdev *netdev) > { > struct netdev_linux *dev = netdev_linux_cast(netdev); > int i, ifindex, n_rxq, n_txq; > - int qid = 0; > + int qid = 0, err = 0; > > ifindex = linux_get_ifindex(netdev_get_name(netdev)); > > @@ -477,6 +478,14 @@ xsk_configure_all(struct netdev *netdev) > ovs_assert(dev->tx_locks == NULL); > > n_rxq = netdev_n_rxq(netdev); > + > + /* Allocate all the xsk related memory in the netdev's NUMA > domain. */ > + struct bitmask *old_bm = numa_get_membind(); > + struct bitmask *new_bm = numa_allocate_nodemask(); > + netdev_get_numa_id(netdev); > + numa_bitmask_setbit(new_bm, dev->numa_id); Rather than access the device specific structure, why not use the return value? numa_bitmask_setbit(new_bm, netdev_get_numa_id(netdev)); > + numa_set_membind(new_bm); > + > dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks); > > if (dev->xdp_mode == OVS_AF_XDP_MODE_BEST_EFFORT) { > @@ -518,11 +527,17 @@ xsk_configure_all(struct netdev *netdev) > ovs_spin_init(&dev->tx_locks[i]); > } > > - return 0; > + goto out; > > err: > xsk_destroy_all(netdev); > - return EINVAL; > + err = EINVAL; > + > +out: > + numa_set_membind(old_bm); > + numa_bitmask_free(old_bm); > + numa_bitmask_free(new_bm); > + return err; > } > > static void > -- > 2.17.1
On 17.12.2019 23:27, Yi-Hung Wei wrote: > Currently, the AF_XDP socket (XSK) related memory are allocated by main > thread in the main thread's NUMA domain. With the patch that detects > netdev-linux's NUMA node id, the PMD thread of AF_XDP port will be run on > the AF_XDP netdev's NUMA domain. If the net device's NUMA domain > is different from the main thread's NUMA domain, we will have two > cross-NUMA memory accesses (netdev <-> memory, memory <-> CPU). > > This patch addresses the aforementioned issue by allocating > the memory in the net device's NUMA domain. > > Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com> > --- > Documentation/intro/install/afxdp.rst | 2 +- > acinclude.m4 | 5 ++++- > include/sparse/automake.mk | 1 + > include/sparse/numa.h | 27 +++++++++++++++++++++++++++ > lib/netdev-afxdp.c | 21 ++++++++++++++++++--- > 5 files changed, 51 insertions(+), 5 deletions(-) > create mode 100644 include/sparse/numa.h > > diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst > index 7b0736c96114..c4685fa7ebac 100644 > --- a/Documentation/intro/install/afxdp.rst > +++ b/Documentation/intro/install/afxdp.rst > @@ -164,7 +164,7 @@ If a test case fails, check the log at:: > > Setup AF_XDP netdev > ------------------- > -Before running OVS with AF_XDP, make sure the libbpf and libelf are > +Before running OVS with AF_XDP, make sure the libbpf, libelf, and libnuma are > set-up right:: > > ldd vswitchd/ovs-vswitchd > diff --git a/acinclude.m4 b/acinclude.m4 > index 542637ac8cb8..73ed11d701aa 100644 > --- a/acinclude.m4 > +++ b/acinclude.m4 > @@ -286,9 +286,12 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ > AC_CHECK_FUNCS([pthread_spin_lock], [], > [AC_MSG_ERROR([unable to find pthread_spin_lock for AF_XDP support])]) > > + AC_CHECK_LIB(numa, numa_alloc_onnode, [], Please, embrace all arguments with brackets. And you may actually use OVS_FIND_DEPENDENCY() instead. > + [AC_MSG_ERROR([unable to find libnuma for AF_XDP support])]) > + > AC_DEFINE([HAVE_AF_XDP], [1], > [Define to 1 if AF_XDP support is available and enabled.]) > - LIBBPF_LDADD=" -lbpf -lelf" > + LIBBPF_LDADD=" -lbpf -lelf -lnuma" AC_CHECK_LIB should add this library automatically to LIBS. This change should not be needed. > AC_SUBST([LIBBPF_LDADD]) > > AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [ > diff --git a/include/sparse/automake.mk b/include/sparse/automake.mk > index 073631e8c082..974ad3fe55f7 100644 > --- a/include/sparse/automake.mk > +++ b/include/sparse/automake.mk > @@ -5,6 +5,7 @@ noinst_HEADERS += \ > include/sparse/bits/floatn.h \ > include/sparse/assert.h \ > include/sparse/math.h \ > + include/sparse/numa.h \ > include/sparse/netinet/in.h \ > include/sparse/netinet/ip6.h \ > include/sparse/netpacket/packet.h \ > diff --git a/include/sparse/numa.h b/include/sparse/numa.h > new file mode 100644 > index 000000000000..3691a0eaf729 > --- /dev/null > +++ b/include/sparse/numa.h > @@ -0,0 +1,27 @@ > +/* > + * Copyright (c) 2019 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef __CHECKER__ > +#error "Use this header only with sparse. It is not a correct implementation." > +#endif > + > +/* Avoid sparse warning: non-ANSI function declaration of function" */ > +#define numa_get_membind_compat() numa_get_membind_compat(void) > +#define numa_get_interleave_mask_compat() numa_get_interleave_mask_compat(void) > +#define numa_get_run_node_mask_compat() numa_get_run_node_mask_compat(void) > + > +/* Get actual <numa.h> definitions for us to annotate and build on. */ > +#include_next<numa.h> > diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c > index 4c1f9c68270a..2d6f739b4b67 100644 > --- a/lib/netdev-afxdp.c > +++ b/lib/netdev-afxdp.c > @@ -26,6 +26,7 @@ > #include <linux/rtnetlink.h> > #include <linux/if_xdp.h> > #include <net/if.h> > +#include <numa.h> > #include <poll.h> > #include <stdlib.h> > #include <sys/resource.h> > @@ -469,7 +470,7 @@ xsk_configure_all(struct netdev *netdev) > { > struct netdev_linux *dev = netdev_linux_cast(netdev); > int i, ifindex, n_rxq, n_txq; > - int qid = 0; > + int qid = 0, err = 0; > > ifindex = linux_get_ifindex(netdev_get_name(netdev)); > > @@ -477,6 +478,14 @@ xsk_configure_all(struct netdev *netdev) > ovs_assert(dev->tx_locks == NULL); > > n_rxq = netdev_n_rxq(netdev); > + > + /* Allocate all the xsk related memory in the netdev's NUMA domain. */ > + struct bitmask *old_bm = numa_get_membind(); > + struct bitmask *new_bm = numa_allocate_nodemask(); > + netdev_get_numa_id(netdev); Does it work? We're holding dev->mutex here and netdev_get_numa_id() will try to take it too and should fail. Am I missing something? > + numa_bitmask_setbit(new_bm, dev->numa_id); As Eelco said, it's better to use the actual result of get_numa_id() instead of relying on cache population. BTW, it might be good to handle NETDEV_NUMA_UNSPEC for that case even if we can't have it right now. > + numa_set_membind(new_bm); It might be better to use 'preferred' policy instead to avoid assertion if we're out of memory on a certain node. > + > dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks); > > if (dev->xdp_mode == OVS_AF_XDP_MODE_BEST_EFFORT) { > @@ -518,11 +527,17 @@ xsk_configure_all(struct netdev *netdev) > ovs_spin_init(&dev->tx_locks[i]); > } > > - return 0; > + goto out; > > err: > xsk_destroy_all(netdev); > - return EINVAL; > + err = EINVAL; > + > +out: > + numa_set_membind(old_bm); > + numa_bitmask_free(old_bm); > + numa_bitmask_free(new_bm); > + return err; > } > > static void >
On Wed, Dec 18, 2019 at 8:41 AM Ilya Maximets <i.maximets@ovn.org> wrote: > > On 17.12.2019 23:27, Yi-Hung Wei wrote: > > Currently, the AF_XDP socket (XSK) related memory are allocated by main > > thread in the main thread's NUMA domain. With the patch that detects > > netdev-linux's NUMA node id, the PMD thread of AF_XDP port will be run on > > the AF_XDP netdev's NUMA domain. If the net device's NUMA domain > > is different from the main thread's NUMA domain, we will have two > > cross-NUMA memory accesses (netdev <-> memory, memory <-> CPU). > > > > This patch addresses the aforementioned issue by allocating > > the memory in the net device's NUMA domain. > > > > Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com> > > --- > > diff --git a/acinclude.m4 b/acinclude.m4 > > index 542637ac8cb8..73ed11d701aa 100644 > > --- a/acinclude.m4 > > +++ b/acinclude.m4 > > @@ -286,9 +286,12 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ > > AC_CHECK_FUNCS([pthread_spin_lock], [], > > [AC_MSG_ERROR([unable to find pthread_spin_lock for AF_XDP support])]) > > > > + AC_CHECK_LIB(numa, numa_alloc_onnode, [], > > Please, embrace all arguments with brackets. > And you may actually use OVS_FIND_DEPENDENCY() instead. Thanks Ilya for review. Yes, it is much cleaner to use OVS_FIND_DEPENDENCY(). > > diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c > > index 4c1f9c68270a..2d6f739b4b67 100644 > > --- a/lib/netdev-afxdp.c > > +++ b/lib/netdev-afxdp.c > > @@ -477,6 +478,14 @@ xsk_configure_all(struct netdev *netdev) > > ovs_assert(dev->tx_locks == NULL); > > > > n_rxq = netdev_n_rxq(netdev); > > + > > + /* Allocate all the xsk related memory in the netdev's NUMA domain. */ > > + struct bitmask *old_bm = numa_get_membind(); > > + struct bitmask *new_bm = numa_allocate_nodemask(); > > + netdev_get_numa_id(netdev); > > Does it work? > We're holding dev->mutex here and netdev_get_numa_id() will try to > take it too and should fail. Am I missing something? You're right. I moved the numa related logic from xsk_configure_all() to netdev_afxdp_reconfigure() to avoid the locking issue in the next revision. Thanks! > > + numa_bitmask_setbit(new_bm, dev->numa_id); > > As Eelco said, it's better to use the actual result of get_numa_id() > instead of relying on cache population. > BTW, it might be good to handle NETDEV_NUMA_UNSPEC for that case even > if we can't have it right now. Sounds good. > > + numa_set_membind(new_bm); > > It might be better to use 'preferred' policy instead to avoid > assertion if we're out of memory on a certain node. Sure, I will use numa_set_preferred() in the next version. Thanks, -Yi-Hung
diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst index 7b0736c96114..c4685fa7ebac 100644 --- a/Documentation/intro/install/afxdp.rst +++ b/Documentation/intro/install/afxdp.rst @@ -164,7 +164,7 @@ If a test case fails, check the log at:: Setup AF_XDP netdev ------------------- -Before running OVS with AF_XDP, make sure the libbpf and libelf are +Before running OVS with AF_XDP, make sure the libbpf, libelf, and libnuma are set-up right:: ldd vswitchd/ovs-vswitchd diff --git a/acinclude.m4 b/acinclude.m4 index 542637ac8cb8..73ed11d701aa 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -286,9 +286,12 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ AC_CHECK_FUNCS([pthread_spin_lock], [], [AC_MSG_ERROR([unable to find pthread_spin_lock for AF_XDP support])]) + AC_CHECK_LIB(numa, numa_alloc_onnode, [], + [AC_MSG_ERROR([unable to find libnuma for AF_XDP support])]) + AC_DEFINE([HAVE_AF_XDP], [1], [Define to 1 if AF_XDP support is available and enabled.]) - LIBBPF_LDADD=" -lbpf -lelf" + LIBBPF_LDADD=" -lbpf -lelf -lnuma" AC_SUBST([LIBBPF_LDADD]) AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [ diff --git a/include/sparse/automake.mk b/include/sparse/automake.mk index 073631e8c082..974ad3fe55f7 100644 --- a/include/sparse/automake.mk +++ b/include/sparse/automake.mk @@ -5,6 +5,7 @@ noinst_HEADERS += \ include/sparse/bits/floatn.h \ include/sparse/assert.h \ include/sparse/math.h \ + include/sparse/numa.h \ include/sparse/netinet/in.h \ include/sparse/netinet/ip6.h \ include/sparse/netpacket/packet.h \ diff --git a/include/sparse/numa.h b/include/sparse/numa.h new file mode 100644 index 000000000000..3691a0eaf729 --- /dev/null +++ b/include/sparse/numa.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2019 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CHECKER__ +#error "Use this header only with sparse. It is not a correct implementation." +#endif + +/* Avoid sparse warning: non-ANSI function declaration of function" */ +#define numa_get_membind_compat() numa_get_membind_compat(void) +#define numa_get_interleave_mask_compat() numa_get_interleave_mask_compat(void) +#define numa_get_run_node_mask_compat() numa_get_run_node_mask_compat(void) + +/* Get actual <numa.h> definitions for us to annotate and build on. */ +#include_next<numa.h> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c index 4c1f9c68270a..2d6f739b4b67 100644 --- a/lib/netdev-afxdp.c +++ b/lib/netdev-afxdp.c @@ -26,6 +26,7 @@ #include <linux/rtnetlink.h> #include <linux/if_xdp.h> #include <net/if.h> +#include <numa.h> #include <poll.h> #include <stdlib.h> #include <sys/resource.h> @@ -469,7 +470,7 @@ xsk_configure_all(struct netdev *netdev) { struct netdev_linux *dev = netdev_linux_cast(netdev); int i, ifindex, n_rxq, n_txq; - int qid = 0; + int qid = 0, err = 0; ifindex = linux_get_ifindex(netdev_get_name(netdev)); @@ -477,6 +478,14 @@ xsk_configure_all(struct netdev *netdev) ovs_assert(dev->tx_locks == NULL); n_rxq = netdev_n_rxq(netdev); + + /* Allocate all the xsk related memory in the netdev's NUMA domain. */ + struct bitmask *old_bm = numa_get_membind(); + struct bitmask *new_bm = numa_allocate_nodemask(); + netdev_get_numa_id(netdev); + numa_bitmask_setbit(new_bm, dev->numa_id); + numa_set_membind(new_bm); + dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks); if (dev->xdp_mode == OVS_AF_XDP_MODE_BEST_EFFORT) { @@ -518,11 +527,17 @@ xsk_configure_all(struct netdev *netdev) ovs_spin_init(&dev->tx_locks[i]); } - return 0; + goto out; err: xsk_destroy_all(netdev); - return EINVAL; + err = EINVAL; + +out: + numa_set_membind(old_bm); + numa_bitmask_free(old_bm); + numa_bitmask_free(new_bm); + return err; } static void
Currently, the AF_XDP socket (XSK) related memory are allocated by main thread in the main thread's NUMA domain. With the patch that detects netdev-linux's NUMA node id, the PMD thread of AF_XDP port will be run on the AF_XDP netdev's NUMA domain. If the net device's NUMA domain is different from the main thread's NUMA domain, we will have two cross-NUMA memory accesses (netdev <-> memory, memory <-> CPU). This patch addresses the aforementioned issue by allocating the memory in the net device's NUMA domain. Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com> --- Documentation/intro/install/afxdp.rst | 2 +- acinclude.m4 | 5 ++++- include/sparse/automake.mk | 1 + include/sparse/numa.h | 27 +++++++++++++++++++++++++++ lib/netdev-afxdp.c | 21 ++++++++++++++++++--- 5 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 include/sparse/numa.h