
[bpf-next,v4,1/2] libbpf: add support for using AF_XDP sockets

Message ID: 1549631126-29067-2-git-send-email-magnus.karlsson@intel.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: libbpf: adding AF_XDP support

Commit Message

Magnus Karlsson Feb. 8, 2019, 1:05 p.m. UTC
This commit adds AF_XDP support to libbpf. The main reason for this is
to facilitate writing applications that use AF_XDP by offering
higher-level APIs that hide many of the details of the AF_XDP
uapi. This is in the same vein as libbpf facilitates XDP adoption by
offering easy-to-use higher level interfaces of XDP
functionality. Hopefully this will facilitate adoption of AF_XDP, make
applications using it simpler and smaller, and finally also make it
possible for applications to benefit from optimizations in the AF_XDP
user space access code. Previously, people just copied and pasted the
code from the sample application into their application, which is not
desirable.

The interface is composed of two parts:

* Low-level access interface to the four rings and the packet
* High-level control plane interface for creating and setting
  up umems and af_xdp sockets as well as a simple XDP program.

Tested-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 tools/include/uapi/linux/if_xdp.h |  78 ++++
 tools/lib/bpf/Build               |   2 +-
 tools/lib/bpf/Makefile            |   5 +-
 tools/lib/bpf/README.rst          |  11 +-
 tools/lib/bpf/libbpf.map          |   7 +
 tools/lib/bpf/xsk.c               | 742 ++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/xsk.h               | 205 +++++++++++
 7 files changed, 1047 insertions(+), 3 deletions(-)
 create mode 100644 tools/include/uapi/linux/if_xdp.h
 create mode 100644 tools/lib/bpf/xsk.c
 create mode 100644 tools/lib/bpf/xsk.h
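
For orientation, here is a minimal control-plane sketch that uses only the symbols
this patch exports in libbpf.map (xsk_umem__create(), xsk_socket__create(),
xsk_umem__delete(), xsk_socket__delete()). The interface name, queue id, frame
count and include path are illustrative assumptions, not values mandated by the
patch:

	#include <stdlib.h>
	#include <unistd.h>
	#include "xsk.h" /* tools/lib/bpf/xsk.h as added by this patch */

	#define NUM_FRAMES 4096 /* illustrative */

	int main(void)
	{
		struct xsk_ring_prod fill, tx;
		struct xsk_ring_cons comp, rx;
		struct xsk_socket *xsk;
		struct xsk_umem *umem;
		size_t size;
		void *bufs;
		int err;

		/* The umem area must be page aligned. */
		size = NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
		if (posix_memalign(&bufs, getpagesize(), size))
			return 1;

		/* NULL config: the library fills in the default ring sizes. */
		err = xsk_umem__create(&umem, bufs, size, &fill, &comp, NULL);
		if (err)
			return 1;

		/* Binds to queue 0 of "eth0" and, per this patch, also loads
		 * the simple built-in XDP program.
		 */
		err = xsk_socket__create(&xsk, "eth0", 0, umem, &rx, &tx, NULL);
		if (err)
			return 1;

		/* ... fill/rx/tx ring processing would go here ... */

		xsk_socket__delete(xsk);
		xsk_umem__delete(umem);
		free(bufs);
		return 0;
	}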

Comments

Daniel Borkmann Feb. 15, 2019, 4:37 p.m. UTC | #1
On 02/08/2019 02:05 PM, Magnus Karlsson wrote:
> This commit adds AF_XDP support to libbpf. The main reason for this is
> to facilitate writing applications that use AF_XDP by offering
> higher-level APIs that hide many of the details of the AF_XDP
> uapi. This is in the same vein as libbpf facilitates XDP adoption by
> offering easy-to-use higher level interfaces of XDP
> functionality. Hopefully this will facilitate adoption of AF_XDP, make
> applications using it simpler and smaller, and finally also make it
> possible for applications to benefit from optimizations in the AF_XDP
> user space access code. Previously, people just copied and pasted the
> code from the sample application into their application, which is not
> desirable.
> 
> The interface is composed of two parts:
> 
> * Low-level access interface to the four rings and the packet
> * High-level control plane interface for creating and setting
>   up umems and af_xdp sockets as well as a simple XDP program.
> 
> Tested-by: Björn Töpel <bjorn.topel@intel.com>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
[...]
> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
> new file mode 100644
> index 0000000..a982a76
> --- /dev/null
> +++ b/tools/lib/bpf/xsk.c
> @@ -0,0 +1,742 @@
> +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> +
> +/*
> + * AF_XDP user-space access library.
> + *
> + * Copyright(c) 2018 - 2019 Intel Corporation.
> + *
> + * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
> + */
> +
> +#include <errno.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <arpa/inet.h>
> +#include <asm/barrier.h>
> +#include <linux/compiler.h>
> +#include <linux/filter.h>
> +#include <linux/if_ether.h>
> +#include <linux/if_link.h>
> +#include <linux/if_packet.h>
> +#include <linux/if_xdp.h>
> +#include <linux/rtnetlink.h>
> +#include <net/if.h>
> +#include <sys/mman.h>
> +#include <sys/socket.h>
> +#include <sys/types.h>
> +
> +#include "bpf.h"
> +#include "libbpf.h"
> +#include "libbpf_util.h"
> +#include "nlattr.h"
> +#include "xsk.h"
> +
> +#ifndef SOL_XDP
> + #define SOL_XDP 283
> +#endif
> +
> +#ifndef AF_XDP
> + #define AF_XDP 44
> +#endif
> +
> +#ifndef PF_XDP
> + #define PF_XDP AF_XDP
> +#endif
> +
> +struct xsk_umem {
> +	struct xsk_ring_prod *fill;
> +	struct xsk_ring_cons *comp;
> +	char *umem_area;
> +	struct xsk_umem_config config;
> +	int fd;
> +	int refcount;
> +};
> +
> +struct xsk_socket {
> +	struct xsk_ring_cons *rx;
> +	struct xsk_ring_prod *tx;
> +	__u64 outstanding_tx;
> +	struct xsk_umem *umem;
> +	struct xsk_socket_config config;
> +	int fd;
> +	int xsks_map;
> +	int ifindex;
> +	int prog_fd;
> +	int qidconf_map_fd;
> +	int xsks_map_fd;
> +	__u32 queue_id;
> +};
> +
> +struct xsk_nl_info {
> +	bool xdp_prog_attached;
> +	int ifindex;
> +	int fd;
> +};
> +
> +#define MAX_QUEUES 128

Why is this a fixed constant here, shouldn't this be dynamic due to being NIC
specific anyway?

[...]
> +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
> +{
> +	return &((char *)(umem->umem_area))[addr];
> +}

There's also a xsk_umem__get_data_raw() doing the same. Why having both, resp.
when to choose which? ;)

> +int xsk_umem__fd(const struct xsk_umem *umem)
> +{
> +	return umem ? umem->fd : -EINVAL;
> +}
> +
> +int xsk_socket__fd(const struct xsk_socket *xsk)
> +{
> +	return xsk ? xsk->fd : -EINVAL;
> +}
> +
> +static bool xsk_page_aligned(void *buffer)
> +{
> +	unsigned long addr = (unsigned long)buffer;
> +
> +	return !(addr & (getpagesize() - 1));
> +}
> +
> +static void xsk_set_umem_config(struct xsk_umem_config *cfg,
> +				const struct xsk_umem_config *usr_cfg)
> +{
> +	if (!usr_cfg) {
> +		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> +		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> +		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
> +		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
> +		return;
> +	}
> +
> +	cfg->fill_size = usr_cfg->fill_size;
> +	cfg->comp_size = usr_cfg->comp_size;
> +	cfg->frame_size = usr_cfg->frame_size;
> +	cfg->frame_headroom = usr_cfg->frame_headroom;

Just optional nit, might be a bit nicer to have it in this form:

	cfg->fill_size = usr_cfg ? usr_cfg->fill_size :
		         XSK_RING_PROD__DEFAULT_NUM_DESCS;


> +}
> +
> +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
> +				      const struct xsk_socket_config *usr_cfg)
> +{
> +	if (!usr_cfg) {
> +		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> +		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> +		cfg->libbpf_flags = 0;
> +		cfg->xdp_flags = 0;
> +		cfg->bind_flags = 0;
> +		return;
> +	}
> +
> +	cfg->rx_size = usr_cfg->rx_size;
> +	cfg->tx_size = usr_cfg->tx_size;
> +	cfg->libbpf_flags = usr_cfg->libbpf_flags;
> +	cfg->xdp_flags = usr_cfg->xdp_flags;
> +	cfg->bind_flags = usr_cfg->bind_flags;

(Ditto)

> +}
> +
> +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
> +		     struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
> +		     const struct xsk_umem_config *usr_config)
> +{
> +	struct xdp_mmap_offsets off;
> +	struct xdp_umem_reg mr;
> +	struct xsk_umem *umem;
> +	socklen_t optlen;
> +	void *map;
> +	int err;
> +
> +	if (!umem_area || !umem_ptr || !fill || !comp)
> +		return -EFAULT;
> +	if (!size && !xsk_page_aligned(umem_area))
> +		return -EINVAL;
> +
> +	umem = calloc(1, sizeof(*umem));
> +	if (!umem)
> +		return -ENOMEM;
> +
> +	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
> +	if (umem->fd < 0) {
> +		err = -errno;
> +		goto out_umem_alloc;
> +	}
> +
> +	umem->umem_area = umem_area;
> +	xsk_set_umem_config(&umem->config, usr_config);
> +
> +	mr.addr = (uintptr_t)umem_area;
> +	mr.len = size;
> +	mr.chunk_size = umem->config.frame_size;
> +	mr.headroom = umem->config.frame_headroom;
> +
> +	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
> +	if (err) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
> +			 &umem->config.fill_size,
> +			 sizeof(umem->config.fill_size));
> +	if (err) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
> +			 &umem->config.comp_size,
> +			 sizeof(umem->config.comp_size));
> +	if (err) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +
> +	optlen = sizeof(off);
> +	err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
> +	if (err) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +
> +	map = xsk_mmap(NULL, off.fr.desc +
> +		       umem->config.fill_size * sizeof(__u64),
> +		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> +		       umem->fd, XDP_UMEM_PGOFF_FILL_RING);
> +	if (map == MAP_FAILED) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +
> +	umem->fill = fill;
> +	fill->mask = umem->config.fill_size - 1;
> +	fill->size = umem->config.fill_size;
> +	fill->producer = map + off.fr.producer;
> +	fill->consumer = map + off.fr.consumer;
> +	fill->ring = map + off.fr.desc;
> +	fill->cached_cons = umem->config.fill_size;
> +
> +	map = xsk_mmap(NULL,
> +		       off.cr.desc + umem->config.comp_size * sizeof(__u64),
> +		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> +		       umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
> +	if (map == MAP_FAILED) {
> +		err = -errno;
> +		goto out_mmap;
> +	}
> +
> +	umem->comp = comp;
> +	comp->mask = umem->config.comp_size - 1;
> +	comp->size = umem->config.comp_size;
> +	comp->producer = map + off.cr.producer;
> +	comp->consumer = map + off.cr.consumer;
> +	comp->ring = map + off.cr.desc;
> +
> +	*umem_ptr = umem;
> +	return 0;
> +
> +out_mmap:
> +	munmap(umem->fill,
> +	       off.fr.desc + umem->config.fill_size * sizeof(__u64));
> +out_socket:
> +	close(umem->fd);
> +out_umem_alloc:
> +	free(umem);
> +	return err;
> +}
> +
> +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
> +{
> +	struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
> +	struct xsk_nl_info *nl_info = cookie;
> +	struct ifinfomsg *ifinfo = msg;
> +	unsigned char mode;
> +	int err;
> +
> +	if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
> +		return 0;
> +
> +	if (!tb[IFLA_XDP])
> +		return 0;
> +
> +	err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
> +				      NULL);
> +	if (err)
> +		return err;
> +
> +	if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
> +		return 0;
> +
> +	mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
> +	if (mode == XDP_ATTACHED_NONE)
> +		return 0;
> +
> +	nl_info->xdp_prog_attached = true;
> +	nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);

Hm, I don't think this works if I read the intention of this helper correctly.

IFLA_XDP_FD is never set for retrieving the prog from the kernel. So the
above is a bug.

We also have bpf_get_link_xdp_id(). This should probably just be reused in
this context here.
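
For reference, its prototype in tools/lib/bpf/libbpf.h is:

	int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags);

Note that it reports the attached program's id, not an fd; if an fd is needed, it
can be obtained from the id with bpf_prog_get_fd_by_id().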

> +	return 0;
> +}
> +
> +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
> +{
> +	struct xsk_nl_info nl_info;
> +	unsigned int nl_pid;
> +	char err_buf[256];
> +	int sock, err;
> +
> +	sock = libbpf_netlink_open(&nl_pid);
> +	if (sock < 0)
> +		return false;
> +
> +	nl_info.xdp_prog_attached = false;
> +	nl_info.ifindex = xsk->ifindex;
> +	nl_info.fd = -1;
> +
> +	err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
> +	if (err) {
> +		libbpf_strerror(err, err_buf, sizeof(err_buf));
> +		pr_warning("Error:\n%s\n", err_buf);
> +		close(sock);
> +		return false;
> +	}
> +
> +	close(sock);
> +	xsk->prog_fd = nl_info.fd;
> +	return nl_info.xdp_prog_attached;
> +}

(See bpf_get_link_xdp_id().)

> +
> +static int xsk_load_xdp_prog(struct xsk_socket *xsk)
> +{
> +	char bpf_log_buf[BPF_LOG_BUF_SIZE];
> +	int err, prog_fd;
> +
> +	/* This is the C-program:
> +	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
> +	 * {
> +	 *     int *qidconf, index = ctx->rx_queue_index;
[...]
> +
> +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
> +		       __u32 queue_id, struct xsk_umem *umem,
> +		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
> +		       const struct xsk_socket_config *usr_config)
> +{
> +	struct sockaddr_xdp sxdp = {};
> +	struct xdp_mmap_offsets off;
> +	struct xsk_socket *xsk;
> +	socklen_t optlen;
> +	void *map;
> +	int err;
> +
> +	if (!umem || !xsk_ptr || !rx || !tx)
> +		return -EFAULT;
> +
> +	if (umem->refcount) {
> +		pr_warning("Error: shared umems not supported by libbpf.\n");
> +		return -EBUSY;
> +	}
> +
> +	xsk = calloc(1, sizeof(*xsk));
> +	if (!xsk)
> +		return -ENOMEM;
> +
> +	if (umem->refcount++ > 0) {

Should this refcount rather be atomic actually?

> +		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
> +		if (xsk->fd < 0) {
> +			err = -errno;
> +			goto out_xsk_alloc;
> +		}
> +	} else {
> +		xsk->fd = umem->fd;
> +	}
> +
> +	xsk->outstanding_tx = 0;
> +	xsk->queue_id = queue_id;
> +	xsk->umem = umem;
> +	xsk->ifindex = if_nametoindex(ifname);
> +	if (!xsk->ifindex) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +
> +	xsk_set_xdp_socket_config(&xsk->config, usr_config);
[...]
Magnus Karlsson Feb. 18, 2019, 8:59 a.m. UTC | #2
On Fri, Feb 15, 2019 at 6:09 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 02/08/2019 02:05 PM, Magnus Karlsson wrote:
> > This commit adds AF_XDP support to libbpf. The main reason for this is
> > to facilitate writing applications that use AF_XDP by offering
> > higher-level APIs that hide many of the details of the AF_XDP
> > uapi. This is in the same vein as libbpf facilitates XDP adoption by
> > offering easy-to-use higher level interfaces of XDP
> > functionality. Hopefully this will facilitate adoption of AF_XDP, make
> > applications using it simpler and smaller, and finally also make it
> > possible for applications to benefit from optimizations in the AF_XDP
> > user space access code. Previously, people just copied and pasted the
> > code from the sample application into their application, which is not
> > desirable.
> >
> > The interface is composed of two parts:
> >
> > * Low-level access interface to the four rings and the packet
> > * High-level control plane interface for creating and setting
> >   up umems and af_xdp sockets as well as a simple XDP program.
> >
> > Tested-by: Björn Töpel <bjorn.topel@intel.com>
> > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> [...]
> > diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
> > new file mode 100644
> > index 0000000..a982a76
> > --- /dev/null
> > +++ b/tools/lib/bpf/xsk.c
> > @@ -0,0 +1,742 @@
> > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> > +
> > +/*
> > + * AF_XDP user-space access library.
> > + *
> > + * Copyright(c) 2018 - 2019 Intel Corporation.
> > + *
> > + * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
> > + */
> > +
> > +#include <errno.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +#include <unistd.h>
> > +#include <arpa/inet.h>
> > +#include <asm/barrier.h>
> > +#include <linux/compiler.h>
> > +#include <linux/filter.h>
> > +#include <linux/if_ether.h>
> > +#include <linux/if_link.h>
> > +#include <linux/if_packet.h>
> > +#include <linux/if_xdp.h>
> > +#include <linux/rtnetlink.h>
> > +#include <net/if.h>
> > +#include <sys/mman.h>
> > +#include <sys/socket.h>
> > +#include <sys/types.h>
> > +
> > +#include "bpf.h"
> > +#include "libbpf.h"
> > +#include "libbpf_util.h"
> > +#include "nlattr.h"
> > +#include "xsk.h"
> > +
> > +#ifndef SOL_XDP
> > + #define SOL_XDP 283
> > +#endif
> > +
> > +#ifndef AF_XDP
> > + #define AF_XDP 44
> > +#endif
> > +
> > +#ifndef PF_XDP
> > + #define PF_XDP AF_XDP
> > +#endif
> > +
> > +struct xsk_umem {
> > +     struct xsk_ring_prod *fill;
> > +     struct xsk_ring_cons *comp;
> > +     char *umem_area;
> > +     struct xsk_umem_config config;
> > +     int fd;
> > +     int refcount;
> > +};
> > +
> > +struct xsk_socket {
> > +     struct xsk_ring_cons *rx;
> > +     struct xsk_ring_prod *tx;
> > +     __u64 outstanding_tx;
> > +     struct xsk_umem *umem;
> > +     struct xsk_socket_config config;
> > +     int fd;
> > +     int xsks_map;
> > +     int ifindex;
> > +     int prog_fd;
> > +     int qidconf_map_fd;
> > +     int xsks_map_fd;
> > +     __u32 queue_id;
> > +};
> > +
> > +struct xsk_nl_info {
> > +     bool xdp_prog_attached;
> > +     int ifindex;
> > +     int fd;
> > +};
> > +
> > +#define MAX_QUEUES 128
>
> Why is this a fixed constant here, shouldn't this be dynamic due to being NIC
> specific anyway?

It was only here for simplicity. If a NIC had more queues, it would
require a recompile of the lib. Obviously, not desirable in a distro.
What I could do is to read the max "combined" queues (pre-set maximum
in the ethtool output) from the same interface as ethtool uses and size
the array after that. Or is there a simpler way? What to do if the NIC
does not have a "combined", or is there no such NIC (seems the common
HW ones set this)?
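
For what it's worth, a sketch of reading that pre-set maximum via the same
ETHTOOL_GCHANNELS ioctl that ethtool uses could look like the following (the
helper name and the fall-back value are just placeholders):

	#include <errno.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>
	#include <net/if.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>

	/* Returns the pre-set maximum number of combined channels for
	 * ifname, or 1 if the driver does not report any.
	 */
	static int xsk_get_max_queues(const char *ifname)
	{
		struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
		struct ifreq ifr = {};
		int fd, err;

		fd = socket(AF_INET, SOCK_DGRAM, 0);
		if (fd < 0)
			return -errno;

		ifr.ifr_data = (void *)&channels;
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		err = ioctl(fd, SIOCETHTOOL, &ifr);
		if (err && errno != EOPNOTSUPP) {
			err = -errno;
			close(fd);
			return err;
		}
		close(fd);

		/* Drivers without "combined" channels report 0 here. */
		return channels.max_combined ? (int)channels.max_combined : 1;
	}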

> [...]
> > +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
> > +{
> > +     return &((char *)(umem->umem_area))[addr];
> > +}
>
> There's also a xsk_umem__get_data_raw() doing the same. Why having both, resp.
> when to choose which? ;)

It is enough to have the xsk_umem__get_data_raw() function.
xsk_umem__get_data() is just a convenience function so that the
application does not have to store the beginning of the umem. But as
the application always has to provide this anyway in the
xsk_umem__create() function, it might as well store this pointer. I
will delete xsk_umem__get_data() and rename xsk_umem__get_data_raw()
to xsk_umem__get_data().

> > +int xsk_umem__fd(const struct xsk_umem *umem)
> > +{
> > +     return umem ? umem->fd : -EINVAL;
> > +}
> > +
> > +int xsk_socket__fd(const struct xsk_socket *xsk)
> > +{
> > +     return xsk ? xsk->fd : -EINVAL;
> > +}
> > +
> > +static bool xsk_page_aligned(void *buffer)
> > +{
> > +     unsigned long addr = (unsigned long)buffer;
> > +
> > +     return !(addr & (getpagesize() - 1));
> > +}
> > +
> > +static void xsk_set_umem_config(struct xsk_umem_config *cfg,
> > +                             const struct xsk_umem_config *usr_cfg)
> > +{
> > +     if (!usr_cfg) {
> > +             cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> > +             cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> > +             cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
> > +             cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
> > +             return;
> > +     }
> > +
> > +     cfg->fill_size = usr_cfg->fill_size;
> > +     cfg->comp_size = usr_cfg->comp_size;
> > +     cfg->frame_size = usr_cfg->frame_size;
> > +     cfg->frame_headroom = usr_cfg->frame_headroom;
>
> Just optional nit, might be a bit nicer to have it in this form:
>
>         cfg->fill_size = usr_cfg ? usr_cfg->fill_size :
>                          XSK_RING_PROD__DEFAULT_NUM_DESCS;

I actually think the current form is clearer when there are multiple
lines. If there was only one line, I would agree with you.

> > +}
> > +
> > +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
> > +                                   const struct xsk_socket_config *usr_cfg)
> > +{
> > +     if (!usr_cfg) {
> > +             cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> > +             cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> > +             cfg->libbpf_flags = 0;
> > +             cfg->xdp_flags = 0;
> > +             cfg->bind_flags = 0;
> > +             return;
> > +     }
> > +
> > +     cfg->rx_size = usr_cfg->rx_size;
> > +     cfg->tx_size = usr_cfg->tx_size;
> > +     cfg->libbpf_flags = usr_cfg->libbpf_flags;
> > +     cfg->xdp_flags = usr_cfg->xdp_flags;
> > +     cfg->bind_flags = usr_cfg->bind_flags;
>
> (Ditto)
>
> > +}
> > +
> > +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
> > +                  struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
> > +                  const struct xsk_umem_config *usr_config)
> > +{
> > +     struct xdp_mmap_offsets off;
> > +     struct xdp_umem_reg mr;
> > +     struct xsk_umem *umem;
> > +     socklen_t optlen;
> > +     void *map;
> > +     int err;
> > +
> > +     if (!umem_area || !umem_ptr || !fill || !comp)
> > +             return -EFAULT;
> > +     if (!size && !xsk_page_aligned(umem_area))
> > +             return -EINVAL;
> > +
> > +     umem = calloc(1, sizeof(*umem));
> > +     if (!umem)
> > +             return -ENOMEM;
> > +
> > +     umem->fd = socket(AF_XDP, SOCK_RAW, 0);
> > +     if (umem->fd < 0) {
> > +             err = -errno;
> > +             goto out_umem_alloc;
> > +     }
> > +
> > +     umem->umem_area = umem_area;
> > +     xsk_set_umem_config(&umem->config, usr_config);
> > +
> > +     mr.addr = (uintptr_t)umem_area;
> > +     mr.len = size;
> > +     mr.chunk_size = umem->config.frame_size;
> > +     mr.headroom = umem->config.frame_headroom;
> > +
> > +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
> > +     if (err) {
> > +             err = -errno;
> > +             goto out_socket;
> > +     }
> > +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
> > +                      &umem->config.fill_size,
> > +                      sizeof(umem->config.fill_size));
> > +     if (err) {
> > +             err = -errno;
> > +             goto out_socket;
> > +     }
> > +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
> > +                      &umem->config.comp_size,
> > +                      sizeof(umem->config.comp_size));
> > +     if (err) {
> > +             err = -errno;
> > +             goto out_socket;
> > +     }
> > +
> > +     optlen = sizeof(off);
> > +     err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
> > +     if (err) {
> > +             err = -errno;
> > +             goto out_socket;
> > +     }
> > +
> > +     map = xsk_mmap(NULL, off.fr.desc +
> > +                    umem->config.fill_size * sizeof(__u64),
> > +                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> > +                    umem->fd, XDP_UMEM_PGOFF_FILL_RING);
> > +     if (map == MAP_FAILED) {
> > +             err = -errno;
> > +             goto out_socket;
> > +     }
> > +
> > +     umem->fill = fill;
> > +     fill->mask = umem->config.fill_size - 1;
> > +     fill->size = umem->config.fill_size;
> > +     fill->producer = map + off.fr.producer;
> > +     fill->consumer = map + off.fr.consumer;
> > +     fill->ring = map + off.fr.desc;
> > +     fill->cached_cons = umem->config.fill_size;
> > +
> > +     map = xsk_mmap(NULL,
> > +                    off.cr.desc + umem->config.comp_size * sizeof(__u64),
> > +                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> > +                    umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
> > +     if (map == MAP_FAILED) {
> > +             err = -errno;
> > +             goto out_mmap;
> > +     }
> > +
> > +     umem->comp = comp;
> > +     comp->mask = umem->config.comp_size - 1;
> > +     comp->size = umem->config.comp_size;
> > +     comp->producer = map + off.cr.producer;
> > +     comp->consumer = map + off.cr.consumer;
> > +     comp->ring = map + off.cr.desc;
> > +
> > +     *umem_ptr = umem;
> > +     return 0;
> > +
> > +out_mmap:
> > +     munmap(umem->fill,
> > +            off.fr.desc + umem->config.fill_size * sizeof(__u64));
> > +out_socket:
> > +     close(umem->fd);
> > +out_umem_alloc:
> > +     free(umem);
> > +     return err;
> > +}
> > +
> > +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
> > +{
> > +     struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
> > +     struct xsk_nl_info *nl_info = cookie;
> > +     struct ifinfomsg *ifinfo = msg;
> > +     unsigned char mode;
> > +     int err;
> > +
> > +     if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
> > +             return 0;
> > +
> > +     if (!tb[IFLA_XDP])
> > +             return 0;
> > +
> > +     err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
> > +                                   NULL);
> > +     if (err)
> > +             return err;
> > +
> > +     if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
> > +             return 0;
> > +
> > +     mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
> > +     if (mode == XDP_ATTACHED_NONE)
> > +             return 0;
> > +
> > +     nl_info->xdp_prog_attached = true;
> > +     nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);
>
> Hm, I don't think this works if I read the intention of this helper correctly.
>
> IFLA_XDP_FD is never set for retrieving the prog from the kernel. So the
> above is a bug.
>
> We also have bpf_get_link_xdp_id(). This should probably just be reused in
> this context here.

If bpf_get_link_xdp_id() will fit my bill, I will happily use it. I
will check it out and hopefully I can drop all this code. Thanks.

> > +     return 0;
> > +}
> > +
> > +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
> > +{
> > +     struct xsk_nl_info nl_info;
> > +     unsigned int nl_pid;
> > +     char err_buf[256];
> > +     int sock, err;
> > +
> > +     sock = libbpf_netlink_open(&nl_pid);
> > +     if (sock < 0)
> > +             return false;
> > +
> > +     nl_info.xdp_prog_attached = false;
> > +     nl_info.ifindex = xsk->ifindex;
> > +     nl_info.fd = -1;
> > +
> > +     err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
> > +     if (err) {
> > +             libbpf_strerror(err, err_buf, sizeof(err_buf));
> > +             pr_warning("Error:\n%s\n", err_buf);
> > +             close(sock);
> > +             return false;
> > +     }
> > +
> > +     close(sock);
> > +     xsk->prog_fd = nl_info.fd;
> > +     return nl_info.xdp_prog_attached;
> > +}
>
> (See bpf_get_link_xdp_id().)
>
> > +
> > +static int xsk_load_xdp_prog(struct xsk_socket *xsk)
> > +{
> > +     char bpf_log_buf[BPF_LOG_BUF_SIZE];
> > +     int err, prog_fd;
> > +
> > +     /* This is the C-program:
> > +      * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
> > +      * {
> > +      *     int *qidconf, index = ctx->rx_queue_index;
> [...]
> > +
> > +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
> > +                    __u32 queue_id, struct xsk_umem *umem,
> > +                    struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
> > +                    const struct xsk_socket_config *usr_config)
> > +{
> > +     struct sockaddr_xdp sxdp = {};
> > +     struct xdp_mmap_offsets off;
> > +     struct xsk_socket *xsk;
> > +     socklen_t optlen;
> > +     void *map;
> > +     int err;
> > +
> > +     if (!umem || !xsk_ptr || !rx || !tx)
> > +             return -EFAULT;
> > +
> > +     if (umem->refcount) {
> > +             pr_warning("Error: shared umems not supported by libbpf.\n");
> > +             return -EBUSY;
> > +     }
> > +
> > +     xsk = calloc(1, sizeof(*xsk));
> > +     if (!xsk)
> > +             return -ENOMEM;
> > +
> > +     if (umem->refcount++ > 0) {
>
> Should this refcount rather be atomic actually?

Neither our config nor data plane interfaces are reentrant for
performance reasons. Any concurrency has to be handled explicitly on
the application level. This is so that it only penalizes apps that really
need this.
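
So an application that, for example, creates sockets from several threads is
expected to do its own serialization around the control plane calls. Purely as
an illustration (the helper name is made up):

	#include <pthread.h>
	#include "xsk.h"

	static pthread_mutex_t xsk_ctrl_lock = PTHREAD_MUTEX_INITIALIZER;

	static int create_socket_serialized(struct xsk_socket **xsk,
					    const char *ifname, __u32 queue_id,
					    struct xsk_umem *umem,
					    struct xsk_ring_cons *rx,
					    struct xsk_ring_prod *tx)
	{
		int err;

		/* xsk_socket__create() is not reentrant, so take an
		 * application-level lock around it.
		 */
		pthread_mutex_lock(&xsk_ctrl_lock);
		err = xsk_socket__create(xsk, ifname, queue_id, umem, rx, tx,
					 NULL);
		pthread_mutex_unlock(&xsk_ctrl_lock);
		return err;
	}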

Thanks for all your reviews: Magnus

> > +             xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
> > +             if (xsk->fd < 0) {
> > +                     err = -errno;
> > +                     goto out_xsk_alloc;
> > +             }
> > +     } else {
> > +             xsk->fd = umem->fd;
> > +     }
> > +
> > +     xsk->outstanding_tx = 0;
> > +     xsk->queue_id = queue_id;
> > +     xsk->umem = umem;
> > +     xsk->ifindex = if_nametoindex(ifname);
> > +     if (!xsk->ifindex) {
> > +             err = -errno;
> > +             goto out_socket;
> > +     }
> > +
> > +     xsk_set_xdp_socket_config(&xsk->config, usr_config);
> [...]
Maciej Fijalkowski Feb. 18, 2019, 11:21 a.m. UTC | #3
On Mon, 18 Feb 2019 09:59:30 +0100
Magnus Karlsson <magnus.karlsson@gmail.com> wrote:

> On Fri, Feb 15, 2019 at 6:09 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >
> > On 02/08/2019 02:05 PM, Magnus Karlsson wrote:  
> > > This commit adds AF_XDP support to libbpf. The main reason for this is
> > > to facilitate writing applications that use AF_XDP by offering
> > > higher-level APIs that hide many of the details of the AF_XDP
> > > uapi. This is in the same vein as libbpf facilitates XDP adoption by
> > > offering easy-to-use higher level interfaces of XDP
> > > functionality. Hopefully this will facilitate adoption of AF_XDP, make
> > > applications using it simpler and smaller, and finally also make it
> > > possible for applications to benefit from optimizations in the AF_XDP
> > > user space access code. Previously, people just copied and pasted the
> > > code from the sample application into their application, which is not
> > > desirable.
> > >
> > > The interface is composed of two parts:
> > >
> > > * Low-level access interface to the four rings and the packet
> > > * High-level control plane interface for creating and setting
> > >   up umems and af_xdp sockets as well as a simple XDP program.
> > >
> > > Tested-by: Björn Töpel <bjorn.topel@intel.com>
> > > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>  
> > [...]  
> > > diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
> > > new file mode 100644
> > > index 0000000..a982a76
> > > --- /dev/null
> > > +++ b/tools/lib/bpf/xsk.c
> > > @@ -0,0 +1,742 @@
> > > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> > > +
> > > +/*
> > > + * AF_XDP user-space access library.
> > > + *
> > > + * Copyright(c) 2018 - 2019 Intel Corporation.
> > > + *
> > > + * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
> > > + */
> > > +
> > > +#include <errno.h>
> > > +#include <stdlib.h>
> > > +#include <string.h>
> > > +#include <unistd.h>
> > > +#include <arpa/inet.h>
> > > +#include <asm/barrier.h>
> > > +#include <linux/compiler.h>
> > > +#include <linux/filter.h>
> > > +#include <linux/if_ether.h>
> > > +#include <linux/if_link.h>
> > > +#include <linux/if_packet.h>
> > > +#include <linux/if_xdp.h>
> > > +#include <linux/rtnetlink.h>
> > > +#include <net/if.h>
> > > +#include <sys/mman.h>
> > > +#include <sys/socket.h>
> > > +#include <sys/types.h>
> > > +
> > > +#include "bpf.h"
> > > +#include "libbpf.h"
> > > +#include "libbpf_util.h"
> > > +#include "nlattr.h"
> > > +#include "xsk.h"
> > > +
> > > +#ifndef SOL_XDP
> > > + #define SOL_XDP 283
> > > +#endif
> > > +
> > > +#ifndef AF_XDP
> > > + #define AF_XDP 44
> > > +#endif
> > > +
> > > +#ifndef PF_XDP
> > > + #define PF_XDP AF_XDP
> > > +#endif
> > > +
> > > +struct xsk_umem {
> > > +     struct xsk_ring_prod *fill;
> > > +     struct xsk_ring_cons *comp;
> > > +     char *umem_area;
> > > +     struct xsk_umem_config config;
> > > +     int fd;
> > > +     int refcount;
> > > +};
> > > +
> > > +struct xsk_socket {
> > > +     struct xsk_ring_cons *rx;
> > > +     struct xsk_ring_prod *tx;
> > > +     __u64 outstanding_tx;
> > > +     struct xsk_umem *umem;
> > > +     struct xsk_socket_config config;
> > > +     int fd;
> > > +     int xsks_map;
> > > +     int ifindex;
> > > +     int prog_fd;
> > > +     int qidconf_map_fd;
> > > +     int xsks_map_fd;
> > > +     __u32 queue_id;
> > > +};
> > > +
> > > +struct xsk_nl_info {
> > > +     bool xdp_prog_attached;
> > > +     int ifindex;
> > > +     int fd;
> > > +};
> > > +
> > > +#define MAX_QUEUES 128  
> >
> > Why is this a fixed constant here, shouldn't this be dynamic due to being NIC
> > specific anyway?  
> 
> It was only here for simplicity. If a NIC had more queues, it would
> require a recompile of the lib. Obviously, not desirable in a distro.
> What I could do is to read the max "combined" queues (pre-set maximum
> in the ethtool output) from the same interface as ethtool uses and size
> the array after that. Or is there a simpler way? What to do if the NIC
> does not have a "combined", or is there no such NIC (seems the common
> HW ones set this)?
> 
> > [...]  
> > > +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
> > > +{
> > > +     return &((char *)(umem->umem_area))[addr];
> > > +}  
> >
> > There's also a xsk_umem__get_data_raw() doing the same. Why having both, resp.
> > when to choose which? ;)  
> 
> It is enough to have the xsk_umem__get_data_raw() function.
> xsk_umem__get_data() is just a convenience function so that the
> application does not have to store the beginning of the umem. But as
> the application always has to provide this anyway in the
> xsk_umem__create() function, it might as well store this pointer. I
> will delete xsk_umem__get_data() and rename xsk_umem__get_data_raw()
> to xsk_umem__get_data().
> 
> > > +int xsk_umem__fd(const struct xsk_umem *umem)
> > > +{
> > > +     return umem ? umem->fd : -EINVAL;
> > > +}
> > > +
> > > +int xsk_socket__fd(const struct xsk_socket *xsk)
> > > +{
> > > +     return xsk ? xsk->fd : -EINVAL;
> > > +}
> > > +
> > > +static bool xsk_page_aligned(void *buffer)
> > > +{
> > > +     unsigned long addr = (unsigned long)buffer;
> > > +
> > > +     return !(addr & (getpagesize() - 1));
> > > +}
> > > +
> > > +static void xsk_set_umem_config(struct xsk_umem_config *cfg,
> > > +                             const struct xsk_umem_config *usr_cfg)
> > > +{
> > > +     if (!usr_cfg) {
> > > +             cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> > > +             cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> > > +             cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
> > > +             cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
> > > +             return;
> > > +     }
> > > +
> > > +     cfg->fill_size = usr_cfg->fill_size;
> > > +     cfg->comp_size = usr_cfg->comp_size;
> > > +     cfg->frame_size = usr_cfg->frame_size;
> > > +     cfg->frame_headroom = usr_cfg->frame_headroom;  
> >
> > Just optional nit, might be a bit nicer to have it in this form:
> >
> >         cfg->fill_size = usr_cfg ? usr_cfg->fill_size :
> >                          XSK_RING_PROD__DEFAULT_NUM_DESCS;  
> 
> I actually think the current form is clearer when there are multiple
> lines. If there was only one line, I would agree with you.
> 
> > > +}
> > > +
> > > +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
> > > +                                   const struct xsk_socket_config *usr_cfg)
> > > +{
> > > +     if (!usr_cfg) {
> > > +             cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> > > +             cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> > > +             cfg->libbpf_flags = 0;
> > > +             cfg->xdp_flags = 0;
> > > +             cfg->bind_flags = 0;
> > > +             return;
> > > +     }
> > > +
> > > +     cfg->rx_size = usr_cfg->rx_size;
> > > +     cfg->tx_size = usr_cfg->tx_size;
> > > +     cfg->libbpf_flags = usr_cfg->libbpf_flags;
> > > +     cfg->xdp_flags = usr_cfg->xdp_flags;
> > > +     cfg->bind_flags = usr_cfg->bind_flags;  
> >
> > (Ditto)
> >  
> > > +}
> > > +
> > > +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
> > > +                  struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
> > > +                  const struct xsk_umem_config *usr_config)
> > > +{
> > > +     struct xdp_mmap_offsets off;
> > > +     struct xdp_umem_reg mr;
> > > +     struct xsk_umem *umem;
> > > +     socklen_t optlen;
> > > +     void *map;
> > > +     int err;
> > > +
> > > +     if (!umem_area || !umem_ptr || !fill || !comp)
> > > +             return -EFAULT;
> > > +     if (!size && !xsk_page_aligned(umem_area))
> > > +             return -EINVAL;
> > > +
> > > +     umem = calloc(1, sizeof(*umem));
> > > +     if (!umem)
> > > +             return -ENOMEM;
> > > +
> > > +     umem->fd = socket(AF_XDP, SOCK_RAW, 0);
> > > +     if (umem->fd < 0) {
> > > +             err = -errno;
> > > +             goto out_umem_alloc;
> > > +     }
> > > +
> > > +     umem->umem_area = umem_area;
> > > +     xsk_set_umem_config(&umem->config, usr_config);
> > > +
> > > +     mr.addr = (uintptr_t)umem_area;
> > > +     mr.len = size;
> > > +     mr.chunk_size = umem->config.frame_size;
> > > +     mr.headroom = umem->config.frame_headroom;
> > > +
> > > +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
> > > +     if (err) {
> > > +             err = -errno;
> > > +             goto out_socket;
> > > +     }
> > > +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
> > > +                      &umem->config.fill_size,
> > > +                      sizeof(umem->config.fill_size));
> > > +     if (err) {
> > > +             err = -errno;
> > > +             goto out_socket;
> > > +     }
> > > +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
> > > +                      &umem->config.comp_size,
> > > +                      sizeof(umem->config.comp_size));
> > > +     if (err) {
> > > +             err = -errno;
> > > +             goto out_socket;
> > > +     }
> > > +
> > > +     optlen = sizeof(off);
> > > +     err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
> > > +     if (err) {
> > > +             err = -errno;
> > > +             goto out_socket;
> > > +     }
> > > +
> > > +     map = xsk_mmap(NULL, off.fr.desc +
> > > +                    umem->config.fill_size * sizeof(__u64),
> > > +                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> > > +                    umem->fd, XDP_UMEM_PGOFF_FILL_RING);
> > > +     if (map == MAP_FAILED) {
> > > +             err = -errno;
> > > +             goto out_socket;
> > > +     }
> > > +
> > > +     umem->fill = fill;
> > > +     fill->mask = umem->config.fill_size - 1;
> > > +     fill->size = umem->config.fill_size;
> > > +     fill->producer = map + off.fr.producer;
> > > +     fill->consumer = map + off.fr.consumer;
> > > +     fill->ring = map + off.fr.desc;
> > > +     fill->cached_cons = umem->config.fill_size;
> > > +
> > > +     map = xsk_mmap(NULL,
> > > +                    off.cr.desc + umem->config.comp_size * sizeof(__u64),
> > > +                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> > > +                    umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
> > > +     if (map == MAP_FAILED) {
> > > +             err = -errno;
> > > +             goto out_mmap;
> > > +     }
> > > +
> > > +     umem->comp = comp;
> > > +     comp->mask = umem->config.comp_size - 1;
> > > +     comp->size = umem->config.comp_size;
> > > +     comp->producer = map + off.cr.producer;
> > > +     comp->consumer = map + off.cr.consumer;
> > > +     comp->ring = map + off.cr.desc;
> > > +
> > > +     *umem_ptr = umem;
> > > +     return 0;
> > > +
> > > +out_mmap:
> > > +     munmap(umem->fill,
> > > +            off.fr.desc + umem->config.fill_size * sizeof(__u64));
> > > +out_socket:
> > > +     close(umem->fd);
> > > +out_umem_alloc:
> > > +     free(umem);
> > > +     return err;
> > > +}
> > > +
> > > +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
> > > +{
> > > +     struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
> > > +     struct xsk_nl_info *nl_info = cookie;
> > > +     struct ifinfomsg *ifinfo = msg;
> > > +     unsigned char mode;
> > > +     int err;
> > > +
> > > +     if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
> > > +             return 0;
> > > +
> > > +     if (!tb[IFLA_XDP])
> > > +             return 0;
> > > +
> > > +     err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
> > > +                                   NULL);
> > > +     if (err)
> > > +             return err;
> > > +
> > > +     if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
> > > +             return 0;
> > > +
> > > +     mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
> > > +     if (mode == XDP_ATTACHED_NONE)
> > > +             return 0;
> > > +
> > > +     nl_info->xdp_prog_attached = true;
> > > +     nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);  
> >
> > Hm, I don't think this works if I read the intention of this helper correctly.
> >
> > IFLA_XDP_FD is never set for retrieving the prog from the kernel. So the
> > above is a bug.
> >
> > We also have bpf_get_link_xdp_id(). This should probably just be reused in
> > this context here.  
> 
> If bpf_get_link_xdp_id() will fit my bill, I will happily use it. I
> will check it out and hopefully I can drop all this code. Thanks.
>
I see that all you need to know is whether there's already an XDP program attached
to the xsk socket's related interface, no?
If so, then within xsk_setup_xdp_prog() you could do something like:

	__u32 prog_id = 0;

	bpf_get_link_xdp_id(xsk->ifindex, &prog_id, xsk->config.xdp_flags);
	if (!prog_id) {
		// create maps
		// load xdp prog
	} else {
		// prog_id is an id, not an fd
		xsk->prog_fd = bpf_prog_get_fd_by_id(prog_id);
	}

	xsk_update_bpf_maps(xsk, true, xsk->fd);

If that's ok then xsk_xdp_prog_attached and xsk_parse_nl could be dropped.

> > > +     return 0;
> > > +}
> > > +
> > > +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
> > > +{
> > > +     struct xsk_nl_info nl_info;
> > > +     unsigned int nl_pid;
> > > +     char err_buf[256];
> > > +     int sock, err;
> > > +
> > > +     sock = libbpf_netlink_open(&nl_pid);
> > > +     if (sock < 0)
> > > +             return false;
> > > +
> > > +     nl_info.xdp_prog_attached = false;
> > > +     nl_info.ifindex = xsk->ifindex;
> > > +     nl_info.fd = -1;
> > > +
> > > +     err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
> > > +     if (err) {
> > > +             libbpf_strerror(err, err_buf, sizeof(err_buf));
> > > +             pr_warning("Error:\n%s\n", err_buf);
> > > +             close(sock);
> > > +             return false;
> > > +     }
> > > +
> > > +     close(sock);
> > > +     xsk->prog_fd = nl_info.fd;
> > > +     return nl_info.xdp_prog_attached;
> > > +}  
> >
> > (See bpf_get_link_xdp_id().)
> >  
> > > +
> > > +static int xsk_load_xdp_prog(struct xsk_socket *xsk)
> > > +{
> > > +     char bpf_log_buf[BPF_LOG_BUF_SIZE];
> > > +     int err, prog_fd;
> > > +
> > > +     /* This is the C-program:
> > > +      * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
> > > +      * {
> > > +      *     int *qidconf, index = ctx->rx_queue_index;  
> > [...]  
> > > +
> > > +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
> > > +                    __u32 queue_id, struct xsk_umem *umem,
> > > +                    struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
> > > +                    const struct xsk_socket_config *usr_config)
> > > +{
> > > +     struct sockaddr_xdp sxdp = {};
> > > +     struct xdp_mmap_offsets off;
> > > +     struct xsk_socket *xsk;
> > > +     socklen_t optlen;
> > > +     void *map;
> > > +     int err;
> > > +
> > > +     if (!umem || !xsk_ptr || !rx || !tx)
> > > +             return -EFAULT;
> > > +
> > > +     if (umem->refcount) {
> > > +             pr_warning("Error: shared umems not supported by libbpf.\n");
> > > +             return -EBUSY;
> > > +     }
> > > +
> > > +     xsk = calloc(1, sizeof(*xsk));
> > > +     if (!xsk)
> > > +             return -ENOMEM;
> > > +
> > > +     if (umem->refcount++ > 0) {  
> >
> > Should this refcount rather be atomic actually?  
> 
> Neither our config nor data plane interfaces are reentrant for
> performance reasons. Any concurrency has to be handled explicitly on
> the application level. This is so that it only penalizes apps that really
> need this.
> 
> Thanks for all your reviews: Magnus
> 
> > > +             xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
> > > +             if (xsk->fd < 0) {
> > > +                     err = -errno;
> > > +                     goto out_xsk_alloc;
> > > +             }
> > > +     } else {
> > > +             xsk->fd = umem->fd;
> > > +     }
> > > +
> > > +     xsk->outstanding_tx = 0;
> > > +     xsk->queue_id = queue_id;
> > > +     xsk->umem = umem;
> > > +     xsk->ifindex = if_nametoindex(ifname);
> > > +     if (!xsk->ifindex) {
> > > +             err = -errno;
> > > +             goto out_socket;
> > > +     }
> > > +
> > > +     xsk_set_xdp_socket_config(&xsk->config, usr_config);  
> > [...]

Patch

diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
new file mode 100644
index 0000000..caed8b1
--- /dev/null
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -0,0 +1,78 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * if_xdp: XDP socket user-space interface
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef _LINUX_IF_XDP_H
+#define _LINUX_IF_XDP_H
+
+#include <linux/types.h>
+
+/* Options for the sxdp_flags field */
+#define XDP_SHARED_UMEM	(1 << 0)
+#define XDP_COPY	(1 << 1) /* Force copy-mode */
+#define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
+
+struct sockaddr_xdp {
+	__u16 sxdp_family;
+	__u16 sxdp_flags;
+	__u32 sxdp_ifindex;
+	__u32 sxdp_queue_id;
+	__u32 sxdp_shared_umem_fd;
+};
+
+struct xdp_ring_offset {
+	__u64 producer;
+	__u64 consumer;
+	__u64 desc;
+};
+
+struct xdp_mmap_offsets {
+	struct xdp_ring_offset rx;
+	struct xdp_ring_offset tx;
+	struct xdp_ring_offset fr; /* Fill */
+	struct xdp_ring_offset cr; /* Completion */
+};
+
+/* XDP socket options */
+#define XDP_MMAP_OFFSETS		1
+#define XDP_RX_RING			2
+#define XDP_TX_RING			3
+#define XDP_UMEM_REG			4
+#define XDP_UMEM_FILL_RING		5
+#define XDP_UMEM_COMPLETION_RING	6
+#define XDP_STATISTICS			7
+
+struct xdp_umem_reg {
+	__u64 addr; /* Start of packet data area */
+	__u64 len; /* Length of packet data area */
+	__u32 chunk_size;
+	__u32 headroom;
+};
+
+struct xdp_statistics {
+	__u64 rx_dropped; /* Dropped for reasons other than invalid desc */
+	__u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
+	__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
+};
+
+/* Pgoff for mmaping the rings */
+#define XDP_PGOFF_RX_RING			  0
+#define XDP_PGOFF_TX_RING		 0x80000000
+#define XDP_UMEM_PGOFF_FILL_RING	0x100000000ULL
+#define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000ULL
+
+/* Rx/Tx descriptor */
+struct xdp_desc {
+	__u64 addr;
+	__u32 len;
+	__u32 options;
+};
+
+/* UMEM descriptor is __u64 */
+
+#endif /* _LINUX_IF_XDP_H */
diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index bfd9bfc..ee9d536 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@ 
-libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o
+libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 8479162..761691b 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -164,6 +164,9 @@  $(BPF_IN): force elfdep bpfdep
 	@(test -f ../../include/uapi/linux/if_link.h -a -f ../../../include/uapi/linux/if_link.h && ( \
 	(diff -B ../../include/uapi/linux/if_link.h ../../../include/uapi/linux/if_link.h >/dev/null) || \
 	echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_link.h' differs from latest version at 'include/uapi/linux/if_link.h'" >&2 )) || true
+	@(test -f ../../include/uapi/linux/if_xdp.h -a -f ../../../include/uapi/linux/if_xdp.h && ( \
+	(diff -B ../../include/uapi/linux/if_xdp.h ../../../include/uapi/linux/if_xdp.h >/dev/null) || \
+	echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true
 	$(Q)$(MAKE) $(build)=libbpf
 
 $(OUTPUT)libbpf.so: $(BPF_IN)
@@ -174,7 +177,7 @@  $(OUTPUT)libbpf.a: $(BPF_IN)
 	$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
 
 $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a
-	$(QUIET_LINK)$(CXX) $^ -lelf -o $@
+	$(QUIET_LINK)$(CXX) $(INCLUDES) $^ -lelf -o $@
 
 check: check_abi
 
diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst
index 607aae4..45e3788 100644
--- a/tools/lib/bpf/README.rst
+++ b/tools/lib/bpf/README.rst
@@ -9,7 +9,7 @@  described here. It's recommended to follow these conventions whenever a
 new function or type is added to keep libbpf API clean and consistent.
 
 All types and functions provided by libbpf API should have one of the
-following prefixes: ``bpf_``, ``btf_``, ``libbpf_``.
+following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``.
 
 System call wrappers
 --------------------
@@ -62,6 +62,15 @@  Auxiliary functions and types that don't fit well in any of categories
 described above should have ``libbpf_`` prefix, e.g.
 ``libbpf_get_error`` or ``libbpf_prog_type_by_name``.
 
+AF_XDP functions
+-------------------
+
+AF_XDP functions should have an ``xsk_`` prefix, e.g.
+``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists
+of both low-level ring access functions and high-level configuration
+functions. These can be mixed and matched. Note that these functions
+are not reentrant for performance reasons.
+
 libbpf ABI
 ==========
 
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 89c1149..1cc1bb8 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -143,4 +143,11 @@  LIBBPF_0.0.2 {
 		btf_ext__new;
 		btf_ext__reloc_func_info;
 		btf_ext__reloc_line_info;
+		xsk_umem__create;
+		xsk_socket__create;
+		xsk_umem__delete;
+		xsk_socket__delete;
+		xsk_umem__get_data;
+		xsk_umem__fd;
+		xsk_socket__fd;
 } LIBBPF_0.0.1;
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
new file mode 100644
index 0000000..a982a76
--- /dev/null
+++ b/tools/lib/bpf/xsk.c
@@ -0,0 +1,742 @@ 
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2019 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <asm/barrier.h>
+#include <linux/compiler.h>
+#include <linux/filter.h>
+#include <linux/if_ether.h>
+#include <linux/if_link.h>
+#include <linux/if_packet.h>
+#include <linux/if_xdp.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "bpf.h"
+#include "libbpf.h"
+#include "libbpf_util.h"
+#include "nlattr.h"
+#include "xsk.h"
+
+#ifndef SOL_XDP
+ #define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+ #define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+ #define PF_XDP AF_XDP
+#endif
+
+struct xsk_umem {
+	struct xsk_ring_prod *fill;
+	struct xsk_ring_cons *comp;
+	char *umem_area;
+	struct xsk_umem_config config;
+	int fd;
+	int refcount;
+};
+
+struct xsk_socket {
+	struct xsk_ring_cons *rx;
+	struct xsk_ring_prod *tx;
+	__u64 outstanding_tx;
+	struct xsk_umem *umem;
+	struct xsk_socket_config config;
+	int fd;
+	int xsks_map;
+	int ifindex;
+	int prog_fd;
+	int qidconf_map_fd;
+	int xsks_map_fd;
+	__u32 queue_id;
+};
+
+struct xsk_nl_info {
+	bool xdp_prog_attached;
+	int ifindex;
+	int fd;
+};
+
+#define MAX_QUEUES 128
+
+/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit.
+ * Unfortunately, it is not part of glibc.
+ */
+static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags,
+			     int fd, __u64 offset)
+{
+#ifdef __NR_mmap2
+	unsigned int page_shift = __builtin_ffs(getpagesize()) - 1;
+	long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd,
+			   (off_t)(offset >> page_shift));
+
+	return (void *)ret;
+#else
+	return mmap(addr, length, prot, flags, fd, offset);
+#endif
+}
+
+void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
+{
+	return &((char *)(umem->umem_area))[addr];
+}
+
+int xsk_umem__fd(const struct xsk_umem *umem)
+{
+	return umem ? umem->fd : -EINVAL;
+}
+
+int xsk_socket__fd(const struct xsk_socket *xsk)
+{
+	return xsk ? xsk->fd : -EINVAL;
+}
+
+static bool xsk_page_aligned(void *buffer)
+{
+	unsigned long addr = (unsigned long)buffer;
+
+	return !(addr & (getpagesize() - 1));
+}
+
+static void xsk_set_umem_config(struct xsk_umem_config *cfg,
+				const struct xsk_umem_config *usr_cfg)
+{
+	if (!usr_cfg) {
+		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+		return;
+	}
+
+	cfg->fill_size = usr_cfg->fill_size;
+	cfg->comp_size = usr_cfg->comp_size;
+	cfg->frame_size = usr_cfg->frame_size;
+	cfg->frame_headroom = usr_cfg->frame_headroom;
+}
+
+static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
+				      const struct xsk_socket_config *usr_cfg)
+{
+	if (!usr_cfg) {
+		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+		cfg->libbpf_flags = 0;
+		cfg->xdp_flags = 0;
+		cfg->bind_flags = 0;
+		return;
+	}
+
+	cfg->rx_size = usr_cfg->rx_size;
+	cfg->tx_size = usr_cfg->tx_size;
+	cfg->libbpf_flags = usr_cfg->libbpf_flags;
+	cfg->xdp_flags = usr_cfg->xdp_flags;
+	cfg->bind_flags = usr_cfg->bind_flags;
+}
+
+int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
+		     struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
+		     const struct xsk_umem_config *usr_config)
+{
+	struct xdp_mmap_offsets off;
+	struct xdp_umem_reg mr;
+	struct xsk_umem *umem;
+	socklen_t optlen;
+	void *map;
+	int err;
+
+	if (!umem_area || !umem_ptr || !fill || !comp)
+		return -EFAULT;
+	if (!size && !xsk_page_aligned(umem_area))
+		return -EINVAL;
+
+	umem = calloc(1, sizeof(*umem));
+	if (!umem)
+		return -ENOMEM;
+
+	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
+	if (umem->fd < 0) {
+		err = -errno;
+		goto out_umem_alloc;
+	}
+
+	umem->umem_area = umem_area;
+	xsk_set_umem_config(&umem->config, usr_config);
+
+	mr.addr = (uintptr_t)umem_area;
+	mr.len = size;
+	mr.chunk_size = umem->config.frame_size;
+	mr.headroom = umem->config.frame_headroom;
+
+	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
+			 &umem->config.fill_size,
+			 sizeof(umem->config.fill_size));
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
+			 &umem->config.comp_size,
+			 sizeof(umem->config.comp_size));
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	optlen = sizeof(off);
+	err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	map = xsk_mmap(NULL, off.fr.desc +
+		       umem->config.fill_size * sizeof(__u64),
+		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+		       umem->fd, XDP_UMEM_PGOFF_FILL_RING);
+	if (map == MAP_FAILED) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	umem->fill = fill;
+	fill->mask = umem->config.fill_size - 1;
+	fill->size = umem->config.fill_size;
+	fill->producer = map + off.fr.producer;
+	fill->consumer = map + off.fr.consumer;
+	fill->ring = map + off.fr.desc;
+	fill->cached_cons = umem->config.fill_size;
+
+	map = xsk_mmap(NULL,
+		       off.cr.desc + umem->config.comp_size * sizeof(__u64),
+		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+		       umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
+	if (map == MAP_FAILED) {
+		err = -errno;
+		goto out_mmap;
+	}
+
+	umem->comp = comp;
+	comp->mask = umem->config.comp_size - 1;
+	comp->size = umem->config.comp_size;
+	comp->producer = map + off.cr.producer;
+	comp->consumer = map + off.cr.consumer;
+	comp->ring = map + off.cr.desc;
+
+	*umem_ptr = umem;
+	return 0;
+
+out_mmap:
+	munmap(umem->fill,
+	       off.fr.desc + umem->config.fill_size * sizeof(__u64));
+out_socket:
+	close(umem->fd);
+out_umem_alloc:
+	free(umem);
+	return err;
+}
+
+static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
+{
+	struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
+	struct xsk_nl_info *nl_info = cookie;
+	struct ifinfomsg *ifinfo = msg;
+	unsigned char mode;
+	int err;
+
+	if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
+		return 0;
+
+	if (!tb[IFLA_XDP])
+		return 0;
+
+	err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
+				      NULL);
+	if (err)
+		return err;
+
+	if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
+		return 0;
+
+	mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
+	if (mode == XDP_ATTACHED_NONE)
+		return 0;
+
+	nl_info->xdp_prog_attached = true;
+	nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);
+	return 0;
+}
+
+static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
+{
+	struct xsk_nl_info nl_info;
+	unsigned int nl_pid;
+	char err_buf[256];
+	int sock, err;
+
+	sock = libbpf_netlink_open(&nl_pid);
+	if (sock < 0)
+		return false;
+
+	nl_info.xdp_prog_attached = false;
+	nl_info.ifindex = xsk->ifindex;
+	nl_info.fd = -1;
+
+	err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
+	if (err) {
+		libbpf_strerror(err, err_buf, sizeof(err_buf));
+		pr_warning("Error:\n%s\n", err_buf);
+		close(sock);
+		return false;
+	}
+
+	close(sock);
+	xsk->prog_fd = nl_info.fd;
+	return nl_info.xdp_prog_attached;
+}
+
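+/* Loads the built-in XDP program (its C equivalent is shown in the
+ * comment inside the function) and attaches it to xsk->ifindex,
+ * storing the program fd in xsk->prog_fd.
+ */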
+static int xsk_load_xdp_prog(struct xsk_socket *xsk)
+{
+	char bpf_log_buf[BPF_LOG_BUF_SIZE];
+	int err, prog_fd;
+
+	/* This is the C-program:
+	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
+	 * {
+	 *     int *qidconf, index = ctx->rx_queue_index;
+	 *
+	 *     // A set entry here means that the corresponding queue_id
+	 *     // has an active AF_XDP socket bound to it.
+	 *     qidconf = bpf_map_lookup_elem(&qidconf_map, &index);
+	 *     if (!qidconf)
+	 *         return XDP_ABORTED;
+	 *
+	 *     if (*qidconf)
+	 *         return bpf_redirect_map(&xsks_map, index, 0);
+	 *
+	 *     return XDP_PASS;
+	 * }
+	 */
+	struct bpf_insn prog[] = {
+		/* r1 = *(u32 *)(r1 + 16) */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 16),
+		/* *(u32 *)(r10 - 4) = r1 */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -4),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+		BPF_LD_MAP_FD(BPF_REG_1, xsk->qidconf_map_fd),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		/* if r1 == 0 goto +8 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8),
+		BPF_MOV32_IMM(BPF_REG_0, 2),
+		/* r1 = *(u32 *)(r1 + 0) */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+		/* if r1 == 0 goto +5 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+		/* r2 = *(u32 *)(r10 - 4) */
+		BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd),
+		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
+		BPF_MOV32_IMM(BPF_REG_3, 0),
+		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
+		/* The jumps are to this instruction */
+		BPF_EXIT_INSN(),
+	};
+	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+
+	prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt,
+				   "LGPL-2.1 or BSD-2-Clause", 0, bpf_log_buf,
+				   BPF_LOG_BUF_SIZE);
+	if (prog_fd < 0) {
+		pr_warning("BPF log buffer:\n%s", bpf_log_buf);
+		return prog_fd;
+	}
+
+	err = bpf_set_link_xdp_fd(xsk->ifindex, prog_fd, xsk->config.xdp_flags);
+	if (err) {
+		close(prog_fd);
+		return err;
+	}
+
+	xsk->prog_fd = prog_fd;
+	return 0;
+}
+
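+/* Creates the two maps referenced by the built-in XDP program:
+ * qidconf_map (queue_id -> "AF_XDP socket bound" flag) and
+ * xsks_map (queue_id -> AF_XDP socket), both sized MAX_QUEUES.
+ */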
+static int xsk_create_bpf_maps(struct xsk_socket *xsk)
+{
+	int fd;
+
+	fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "qidconf_map",
+				 sizeof(int), sizeof(int), MAX_QUEUES, 0);
+	if (fd < 0)
+		return fd;
+	xsk->qidconf_map_fd = fd;
+
+	fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
+				 sizeof(int), sizeof(int), MAX_QUEUES, 0);
+	if (fd < 0) {
+		close(xsk->qidconf_map_fd);
+		return fd;
+	}
+	xsk->xsks_map_fd = fd;
+
+	return 0;
+}
+
+static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
+{
+	close(xsk->qidconf_map_fd);
+	close(xsk->xsks_map_fd);
+}
+
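+/* Looks up qidconf_map and xsks_map by name among the maps of the
+ * attached XDP program and writes qidconf_value and xsks_value at
+ * index queue_id. If either update fails, any entry already written
+ * is reset to zero before the error is returned.
+ */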
+static int xsk_update_bpf_maps(struct xsk_socket *xsk, int qidconf_value,
+			       int xsks_value)
+{
+	bool qidconf_map_updated = false, xsks_map_updated = false;
+	struct bpf_prog_info prog_info = {};
+	__u32 prog_len = sizeof(prog_info);
+	struct bpf_map_info map_info;
+	__u32 map_len = sizeof(map_info);
+	__u32 *map_ids;
+	int reset_value = 0;
+	__u32 num_maps;
+	unsigned int i;
+	int err;
+
+	err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len);
+	if (err)
+		return err;
+
+	num_maps = prog_info.nr_map_ids;
+
+	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
+	if (!map_ids)
+		return -ENOMEM;
+
+	memset(&prog_info, 0, prog_len);
+	prog_info.nr_map_ids = num_maps;
+	prog_info.map_ids = (__u64)(unsigned long)map_ids;
+
+	err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len);
+	if (err)
+		goto out_map_ids;
+
+	for (i = 0; i < prog_info.nr_map_ids; i++) {
+		int fd;
+
+		fd = bpf_map_get_fd_by_id(map_ids[i]);
+		if (fd < 0) {
+			err = -errno;
+			goto out_maps;
+		}
+
+		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
+		if (err)
+			goto out_maps;
+
+		if (!strcmp(map_info.name, "qidconf_map")) {
+			err = bpf_map_update_elem(fd, &xsk->queue_id,
+						  &qidconf_value, 0);
+			if (err)
+				goto out_maps;
+			qidconf_map_updated = true;
+			xsk->qidconf_map_fd = fd;
+		} else if (!strcmp(map_info.name, "xsks_map")) {
+			err = bpf_map_update_elem(fd, &xsk->queue_id,
+						  &xsks_value, 0);
+			if (err)
+				goto out_maps;
+			xsks_map_updated = true;
+			xsk->xsks_map_fd = fd;
+		}
+
+		if (qidconf_map_updated && xsks_map_updated)
+			break;
+	}
+
+	if (!(qidconf_map_updated && xsks_map_updated)) {
+		err = -ENOENT;
+		goto out_maps;
+	}
+
+	err = 0;
+	goto out_success;
+
+out_maps:
+	if (qidconf_map_updated)
+		(void)bpf_map_update_elem(xsk->qidconf_map_fd, &xsk->queue_id,
+					  &reset_value, 0);
+	if (xsks_map_updated)
+		(void)bpf_map_update_elem(xsk->xsks_map_fd, &xsk->queue_id,
+					  &reset_value, 0);
+out_success:
+	if (qidconf_map_updated)
+		close(xsk->qidconf_map_fd);
+	if (xsks_map_updated)
+		close(xsk->xsks_map_fd);
+out_map_ids:
+	free(map_ids);
+	return err;
+}
+
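+/* If no XDP program is attached to the interface yet, creates the maps
+ * and loads and attaches the built-in program. In either case, marks
+ * queue_id as active in qidconf_map and inserts the socket fd into
+ * xsks_map.
+ */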
+static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
+{
+	bool prog_attached = false;
+	int err;
+
+	if (!xsk_xdp_prog_attached(xsk)) {
+		prog_attached = true;
+		err = xsk_create_bpf_maps(xsk);
+		if (err)
+			return err;
+
+		err = xsk_load_xdp_prog(xsk);
+		if (err)
+			goto out_maps;
+	}
+
+	err = xsk_update_bpf_maps(xsk, true, xsk->fd);
+	if (err)
+		goto out_load;
+
+	return 0;
+
+out_load:
+	if (prog_attached)
+		close(xsk->prog_fd);
+out_maps:
+	if (prog_attached)
+		xsk_delete_bpf_maps(xsk);
+	return err;
+}
+
+int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
+		       __u32 queue_id, struct xsk_umem *umem,
+		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
+		       const struct xsk_socket_config *usr_config)
+{
+	struct sockaddr_xdp sxdp = {};
+	struct xdp_mmap_offsets off;
+	struct xsk_socket *xsk;
+	socklen_t optlen;
+	void *map;
+	int err;
+
+	if (!umem || !xsk_ptr || !rx || !tx)
+		return -EFAULT;
+
+	if (umem->refcount) {
+		pr_warning("Error: shared umems not supported by libbpf.\n");
+		return -EBUSY;
+	}
+
+	xsk = calloc(1, sizeof(*xsk));
+	if (!xsk)
+		return -ENOMEM;
+
+	if (umem->refcount++ > 0) {
+		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
+		if (xsk->fd < 0) {
+			err = -errno;
+			goto out_xsk_alloc;
+		}
+	} else {
+		xsk->fd = umem->fd;
+	}
+
+	xsk->outstanding_tx = 0;
+	xsk->queue_id = queue_id;
+	xsk->umem = umem;
+	xsk->ifindex = if_nametoindex(ifname);
+	if (!xsk->ifindex) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	xsk_set_xdp_socket_config(&xsk->config, usr_config);
+
+	if (rx) {
+		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
+				 &xsk->config.rx_size,
+				 sizeof(xsk->config.rx_size));
+		if (err) {
+			err = -errno;
+			goto out_socket;
+		}
+	}
+	if (tx) {
+		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
+				 &xsk->config.tx_size,
+				 sizeof(xsk->config.tx_size));
+		if (err) {
+			err = -errno;
+			goto out_socket;
+		}
+	}
+
+	optlen = sizeof(off);
+	err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	if (rx) {
+		map = xsk_mmap(NULL, off.rx.desc +
+			       xsk->config.rx_size * sizeof(struct xdp_desc),
+			       PROT_READ | PROT_WRITE,
+			       MAP_SHARED | MAP_POPULATE,
+			       xsk->fd, XDP_PGOFF_RX_RING);
+		if (map == MAP_FAILED) {
+			err = -errno;
+			goto out_socket;
+		}
+
+		rx->mask = xsk->config.rx_size - 1;
+		rx->size = xsk->config.rx_size;
+		rx->producer = map + off.rx.producer;
+		rx->consumer = map + off.rx.consumer;
+		rx->ring = map + off.rx.desc;
+	}
+	xsk->rx = rx;
+
+	if (tx) {
+		map = xsk_mmap(NULL, off.tx.desc +
+			       xsk->config.tx_size * sizeof(struct xdp_desc),
+			       PROT_READ | PROT_WRITE,
+			       MAP_SHARED | MAP_POPULATE,
+			       xsk->fd, XDP_PGOFF_TX_RING);
+		if (map == MAP_FAILED) {
+			err = -errno;
+			goto out_mmap_rx;
+		}
+
+		tx->mask = xsk->config.tx_size - 1;
+		tx->size = xsk->config.tx_size;
+		tx->producer = map + off.tx.producer;
+		tx->consumer = map + off.tx.consumer;
+		tx->ring = map + off.tx.desc;
+		tx->cached_cons = xsk->config.tx_size;
+	}
+	xsk->tx = tx;
+
+	sxdp.sxdp_family = PF_XDP;
+	sxdp.sxdp_ifindex = xsk->ifindex;
+	sxdp.sxdp_queue_id = xsk->queue_id;
+	sxdp.sxdp_flags = xsk->config.bind_flags;
+
+	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
+	if (err) {
+		err = -errno;
+		goto out_mmap_tx;
+	}
+
+	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
+		err = xsk_setup_xdp_prog(xsk);
+		if (err)
+			goto out_mmap_tx;
+	}
+
+	*xsk_ptr = xsk;
+	return 0;
+
+out_mmap_tx:
+	if (tx)
+		munmap(xsk->tx,
+		       off.tx.desc +
+		       xsk->config.tx_size * sizeof(struct xdp_desc));
+out_mmap_rx:
+	if (rx)
+		munmap(xsk->rx,
+		       off.rx.desc +
+		       xsk->config.rx_size * sizeof(struct xdp_desc));
+out_socket:
+	if (--umem->refcount)
+		close(xsk->fd);
+out_xsk_alloc:
+	free(xsk);
+	return err;
+}
+
+int xsk_umem__delete(struct xsk_umem *umem)
+{
+	struct xdp_mmap_offsets off;
+	socklen_t optlen;
+	int err;
+
+	if (!umem)
+		return 0;
+
+	if (umem->refcount)
+		return -EBUSY;
+
+	optlen = sizeof(off);
+	err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (!err) {
+		munmap(umem->fill->ring,
+		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
+		munmap(umem->comp->ring,
+		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
+	}
+
+	close(umem->fd);
+	free(umem);
+
+	return 0;
+}
+
+void xsk_socket__delete(struct xsk_socket *xsk)
+{
+	struct xdp_mmap_offsets off;
+	socklen_t optlen;
+	int err;
+
+	if (!xsk)
+		return;
+
+	(void)xsk_update_bpf_maps(xsk, 0, 0);
+
+	optlen = sizeof(off);
+	err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (!err) {
+		if (xsk->rx)
+			munmap(xsk->rx->ring,
+			       off.rx.desc +
+			       xsk->config.rx_size * sizeof(struct xdp_desc));
+		if (xsk->tx)
+			munmap(xsk->tx->ring,
+			       off.tx.desc +
+			       xsk->config.tx_size * sizeof(struct xdp_desc));
+	}
+
+	xsk->umem->refcount--;
+	/* Do not close the fd if it is shared with the umem; it is owned
+	 * by the umem and closed in xsk_umem__delete().
+	 */
+	if (xsk->fd != xsk->umem->fd)
+		close(xsk->fd);
+	free(xsk);
+}
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
new file mode 100644
index 0000000..a1ab8c9
--- /dev/null
+++ b/tools/lib/bpf/xsk.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2019 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef __LIBBPF_XSK_H
+#define __LIBBPF_XSK_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <linux/if_xdp.h>
+
+#include "libbpf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Do not access these members directly. Use the functions below. */
+#define DEFINE_XSK_RING(name) \
+struct name { \
+	__u32 cached_prod; \
+	__u32 cached_cons; \
+	__u32 mask; \
+	__u32 size; \
+	__u32 *producer; \
+	__u32 *consumer; \
+	void *ring; \
+}
+
+DEFINE_XSK_RING(xsk_ring_prod);
+DEFINE_XSK_RING(xsk_ring_cons);
+
+struct xsk_umem;
+struct xsk_socket;
+
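+/* The accessors below return a pointer to the ring entry at idx; idx is
+ * masked with the ring size, so callers can pass the free-running index
+ * handed back by xsk_ring_prod__reserve() or xsk_ring_cons__peek().
+ */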
+static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill,
+					      __u32 idx)
+{
+	__u64 *addrs = (__u64 *)fill->ring;
+
+	return &addrs[idx & fill->mask];
+}
+
+static inline const __u64 *
+xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx)
+{
+	const __u64 *addrs = (const __u64 *)comp->ring;
+
+	return &addrs[idx & comp->mask];
+}
+
+static inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx,
+						      __u32 idx)
+{
+	struct xdp_desc *descs = (struct xdp_desc *)tx->ring;
+
+	return &descs[idx & tx->mask];
+}
+
+static inline const struct xdp_desc *
+xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
+{
+	const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring;
+
+	return &descs[idx & rx->mask];
+}
+
+static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
+{
+	__u32 free_entries = r->cached_cons - r->cached_prod;
+
+	if (free_entries >= nb)
+		return free_entries;
+
+	/* Refresh the local tail pointer.
+	 * cached_cons is kept r->size larger than the real consumer pointer
+	 * so that this addition can be avoided in the more frequently
+	 * executed code that computes free_entries at the beginning of
+	 * this function. Without this optimization it would have been
+	 * free_entries = r->cached_cons - r->cached_prod + r->size.
+	 */
+	r->cached_cons = *r->consumer + r->size;
+
+	return r->cached_cons - r->cached_prod;
+}
+
+static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
+{
+	__u32 entries = r->cached_prod - r->cached_cons;
+
+	if (entries == 0) {
+		r->cached_prod = *r->producer;
+		entries = r->cached_prod - r->cached_cons;
+	}
+
+	return (entries > nb) ? nb : entries;
+}
+
+static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod,
+					    size_t nb, __u32 *idx)
+{
+	if (unlikely(xsk_prod_nb_free(prod, nb) < nb))
+		return 0;
+
+	*idx = prod->cached_prod;
+	prod->cached_prod += nb;
+
+	return nb;
+}
+
+static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb)
+{
+	/* Make sure everything has been written to the ring before signalling
+	 * this to the kernel.
+	 */
+	smp_wmb();
+
+	*prod->producer += nb;
+}
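+
+/* A minimal producer-side sketch (illustrative only; BATCH, fill and
+ * alloc_frame_addr() are placeholders, error handling omitted): refill
+ * the fill ring with BATCH umem frame addresses for the kernel to
+ * receive into.
+ *
+ *	__u32 idx;
+ *
+ *	if (xsk_ring_prod__reserve(fill, BATCH, &idx) == BATCH) {
+ *		for (int i = 0; i < BATCH; i++)
+ *			*xsk_ring_prod__fill_addr(fill, idx + i) =
+ *				alloc_frame_addr();
+ *		xsk_ring_prod__submit(fill, BATCH);
+ *	}
+ */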
+
+static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons,
+					 size_t nb, __u32 *idx)
+{
+	size_t entries = xsk_cons_nb_avail(cons, nb);
+
+	if (likely(entries > 0)) {
+		/* Make sure we do not speculatively read the data before
+		 * we have received the packet buffers from the ring.
+		 */
+		smp_rmb();
+
+		*idx = cons->cached_cons;
+		cons->cached_cons += entries;
+	}
+
+	return entries;
+}
+
+static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb)
+{
+	*cons->consumer += nb;
+}
+
+static inline void *xsk_umem__get_data_raw(void *umem_area, __u64 addr)
+{
+	return &((char *)umem_area)[addr];
+}
+
+LIBBPF_API void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr);
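+
+/* A minimal consumer-side sketch (illustrative only; BATCH, rx, umem and
+ * process() are placeholders): drain up to BATCH received descriptors
+ * and hand their packet data to the application.
+ *
+ *	__u32 idx;
+ *	size_t rcvd = xsk_ring_cons__peek(rx, BATCH, &idx);
+ *
+ *	for (size_t i = 0; i < rcvd; i++) {
+ *		const struct xdp_desc *desc =
+ *			xsk_ring_cons__rx_desc(rx, idx + i);
+ *
+ *		process(xsk_umem__get_data(umem, desc->addr), desc->len);
+ *	}
+ *	if (rcvd)
+ *		xsk_ring_cons__release(rx, rcvd);
+ */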
+
+LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem);
+LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
+
+#define XSK_RING_CONS__DEFAULT_NUM_DESCS      2048
+#define XSK_RING_PROD__DEFAULT_NUM_DESCS      2048
+#define XSK_UMEM__DEFAULT_FRAME_SHIFT    11 /* 2048 bytes */
+#define XSK_UMEM__DEFAULT_FRAME_SIZE     (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
+#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
+
+struct xsk_umem_config {
+	__u32 fill_size;
+	__u32 comp_size;
+	__u32 frame_size;
+	__u32 frame_headroom;
+};
+
+/* Flags for the libbpf_flags field. */
+#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0)
+
+struct xsk_socket_config {
+	__u32 rx_size;
+	__u32 tx_size;
+	__u32 libbpf_flags;
+	__u32 xdp_flags;
+	__u16 bind_flags;
+};
+
+/* Set config to NULL to get the default configuration. */
+LIBBPF_API int xsk_umem__create(struct xsk_umem **umem,
+				void *umem_area, __u64 size,
+				struct xsk_ring_prod *fill,
+				struct xsk_ring_cons *comp,
+				const struct xsk_umem_config *config);
+LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk,
+				  const char *ifname, __u32 queue_id,
+				  struct xsk_umem *umem,
+				  struct xsk_ring_cons *rx,
+				  struct xsk_ring_prod *tx,
+				  const struct xsk_socket_config *config);
+
+/* Returns 0 for success and -EBUSY if the umem is still in use. */
+LIBBPF_API int xsk_umem__delete(struct xsk_umem *umem);
+LIBBPF_API void xsk_socket__delete(struct xsk_socket *xsk);
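+
+/* A minimal setup/teardown sketch (illustrative only; bufs, NUM_FRAMES,
+ * "eth0", queue 0 and the error handling are placeholders). Passing NULL
+ * for the configs selects the defaults defined above. The umem area must
+ * be page aligned.
+ *
+ *	struct xsk_ring_prod fill, tx;
+ *	struct xsk_ring_cons comp, rx;
+ *	struct xsk_umem *umem;
+ *	struct xsk_socket *xsk;
+ *	__u64 size = NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
+ *	void *bufs;
+ *
+ *	if (posix_memalign(&bufs, getpagesize(), size) ||
+ *	    xsk_umem__create(&umem, bufs, size, &fill, &comp, NULL) ||
+ *	    xsk_socket__create(&xsk, "eth0", 0, umem, &rx, &tx, NULL))
+ *		exit(EXIT_FAILURE);
+ *
+ *	... use the rings ...
+ *
+ *	xsk_socket__delete(xsk);
+ *	xsk_umem__delete(umem);
+ */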
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __LIBBPF_XSK_H */