@@ -2957,6 +2957,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
S: Maintained
F: net/netmap.c
+AF_XDP network backend
+R: Ilya Maximets <i.maximets@ovn.org>
+F: net/af-xdp.c
+
Host Memory Backends
M: David Hildenbrand <david@redhat.com>
M: Igor Mammedov <imammedo@redhat.com>
@@ -1296,6 +1296,9 @@ ERST
.name = "netdev_add",
.args_type = "netdev:O",
.params = "[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
+#ifdef CONFIG_AF_XDP
+ "|af-xdp"
+#endif
#ifdef CONFIG_VMNET
"|vmnet-host|vmnet-shared|vmnet-bridged"
#endif
@@ -1873,6 +1873,13 @@ if libbpf.found() and not cc.links('''
endif
endif
+# libxdp
+libxdp = not_found
+if not get_option('af_xdp').auto() or have_system
+ libxdp = dependency('libxdp', required: get_option('af_xdp'),
+ version: '>=1.4.0', method: 'pkg-config')
+endif
+
# libdw
libdw = not_found
if not get_option('libdw').auto() or \
@@ -2099,6 +2106,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', get_option('hexagon_idef_pars
config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
config_host_data.set('CONFIG_EBPF', libbpf.found())
+config_host_data.set('CONFIG_AF_XDP', libxdp.found())
config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
config_host_data.set('CONFIG_LIBNFS', libnfs.found())
@@ -4270,6 +4278,7 @@ summary_info = {}
if targetos == 'darwin'
summary_info += {'vmnet.framework support': vmnet}
endif
+summary_info += {'AF_XDP support': libxdp}
summary_info += {'slirp support': slirp}
summary_info += {'vde support': vde}
summary_info += {'netmap support': have_netmap}
@@ -122,6 +122,8 @@ option('avx512bw', type: 'feature', value: 'auto',
option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
+option('af_xdp', type : 'feature', value : 'auto',
+ description: 'AF_XDP network backend support')
option('attr', type : 'feature', value : 'auto',
description: 'attr/xattr support')
option('auth_pam', type : 'feature', value : 'auto',
new file mode 100644
@@ -0,0 +1,526 @@
+/*
+ * AF_XDP network backend.
+ *
+ * Copyright (c) 2023 Red Hat, Inc.
+ *
+ * Authors:
+ * Ilya Maximets <i.maximets@ovn.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+
+#include "qemu/osdep.h"
+#include <bpf/bpf.h>
+#include <inttypes.h>
+#include <linux/if_link.h>
+#include <linux/if_xdp.h>
+#include <net/if.h>
+#include <xdp/xsk.h>
+
+#include "clients.h"
+#include "monitor/monitor.h"
+#include "net/net.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "qemu/memalign.h"
+
+
+typedef struct AFXDPState { /* Per-queue state of the af-xdp backend. */
+ NetClientState nc; /* Embedded net client; DO_UPCAST() target. */
+
+ struct xsk_socket *xsk; /* Socket set up by xsk_socket__create(). */
+ struct xsk_ring_cons rx; /* Rx ring: packets consumed in af_xdp_send(). */
+ struct xsk_ring_prod tx; /* Tx ring: packets produced in af_xdp_receive(). */
+ struct xsk_ring_cons cq; /* Completion ring: addresses of sent buffers. */
+ struct xsk_ring_prod fq; /* Fill ring: free buffers offered for Rx. */
+
+ char ifname[IFNAMSIZ]; /* Copy of the 'ifname' option. */
+ int ifindex; /* Result of if_nametoindex() for ifname. */
+ bool read_poll; /* True while af_xdp_send() is the fd read handler. */
+ bool write_poll; /* True while af_xdp_writable() is the fd write handler. */
+ uint32_t outstanding_tx; /* Submitted Tx descriptors not yet seen on cq. */
+
+ uint64_t *pool; /* LIFO stack of free umem frame addresses. */
+ uint32_t n_pool; /* Number of valid entries in pool. */
+ char *buffer; /* Memory area registered as the umem. */
+ struct xsk_umem *umem; /* Handle set up by xsk_umem__create*(). */
+
+ uint32_t n_queues; /* Queue count of the netdev; checked in cleanup. */
+ uint32_t xdp_flags; /* xdp_flags the socket was created with. */
+ bool inhibit; /* True if loading of the default XDP program is inhibited. */
+} AFXDPState;
+
+#define AF_XDP_BATCH_SIZE 64 /* Max Rx descriptors per af_xdp_send() call. */
+
+static void af_xdp_send(void *opaque); /* Installed as the fd read handler. */
+static void af_xdp_writable(void *opaque); /* Installed as the fd write handler. */
+
+/*
+ * (Re)install the event-loop fd handlers for this queue's AF_XDP socket.
+ * A callback is passed only while the matching poll flag is set; NULL is
+ * passed in its place otherwise.
+ */
+static void af_xdp_update_fd_handler(AFXDPState *s)
+{
+    void (*on_read)(void *) = NULL;
+    void (*on_write)(void *) = NULL;
+
+    if (s->read_poll) {
+        on_read = af_xdp_send;
+    }
+    if (s->write_poll) {
+        on_write = af_xdp_writable;
+    }
+
+    qemu_set_fd_handler(xsk_socket__fd(s->xsk), on_read, on_write, s);
+}
+
+/* Turn read polling on or off; does nothing if the state is unchanged. */
+static void af_xdp_read_poll(AFXDPState *s, bool enable)
+{
+    if (s->read_poll == enable) {
+        return;
+    }
+
+    s->read_poll = enable;
+    af_xdp_update_fd_handler(s);
+}
+
+/* Turn write polling on or off; does nothing if the state is unchanged. */
+static void af_xdp_write_poll(AFXDPState *s, bool enable)
+{
+    if (s->write_poll == enable) {
+        return;
+    }
+
+    s->write_poll = enable;
+    af_xdp_update_fd_handler(s);
+}
+
+/*
+ * The .poll callback of the backend: force both read and write polling
+ * to 'enable', refreshing the fd handlers only if a flag really changes.
+ */
+static void af_xdp_poll(NetClientState *nc, bool enable)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+
+    if (s->read_poll == enable && s->write_poll == enable) {
+        return;
+    }
+
+    s->write_poll = enable;
+    s->read_poll = enable;
+    af_xdp_update_fd_handler(s);
+}
+
+/*
+ * Drain the completion ring: the address of every completed descriptor
+ * is pushed back onto the free pool and the outstanding Tx counter is
+ * decremented once per descriptor.
+ */
+static void af_xdp_complete_tx(AFXDPState *s)
+{
+    uint32_t idx = 0;
+    uint32_t n_done, k;
+
+    n_done = xsk_ring_cons__peek(&s->cq,
+                                 XSK_RING_CONS__DEFAULT_NUM_DESCS, &idx);
+    if (!n_done) {
+        return;
+    }
+
+    for (k = 0; k < n_done; k++, idx++) {
+        s->pool[s->n_pool++] = *xsk_ring_cons__comp_addr(&s->cq, idx);
+        s->outstanding_tx--;
+    }
+
+    xsk_ring_cons__release(&s->cq, n_done);
+}
+
+/*
+ * The fd_write() callback, run when a poll reports the socket fd as
+ * writable.
+ */
+static void af_xdp_writable(void *opaque)
+{
+    AFXDPState *s = opaque;
+    bool keep_polling;
+
+    /* Reclaim the buffers of packets that were already sent. */
+    af_xdp_complete_tx(s);
+
+    /*
+     * The handler stays registered only while packets are still waiting
+     * for transmission and the kernel needs a wake up on the Tx ring.
+     */
+    keep_polling = s->outstanding_tx && xsk_ring_prod__needs_wakeup(&s->tx);
+    if (!keep_polling) {
+        af_xdp_write_poll(s, false);
+    }
+
+    /* Flush any buffered packets. */
+    qemu_flush_queued_packets(&s->nc);
+}
+
+/*
+ * The .receive callback of the backend: copy one packet coming from the
+ * peer into a free umem frame and submit it on the Tx ring.
+ *
+ * Returns 'size' if the packet was submitted, and also if it is larger
+ * than a umem frame (nothing is submitted in that case).  Returns 0 if
+ * there is no free buffer or no room in the Tx ring; write polling is
+ * enabled before returning 0.
+ */
+static ssize_t af_xdp_receive(NetClientState *nc,
+                              const uint8_t *buf, size_t size)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+    struct xdp_desc *tx_desc;
+    uint32_t slot;
+
+    /* Try to recover buffers that are already sent. */
+    af_xdp_complete_tx(s);
+
+    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
+        /* Doesn't fit into a single frame, report it as consumed. */
+        return size;
+    }
+
+    if (s->n_pool == 0 || xsk_ring_prod__reserve(&s->tx, 1, &slot) == 0) {
+        /*
+         * Out of buffers or space in tx ring.  Poll until we can write.
+         * This will also kick the Tx, if it was waiting on CQ.
+         */
+        af_xdp_write_poll(s, true);
+        return 0;
+    }
+
+    tx_desc = xsk_ring_prod__tx_desc(&s->tx, slot);
+    tx_desc->addr = s->pool[--s->n_pool];
+    tx_desc->len = size;
+    memcpy(xsk_umem__get_data(s->buffer, tx_desc->addr), buf, size);
+
+    xsk_ring_prod__submit(&s->tx, 1);
+    s->outstanding_tx++;
+
+    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
+        af_xdp_write_poll(s, true);
+    }
+
+    return size;
+}
+
+/*
+ * Completion callback of a queued send (backend --> guest): the peer
+ * took the packet, so the fd_read callback is enabled again.
+ */
+static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
+{
+    af_xdp_read_poll(DO_UPCAST(AFXDPState, nc, nc), true);
+}
+
+/*
+ * Move up to 'n' free buffers from the pool to the fill ring and enable
+ * read polling if the fill ring reports that a wake up is needed.
+ * Nothing happens if there are no buffers to move or the ring has no
+ * room for them.
+ */
+static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
+{
+    uint32_t k, idx = 0;
+
+    /*
+     * The pool can't cover 'n' buffers plus a spare one for Tx: hand over
+     * whatever is left instead.
+     * NOTE(review): this fallback does not keep a buffer back for Tx
+     * itself -- confirm that this is the intent.
+     */
+    if (s->n_pool < n + 1) {
+        n = s->n_pool;
+    }
+
+    if (n == 0 || xsk_ring_prod__reserve(&s->fq, n, &idx) == 0) {
+        return;
+    }
+
+    for (k = 0; k < n; k++, idx++) {
+        s->n_pool--;
+        *xsk_ring_prod__fill_addr(&s->fq, idx) = s->pool[s->n_pool];
+    }
+    xsk_ring_prod__submit(&s->fq, n);
+
+    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
+        /* Receive was blocked by not having enough buffers.  Wake it up. */
+        af_xdp_read_poll(s, true);
+    }
+}
+
+/*
+ * The fd_read() callback: take up to AF_XDP_BATCH_SIZE descriptors from
+ * the Rx ring and pass each packet to the peer.  If the peer stops
+ * accepting packets, read polling is disabled and the descriptors that
+ * were not looked at are handed back to the ring.
+ */
+static void af_xdp_send(void *opaque)
+{
+    AFXDPState *s = opaque;
+    uint32_t n_peeked, n_used = 0, idx = 0;
+
+    n_peeked = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
+    if (n_peeked == 0) {
+        return;
+    }
+
+    while (n_used < n_peeked) {
+        const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&s->rx, idx++);
+        struct iovec iov = {
+            .iov_base = xsk_umem__get_data(s->buffer, desc->addr),
+            .iov_len = desc->len,
+        };
+
+        /* The frame address returns to the pool before the hand-off. */
+        s->pool[s->n_pool++] = desc->addr;
+        n_used++;
+
+        if (qemu_sendv_packet_async(&s->nc, &iov, 1,
+                                    af_xdp_send_completed)) {
+            continue;
+        }
+
+        /*
+         * The peer does not receive anymore.  Packet is queued, stop
+         * reading from the backend until af_xdp_send_completed().
+         */
+        af_xdp_read_poll(s, false);
+
+        /* Return unused descriptors to not break the ring cache. */
+        xsk_ring_cons__cancel(&s->rx, n_peeked - n_used);
+        break;
+    }
+
+    /* Release the descriptors that were really used and try to re-fill. */
+    xsk_ring_cons__release(&s->rx, n_used);
+    af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
+}
+
+/*
+ * The .cleanup callback of the backend: purge queued packets, stop
+ * polling and release the socket, the pool, the umem and its buffer.
+ * The XDP program is detached only by the last queue, and only when
+ * program loading was not inhibited and xdp_flags is non-zero.
+ */
+static void af_xdp_cleanup(NetClientState *nc)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+    bool last_queue;
+
+    qemu_purge_queued_packets(nc);
+
+    af_xdp_poll(nc, false);
+
+    xsk_socket__delete(s->xsk);
+    s->xsk = NULL;
+    g_free(s->pool);
+    s->pool = NULL;
+    xsk_umem__delete(s->umem);
+    s->umem = NULL;
+    qemu_vfree(s->buffer);
+    s->buffer = NULL;
+
+    /* Remove the program if it's the last open queue. */
+    last_queue = nc->queue_index == s->n_queues - 1;
+    if (s->inhibit || !last_queue || !s->xdp_flags) {
+        return;
+    }
+
+    if (bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
+        fprintf(stderr,
+                "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
+                s->ifname, s->ifindex);
+    }
+}
+
+/*
+ * Allocate the packet buffer area, register it as the umem of this queue
+ * and set up the pool of free frame addresses, then pre-fill the fill
+ * ring.
+ *
+ * @sock_fd: fd passed to xsk_umem__create_with_fd(); if negative,
+ *           xsk_umem__create() is used instead.
+ *
+ * Returns 0 on success.  On failure the buffer is released, @errp is set
+ * and -1 is returned.
+ */
+static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
+{
+    struct xsk_umem_config config = {
+        .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+        .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+        .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+        .frame_headroom = 0,
+    };
+    uint64_t n_descs;
+    uint64_t size;
+    uint64_t i;
+    int ret;
+
+    /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
+    n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
+               + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
+    size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;
+
+    s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
+    memset(s->buffer, 0, size);
+
+    if (sock_fd < 0) {
+        ret = xsk_umem__create(&s->umem, s->buffer, size,
+                               &s->fq, &s->cq, &config);
+    } else {
+        ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
+                                       &s->fq, &s->cq, &config);
+    }
+
+    if (ret) {
+        /* Read errno before anything else gets a chance to change it. */
+        int saved_errno = errno;
+
+        qemu_vfree(s->buffer);
+        /* af_xdp_cleanup() frees s->buffer as well, don't leave it dangling. */
+        s->buffer = NULL;
+        error_setg_errno(errp, saved_errno,
+                         "failed to create umem for %s queue_index: %d",
+                         s->ifname, s->nc.queue_index);
+        return -1;
+    }
+
+    s->pool = g_new(uint64_t, n_descs);
+    /*
+     * One entry per frame.  Valid indexes are 0 .. n_descs - 1; the pool
+     * is used as a LIFO, so the highest addresses are handed out first.
+     */
+    for (i = 0; i < n_descs; i++) {
+        s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
+    }
+    s->n_pool = n_descs;
+
+    af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);
+
+    return 0;
+}
+
+/*
+ * Create the AF_XDP socket of this queue on top of the already created
+ * umem.  The queue id is the queue index of the net client plus the
+ * 'start-queue' option, if that option is positive.
+ *
+ * With an explicit 'mode' only that attach mode is tried.  Without it
+ * native mode is tried first and skb mode second.
+ *
+ * Returns 0 on success and stores the used xdp_flags in @s.  Returns -1
+ * with @errp set if the last creation attempt failed with a non-zero
+ * errno.
+ */
+static int af_xdp_socket_create(AFXDPState *s,
+                                const NetdevAFXDPOptions *opts, Error **errp)
+{
+    struct xsk_socket_config cfg = {
+        .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+        .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+        .libxdp_flags = 0,
+        .bind_flags = XDP_USE_NEED_WAKEUP,
+        .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
+    };
+    int queue_id = s->nc.queue_index;
+    int error = 0;
+
+    if (opts->has_inhibit && opts->inhibit) {
+        s->inhibit = true;
+        cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
+    } else {
+        s->inhibit = false;
+    }
+
+    if (opts->has_force_copy && opts->force_copy) {
+        cfg.bind_flags |= XDP_COPY;
+    }
+
+    if (opts->has_start_queue && opts->start_queue > 0) {
+        queue_id += opts->start_queue;
+    }
+
+    if (opts->has_mode) {
+        /* Specific mode requested. */
+        if (opts->mode == AFXDP_MODE_NATIVE) {
+            cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
+        } else {
+            cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
+        }
+
+        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                               s->umem, &s->rx, &s->tx, &cfg)) {
+            error = errno;
+        }
+    } else {
+        /* No mode requested, try native first. */
+        cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                               s->umem, &s->rx, &s->tx, &cfg)) {
+            /* Can't use native mode, try skb. */
+            cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
+            cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
+
+            if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                                   s->umem, &s->rx, &s->tx, &cfg)) {
+                error = errno;
+            }
+        }
+    }
+
+    if (error) {
+        error_setg_errno(errp, error,
+                         "failed to create AF_XDP socket for %s queue_id: %d",
+                         s->ifname, queue_id);
+        return -1;
+    }
+
+    s->xdp_flags = cfg.xdp_flags;
+
+    return 0;
+}
+
+/* NetClientInfo of the af-xdp backend, passed to qemu_new_net_client(). */
+static NetClientInfo net_af_xdp_info = {
+ .type = NET_CLIENT_DRIVER_AF_XDP,
+ .size = sizeof(AFXDPState), /* Per-client state is an AFXDPState. */
+ .receive = af_xdp_receive, /* Peer --> backend: queue a packet on Tx. */
+ .poll = af_xdp_poll, /* Enable or disable both fd handlers. */
+ .cleanup = af_xdp_cleanup, /* Release the socket, umem and buffers. */
+};
+
+/*
+ * Split the colon separated @sock_fds_str and resolve every element
+ * with monitor_fd_param().
+ *
+ * Returns a g_new() allocated array with @n_expected descriptors, to be
+ * released by the caller with g_free().  Returns NULL if the number of
+ * elements differs from @n_expected (@errp is set) or if
+ * monitor_fd_param() returns a negative value for an element (@errp is
+ * handed to that call).
+ */
+static int *parse_socket_fds(const char *sock_fds_str,
+                             int64_t n_expected, Error **errp)
+{
+    gchar **tokens = g_strsplit(sock_fds_str, ":", -1);
+    int64_t n_tokens = g_strv_length(tokens);
+    int *fds = NULL;
+    int64_t k;
+
+    if (n_tokens != n_expected) {
+        error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
+                   n_expected, n_tokens);
+        g_strfreev(tokens);
+        return NULL;
+    }
+
+    fds = g_new(int, n_tokens);
+
+    for (k = 0; k < n_tokens; k++) {
+        fds[k] = monitor_fd_param(monitor_cur(), tokens[k], errp);
+        if (fds[k] < 0) {
+            g_free(fds);
+            fds = NULL;
+            break;
+        }
+    }
+
+    g_strfreev(tokens);
+    return fds;
+}
+
+/*
+ * The exported init function.
+ *
+ * ... -netdev af-xdp,ifname="..."
+ *
+ * Creates one net client per requested queue, each with its own umem and
+ * AF_XDP socket, and checks that an XDP program is loaded on the
+ * interface.
+ *
+ * Returns 0 on success.  On failure @errp is set, every net client
+ * created so far is deleted and -1 is returned.
+ */
+int net_init_af_xdp(const Netdev *netdev,
+                    const char *name, NetClientState *peer, Error **errp)
+{
+    const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
+    NetClientState *nc, *nc0 = NULL;
+    unsigned int ifindex;
+    uint32_t prog_id = 0;
+    /* Only needed while the queues are set up; freed on every return. */
+    g_autofree int *sock_fds = NULL;
+    int64_t i, queues;
+    AFXDPState *s;
+
+    ifindex = if_nametoindex(opts->ifname);
+    if (!ifindex) {
+        error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
+                         opts->ifname);
+        return -1;
+    }
+
+    queues = opts->has_queues ? opts->queues : 1;
+    if (queues < 1) {
+        error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
+                   queues, opts->ifname);
+        return -1;
+    }
+
+    if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
+        error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
+        return -1;
+    }
+
+    if (opts->sock_fds) {
+        sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
+        if (!sock_fds) {
+            return -1;
+        }
+    }
+
+    for (i = 0; i < queues; i++) {
+        nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
+        qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
+        nc->queue_index = i;
+
+        if (!nc0) {
+            nc0 = nc;
+        }
+
+        s = DO_UPCAST(AFXDPState, nc, nc);
+
+        pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
+        s->ifindex = ifindex;
+        s->n_queues = queues;
+
+        if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
+            || af_xdp_socket_create(s, opts, errp)) {
+            /* Make sure the XDP program will be removed. */
+            s->n_queues = i;
+            goto err;
+        }
+    }
+
+    if (nc0) {
+        s = DO_UPCAST(AFXDPState, nc, nc0);
+        if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
+            error_setg_errno(errp, errno,
+                             "no XDP program loaded on '%s', ifindex: %d",
+                             s->ifname, s->ifindex);
+            goto err;
+        }
+    }
+
+    /*
+     * Initially only poll for reads.
+     * NOTE(review): 's' is the state of the first queue here, so only that
+     * queue gets its read handler enabled at this point -- confirm.
+     */
+    af_xdp_read_poll(s, true);
+
+    return 0;
+
+err:
+    if (nc0) {
+        qemu_del_net_client(nc0);
+    }
+
+    return -1;
+}
@@ -64,6 +64,11 @@ int net_init_netmap(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
#endif
+#ifdef CONFIG_AF_XDP
+int net_init_af_xdp(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+#endif
+
int net_init_vhost_user(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
@@ -36,6 +36,9 @@ system_ss.add(when: vde, if_true: files('vde.c'))
if have_netmap
system_ss.add(files('netmap.c'))
endif
+
+system_ss.add(when: libxdp, if_true: files('af-xdp.c'))
+
if have_vhost_net_user
system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c'))
system_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-user-stub.c'))
@@ -1091,6 +1091,9 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
#ifdef CONFIG_NETMAP
[NET_CLIENT_DRIVER_NETMAP] = net_init_netmap,
#endif
+#ifdef CONFIG_AF_XDP
+ [NET_CLIENT_DRIVER_AF_XDP] = net_init_af_xdp,
+#endif
#ifdef CONFIG_NET_BRIDGE
[NET_CLIENT_DRIVER_BRIDGE] = net_init_bridge,
#endif
@@ -1195,6 +1198,9 @@ void show_netdevs(void)
#ifdef CONFIG_NETMAP
"netmap",
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp",
+#endif
#ifdef CONFIG_POSIX
"vhost-user",
#endif
@@ -409,6 +409,60 @@
'*devname': 'str' } }
##
+# @AFXDPMode:
+#
+# Attach mode for a default XDP program
+#
+# @skb: generic mode, no driver support necessary
+#
+# @native: DRV mode, program is attached to a driver, packets are passed to
+# the socket without allocation of skb.
+#
+# Since: 8.2
+##
+{ 'enum': 'AFXDPMode',
+ 'data': [ 'native', 'skb' ],
+ 'if': 'CONFIG_AF_XDP' }
+
+##
+# @NetdevAFXDPOptions:
+#
+# AF_XDP network backend
+#
+# @ifname: The name of an existing network interface.
+#
+# @mode: Attach mode for a default XDP program. If not specified, then
+# 'native' will be tried first, then 'skb'.
+#
+# @force-copy: Force XDP copy mode even if device supports zero-copy.
+# (default: false)
+#
+# @queues: number of queues to be used for multiqueue interfaces (default: 1).
+#
+# @start-queue: Use @queues starting from this queue number (default: 0).
+#
+# @inhibit: Don't load a default XDP program, use one already loaded to
+# the interface (default: false). Requires @sock-fds.
+#
+# @sock-fds: A colon (:) separated list of file descriptors for already open
+# but not bound AF_XDP sockets in the queue order. One fd per queue.
+# These descriptors should already be added into XDP socket map for
+# corresponding queues. Requires @inhibit.
+#
+# Since: 8.2
+##
+{ 'struct': 'NetdevAFXDPOptions',
+ 'data': {
+ 'ifname': 'str',
+ '*mode': 'AFXDPMode',
+ '*force-copy': 'bool',
+ '*queues': 'int',
+ '*start-queue': 'int',
+ '*inhibit': 'bool',
+ '*sock-fds': 'str' },
+ 'if': 'CONFIG_AF_XDP' }
+
+##
# @NetdevVhostUserOptions:
#
# Vhost-user network backend
@@ -642,6 +696,7 @@
# @vmnet-bridged: since 7.1
# @stream: since 7.2
# @dgram: since 7.2
+# @af-xdp: since 8.2
#
# Since: 2.7
##
@@ -649,6 +704,7 @@
'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'stream',
'dgram', 'vde', 'bridge', 'hubport', 'netmap', 'vhost-user',
'vhost-vdpa',
+ { 'name': 'af-xdp', 'if': 'CONFIG_AF_XDP' },
{ 'name': 'vmnet-host', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-shared', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-bridged', 'if': 'CONFIG_VMNET' }] }
@@ -679,6 +735,8 @@
'bridge': 'NetdevBridgeOptions',
'hubport': 'NetdevHubPortOptions',
'netmap': 'NetdevNetmapOptions',
+ 'af-xdp': { 'type': 'NetdevAFXDPOptions',
+ 'if': 'CONFIG_AF_XDP' },
'vhost-user': 'NetdevVhostUserOptions',
'vhost-vdpa': 'NetdevVhostVDPAOptions',
'vmnet-host': { 'type': 'NetdevVmnetHostOptions',
@@ -2882,6 +2882,19 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
" VALE port (created on the fly) called 'name' ('nmname' is name of the \n"
" netmap device, defaults to '/dev/netmap')\n"
#endif
+#ifdef CONFIG_AF_XDP
+ "-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off]\n"
+ " [,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]\n"
+ " attach to the existing network interface 'name' with AF_XDP socket\n"
+ " use 'mode=MODE' to specify an XDP program attach mode\n"
+ " use 'force-copy=on|off' to force XDP copy mode even if device supports zero-copy (default: off)\n"
+ " use 'inhibit=on|off' to inhibit loading of a default XDP program (default: off)\n"
+ " with inhibit=on,\n"
+ " use 'sock-fds' to provide file descriptors for already open AF_XDP sockets\n"
+ " added to a socket map in XDP program. One socket per queue.\n"
+ " use 'queues=n' to specify how many queues of a multiqueue interface should be used\n"
+ " use 'start-queue=m' to specify the first queue that should be used\n"
+#endif
#ifdef CONFIG_POSIX
"-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
" configure a vhost-user network, backed by a chardev 'dev'\n"
@@ -2927,6 +2940,9 @@ DEF("nic", HAS_ARG, QEMU_OPTION_nic,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp|"
+#endif
#ifdef CONFIG_POSIX
"vhost-user|"
#endif
@@ -2955,6 +2971,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp|"
+#endif
#ifdef CONFIG_VMNET
"vmnet-host|vmnet-shared|vmnet-bridged|"
#endif
@@ -2962,7 +2981,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
" old way to initialize a host network interface\n"
" (use the -netdev option if possible instead)\n", QEMU_ARCH_ALL)
SRST
-``-nic [tap|bridge|user|l2tpv3|vde|netmap|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
+``-nic [tap|bridge|user|l2tpv3|vde|netmap|af-xdp|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
This option is a shortcut for configuring both the on-board
(default) guest NIC hardware and the host network backend in one go.
The host backend options are the same as with the corresponding
@@ -3376,6 +3395,55 @@ SRST
# launch QEMU instance
|qemu_system| linux.img -nic vde,sock=/tmp/myswitch
+``-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off][,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]``
+ Configure AF_XDP backend to connect to a network interface 'name'
+ using AF_XDP socket. A specific program attach mode for a default
+ XDP program can be forced with 'mode', defaults to best-effort,
+ where the likely most performant mode will be in use. Number of queues
+ 'n' should generally match the number of queues in the interface,
+ defaults to 1. Traffic arriving on non-configured device queues will
+ not be delivered to the network backend.
+
+ .. parsed-literal::
+
+ # set number of queues to 4
+ ethtool -L eth0 combined 4
+ # launch QEMU instance
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=4
+
+ 'start-queue' option can be specified if a particular range of queues
+ [m, m + n - 1] should be in use. For example, this may be necessary in
+ order to use certain NICs in native mode. Kernel allows the driver to
+ create a separate set of XDP queues on top of regular ones, and only
+ these queues can be used for AF_XDP sockets. NICs that work this way
+ may also require an additional traffic redirection with ethtool to these
+ special queues.
+
+ .. parsed-literal::
+
+ # set number of queues to 1
+ ethtool -L eth0 combined 1
+ # redirect all the traffic to the second queue (id: 1)
+ # note: drivers may require non-empty key/mask pair.
+ ethtool -N eth0 flow-type ether \\
+ dst 00:00:00:00:00:00 m FF:FF:FF:FF:FF:FE action 1
+ ethtool -N eth0 flow-type ether \\
+ dst 00:00:00:00:00:01 m FF:FF:FF:FF:FF:FE action 1
+ # launch QEMU instance
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=1,start-queue=1
+
+ XDP program can also be loaded externally. In this case 'inhibit' option
+ should be set to 'on' and 'sock-fds' provided with file descriptors for
+ already open but not bound XDP sockets already added to a socket map for
+ corresponding queues. One socket per queue.
+
+ .. parsed-literal::
+
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=3,inhibit=on,sock-fds=15:16:17
+
``-netdev vhost-user,chardev=id[,vhostforce=on|off][,queues=n]``
Establish a vhost-user netdev, backed by a chardev id. The chardev
should be a unix domain socket backed one. The vhost-user uses a
@@ -35,6 +35,7 @@
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
--tls-priority=@QEMU,SYSTEM \
+--disable-af-xdp \
--disable-attr \
--disable-auth-pam \
--disable-avx2 \
@@ -76,6 +76,7 @@ meson_options_help() {
printf "%s\n" 'disabled with --disable-FEATURE, default is enabled if available'
printf "%s\n" '(unless built with --without-default-features):'
printf "%s\n" ''
+ printf "%s\n" ' af-xdp AF_XDP network backend support'
printf "%s\n" ' alsa ALSA sound support'
printf "%s\n" ' attr attr/xattr support'
printf "%s\n" ' auth-pam PAM access control'
@@ -208,6 +209,8 @@ meson_options_help() {
}
_meson_option_parse() {
case $1 in
+ --enable-af-xdp) printf "%s" -Daf_xdp=enabled ;;
+ --disable-af-xdp) printf "%s" -Daf_xdp=disabled ;;
--enable-alsa) printf "%s" -Dalsa=enabled ;;
--disable-alsa) printf "%s" -Dalsa=disabled ;;
--enable-attr) printf "%s" -Dattr=enabled ;;
@@ -59,6 +59,7 @@ RUN apk update && \
libtasn1-dev \
liburing-dev \
libusb-dev \
+ libxdp-dev \
linux-pam-dev \
llvm \
lttng-ust-dev \
@@ -75,6 +75,7 @@ RUN dnf distro-sync -y && \
libubsan \
liburing-devel \
libusbx-devel \
+ libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \
@@ -82,6 +82,7 @@ exec "$@"\n' > /usr/bin/nosync && \
libubsan \
liburing-devel \
libusbx-devel \
+ libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \
@@ -69,6 +69,7 @@ packages:
- liburing
- libusbx
- libvdeplug
+ - libxdp
- libzstd
- llvm
- lttng-ust