From patchwork Thu May 2 08:39:17 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Magnus Karlsson X-Patchwork-Id: 1094085 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Original-To: patchwork-incoming-netdev@ozlabs.org Delivered-To: patchwork-incoming-netdev@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=netdev-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=intel.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 44vpbk3hHxz9sBV for ; Thu, 2 May 2019 18:39:46 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726439AbfEBIjp (ORCPT ); Thu, 2 May 2019 04:39:45 -0400 Received: from mga02.intel.com ([134.134.136.20]:64457 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1725951AbfEBIjo (ORCPT ); Thu, 2 May 2019 04:39:44 -0400 X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga101.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 02 May 2019 01:39:44 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.60,421,1549958400"; d="scan'208";a="296322386" Received: from mkarlsso-mobl.ger.corp.intel.com (HELO VM.isw.intel.com) ([10.103.211.43]) by orsmga004.jf.intel.com with ESMTP; 02 May 2019 01:39:40 -0700 From: Magnus Karlsson To: magnus.karlsson@intel.com, bjorn.topel@intel.com, ast@kernel.org, daniel@iogearbox.net, netdev@vger.kernel.org, brouer@redhat.com Cc: bpf@vger.kernel.org, bruce.richardson@intel.com, ciara.loftus@intel.com, jakub.kicinski@netronome.com, xiaolong.ye@intel.com, qi.z.zhang@intel.com, maximmi@mellanox.com, sridhar.samudrala@intel.com, kevin.laatz@intel.com Subject: [RFC bpf-next 1/7] net: fs: make busy poll budget configurable in napi_busy_loop Date: Thu, 2 May 2019 10:39:17 +0200 Message-Id: <1556786363-28743-2-git-send-email-magnus.karlsson@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> References: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org This patch adds the possibility to set the busy poll budget to something else than 8 in napi_busy_loop. All the current users of napi_busy_loop will still have a budget of 8, but the for the XDP socket busy poll support, we need to have a configurable budget that is usually larger since each packet requires less processing than with an AF_INET socket. Signed-off-by: Magnus Karlsson --- fs/eventpoll.c | 5 ++++- include/net/busy_poll.h | 7 +++++-- net/core/dev.c | 21 ++++++++++----------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4a0e98d..0fbbc35 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -394,6 +394,8 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) return ep_events_available(ep) || busy_loop_timeout(start_time); } +#define BUSY_POLL_BUDGET 8 + /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. 
@@ -405,7 +407,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, + BUSY_POLL_BUDGET); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index ba61cdd..94817e8 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -55,7 +55,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg); + void *loop_end_arg, int budget); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -111,13 +111,16 @@ static inline bool sk_busy_loop_timeout(struct sock *sk, return true; } +#define BUSY_POLL_BUDGET 8 + static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) - napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); + napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, + BUSY_POLL_BUDGET); #endif } diff --git a/net/core/dev.c b/net/core/dev.c index 22f2640..e82fc44 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6108,9 +6108,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 - -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + int budget) { int rc; @@ -6131,17 +6130,17 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) + if (rc == budget) __napi_schedule(napi); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, int budget) { unsigned long start_time = loop_end ? 
busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6178,8 +6177,8 @@ void napi_busy_loop(unsigned int napi_id, have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), @@ -6191,7 +6190,7 @@ void napi_busy_loop(unsigned int napi_id, if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6202,7 +6201,7 @@ void napi_busy_loop(unsigned int napi_id, cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, budget); preempt_enable(); out: rcu_read_unlock(); From patchwork Thu May 2 08:39:18 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Magnus Karlsson X-Patchwork-Id: 1094087 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Original-To: incoming-bpf@patchwork.ozlabs.org Delivered-To: patchwork-incoming-bpf@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=bpf-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=intel.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 44vpbn1d9Wz9sBV for ; Thu, 2 May 2019 18:39:49 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1725951AbfEBIjt (ORCPT ); Thu, 2 May 2019 04:39:49 -0400 Received: from mga02.intel.com ([134.134.136.20]:64457 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1725905AbfEBIjs (ORCPT ); Thu, 2 May 2019 04:39:48 -0400 X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga101.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 02 May 2019 01:39:48 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.60,421,1549958400"; d="scan'208";a="296322403" Received: from mkarlsso-mobl.ger.corp.intel.com (HELO VM.isw.intel.com) ([10.103.211.43]) by orsmga004.jf.intel.com with ESMTP; 02 May 2019 01:39:44 -0700 From: Magnus Karlsson To: magnus.karlsson@intel.com, bjorn.topel@intel.com, ast@kernel.org, daniel@iogearbox.net, netdev@vger.kernel.org, brouer@redhat.com Cc: bpf@vger.kernel.org, bruce.richardson@intel.com, ciara.loftus@intel.com, jakub.kicinski@netronome.com, xiaolong.ye@intel.com, qi.z.zhang@intel.com, maximmi@mellanox.com, sridhar.samudrala@intel.com, kevin.laatz@intel.com Subject: [RFC bpf-next 2/7] net: i40e: ixgbe: tun: veth: virtio-net: centralize xdp_rxq_info and add napi id Date: Thu, 2 May 2019 10:39:18 +0200 Message-Id: <1556786363-28743-3-git-send-email-magnus.karlsson@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> References: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> Sender: bpf-owner@vger.kernel.org Precedence: bulk List-Id: netdev.vger.kernel.org This patch centralizes the xdp_rxq_info struct to only reside in a single place and adds napi id to the information contained in it. 
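For a driver, the end result of this centralization looks roughly like the sketch below (a hedged illustration distilled from the i40e and ixgbe changes further down, not code taken from the patch itself; example_setup_rx is a made-up helper name and error handling is kept minimal):

#include <linux/netdevice.h>
#include <net/xdp.h>

static int example_setup_rx(struct net_device *netdev, u32 queue_index,
			    struct napi_struct *napi,
			    struct xdp_rxq_info **rxq)
{
	int err;

	/* Register the rxq info that now lives in the core _rx struct,
	 * keyed by netdev + queue index and carrying the napi id.
	 */
	err = xdp_rxq_info_reg(netdev, queue_index, napi->napi_id);
	if (err < 0)
		return err;

	err = xdp_rxq_info_reg_mem_model(netdev, queue_index,
					 MEM_TYPE_PAGE_SHARED, NULL);
	if (err < 0) {
		xdp_rxq_info_unreg(netdev, queue_index);
		return err;
	}

	/* The ring keeps only a pointer to the core-owned structure. */
	*rxq = xdp_rxq_info_get(netdev, queue_index);
	return 0;
}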
The reason to add napi id is that it is needed for the AF_XDP busy poll support. The xsk code needs to know what napi id to call when it gets a poll request on a socket that is bound to a specific queue id on a netdev. Previously, the xdp_req_info struct resided both in the _rx structure and in the driver. The one in the _rx structure was used for the XDP_SKB case and the one in the driver for the XDP_DRV case. With busy-poll, the request to execute the napi context always comes from the syscall path, never the driver path, so the xdp_rxq_info needs to reside in the _rx struct for both XDP_SKB and XDP_DRV. With this, there is no longer a need to have an extra copy in the driver that is only valid for the XDP_DRV case. This structure has been converted to a pointer reference to the xdp_rxq_info struct in the kernel instead, making the code smaller and simpler. NOTE: this patch needs to include moving over all drivers to the new interface. I only did a handful here to demonstrate the changes. When we agree on how to do it, I will move over all of them. Signed-off-by: Magnus Karlsson --- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 2 - drivers/net/ethernet/intel/i40e/i40e_main.c | 8 ++-- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 16 +++++--- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 2 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 36 +++++++++++------- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 +- drivers/net/tun.c | 14 +++---- drivers/net/veth.c | 10 ++--- drivers/net/virtio_net.c | 8 ++-- include/net/xdp.h | 13 ++++--- net/core/dev.c | 19 +--------- net/core/xdp.c | 51 +++++++++++++++++--------- 14 files changed, 102 insertions(+), 83 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 9eaea1b..dcb5144 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -2006,8 +2006,6 @@ static int i40e_set_ringparam(struct net_device *netdev, */ rx_rings[i].desc = NULL; rx_rings[i].rx_bi = NULL; - /* Clear cloned XDP RX-queue info before setup call */ - memset(&rx_rings[i].xdp_rxq, 0, sizeof(rx_rings[i].xdp_rxq)); /* this is to allow wr32 to have something to write to * during early allocation of Rx buffers */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 65c2b9d..763c48c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3238,7 +3238,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) memset(&rx_ctx, 0, sizeof(rx_ctx)); if (ring->vsi->type == I40E_VSI_MAIN) - xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + xdp_rxq_info_unreg_mem_model(ring->netdev, ring->queue_index); ring->xsk_umem = i40e_xsk_umem(ring); if (ring->xsk_umem) { @@ -3250,7 +3250,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) */ chain_len = 1; ring->zca.free = i40e_zca_free; - ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + ret = xdp_rxq_info_reg_mem_model(ring->netdev, + ring->queue_index, MEM_TYPE_ZERO_COPY, &ring->zca); if (ret) @@ -3262,7 +3263,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) } else { ring->rx_buf_len = vsi->rx_buf_len; if (ring->vsi->type == I40E_VSI_MAIN) { - ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + ret = xdp_rxq_info_reg_mem_model(ring->netdev, + ring->queue_index, MEM_TYPE_PAGE_SHARED, 
NULL); if (ret) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index e193170..74132ad 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1408,8 +1408,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) void i40e_free_rx_resources(struct i40e_ring *rx_ring) { i40e_clean_rx_ring(rx_ring); - if (rx_ring->vsi->type == I40E_VSI_MAIN) - xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + if (rx_ring->vsi->type == I40E_VSI_MAIN) { + xdp_rxq_info_unreg(rx_ring->vsi->netdev, rx_ring->queue_index); + rx_ring->xdp_rxq = NULL; + } rx_ring->xdp_prog = NULL; kfree(rx_ring->rx_bi); rx_ring->rx_bi = NULL; @@ -1460,15 +1462,19 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) /* XDP RX-queue info only needed for RX rings exposed to XDP */ if (rx_ring->vsi->type == I40E_VSI_MAIN) { - err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index); + err = xdp_rxq_info_reg(rx_ring->netdev, rx_ring->queue_index, + rx_ring->q_vector->napi.napi_id); if (err < 0) goto err; + + rx_ring->xdp_rxq = xdp_rxq_info_get(rx_ring->netdev, + rx_ring->queue_index); } rx_ring->xdp_prog = rx_ring->vsi->xdp_prog; return 0; + err: kfree(rx_ring->rx_bi); rx_ring->rx_bi = NULL; @@ -2335,7 +2341,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) bool failure = false; struct xdp_buff xdp; - xdp.rxq = &rx_ring->xdp_rxq; + xdp.rxq = rx_ring->xdp_rxq; while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *rx_buffer; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 100e92d..066f616 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -417,7 +417,7 @@ struct i40e_ring { */ struct i40e_channel *ch; - struct xdp_rxq_info xdp_rxq; + struct xdp_rxq_info *xdp_rxq; struct xdp_umem *xsk_umem; struct zero_copy_allocator zca; /* ZC allocator anchor */ } ____cacheline_internodealigned_in_smp; diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 1b17486..2eba2bc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -536,7 +536,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) struct sk_buff *skb; struct xdp_buff xdp; - xdp.rxq = &rx_ring->xdp_rxq; + xdp.rxq = rx_ring->xdp_rxq; while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *bi; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 08d85e3..ea320b9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -351,7 +351,7 @@ struct ixgbe_ring { struct ixgbe_tx_queue_stats tx_stats; struct ixgbe_rx_queue_stats rx_stats; }; - struct xdp_rxq_info xdp_rxq; + struct xdp_rxq_info *xdp_rxq; struct xdp_umem *xsk_umem; struct zero_copy_allocator zca; /* ZC allocator anchor */ u16 ring_idx; /* {rx,tx,xdp}_ring back reference idx */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 7b90320..3afb521 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2285,7 +2285,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, unsigned int xdp_xmit = 0; struct xdp_buff xdp; - xdp.rxq = &rx_ring->xdp_rxq; + xdp.rxq = rx_ring->xdp_rxq; 
while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; @@ -4066,17 +4066,19 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, u32 rxdctl; u8 reg_idx = ring->reg_idx; - xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + xdp_rxq_info_unreg_mem_model(ring->netdev, ring->queue_index); ring->xsk_umem = ixgbe_xsk_umem(adapter, ring); if (ring->xsk_umem) { ring->zca.free = ixgbe_zca_free; - WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + (void)xdp_rxq_info_reg_mem_model(ring->netdev, + ring->queue_index, MEM_TYPE_ZERO_COPY, - &ring->zca)); + &ring->zca); } else { - WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_PAGE_SHARED, NULL)); + (void)xdp_rxq_info_reg_mem_model(ring->netdev, + ring->queue_index, + MEM_TYPE_PAGE_SHARED, NULL); } /* disable queue to avoid use of these values while updating state */ @@ -6514,6 +6516,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, struct device *dev = rx_ring->dev; int orig_node = dev_to_node(dev); int ring_node = NUMA_NO_NODE; + int err = -ENOMEM; int size; size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; @@ -6527,6 +6530,14 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, if (!rx_ring->rx_buffer_info) goto err; + /* XDP RX-queue info */ + err = xdp_rxq_info_reg(adapter->netdev, rx_ring->queue_index, + rx_ring->q_vector->napi.napi_id); + if (err) + goto err; + rx_ring->xdp_rxq = xdp_rxq_info_get(rx_ring->netdev, + rx_ring->queue_index); + /* Round up to nearest 4K */ rx_ring->size = rx_ring->count * sizeof(union ixgbe_adv_rx_desc); rx_ring->size = ALIGN(rx_ring->size, 4096); @@ -6540,17 +6551,14 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, if (!rx_ring->desc) rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size, &rx_ring->dma, GFP_KERNEL); - if (!rx_ring->desc) + if (!rx_ring->desc) { + err = -ENOMEM; goto err; + } rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; - /* XDP RX-queue info */ - if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, - rx_ring->queue_index) < 0) - goto err; - rx_ring->xdp_prog = adapter->xdp_prog; return 0; @@ -6558,7 +6566,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, vfree(rx_ring->rx_buffer_info); rx_ring->rx_buffer_info = NULL; dev_err(dev, "Unable to allocate memory for the Rx descriptor ring\n"); - return -ENOMEM; + return err; } /** @@ -6648,7 +6656,7 @@ void ixgbe_free_rx_resources(struct ixgbe_ring *rx_ring) ixgbe_clean_rx_ring(rx_ring); rx_ring->xdp_prog = NULL; - xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + xdp_rxq_info_unreg(rx_ring->netdev, rx_ring->queue_index); vfree(rx_ring->rx_buffer_info); rx_ring->rx_buffer_info = NULL; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index bfe95ce..9c10c93 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -487,7 +487,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, struct sk_buff *skb; struct xdp_buff xdp; - xdp.rxq = &rx_ring->xdp_rxq; + xdp.rxq = rx_ring->xdp_rxq; while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9d72f8c..b05c239 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -726,7 +726,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean) unregister_netdevice(tun->dev); } if (tun) - xdp_rxq_info_unreg(&tfile->xdp_rxq); + xdp_rxq_info_unreg(tun->dev, tfile->queue_index); ptr_ring_cleanup(&tfile->tx_ring, 
tun_ptr_free); sock_put(&tfile->sk); } @@ -774,13 +774,13 @@ static void tun_detach_all(struct net_device *dev) tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); - xdp_rxq_info_unreg(&tfile->xdp_rxq); + xdp_rxq_info_unreg(dev, tfile->queue_index); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_enable_queue(tfile); tun_queue_purge(tfile); - xdp_rxq_info_unreg(&tfile->xdp_rxq); + xdp_rxq_info_unreg(dev, tfile->queue_index); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); @@ -842,14 +842,14 @@ static int tun_attach(struct tun_struct *tun, struct file *file, tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ - err = xdp_rxq_info_reg(&tfile->xdp_rxq, - tun->dev, tfile->queue_index); + err = xdp_rxq_info_reg(tun->dev, tfile->queue_index, + tfile->napi.napi_id); if (err < 0) goto out; - err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, + err = xdp_rxq_info_reg_mem_model(dev, tfile->queue_index, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { - xdp_rxq_info_unreg(&tfile->xdp_rxq); + xdp_rxq_info_unreg(dev, tfile->queue_index); goto out; } err = 0; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 09a1433..a5bc608a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -817,11 +817,11 @@ static int veth_enable_xdp(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) { struct veth_rq *rq = &priv->rq[i]; - err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); + err = xdp_rxq_info_reg(dev, i, rq->xdp_napi.napi_id); if (err < 0) goto err_rxq_reg; - err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + err = xdp_rxq_info_reg_mem_model(dev, i, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) @@ -841,10 +841,10 @@ static int veth_enable_xdp(struct net_device *dev) return 0; err_reg_mem: - xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); + xdp_rxq_info_unreg(dev, i); err_rxq_reg: for (i--; i >= 0; i--) - xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); + xdp_rxq_info_unreg(dev, i); return err; } @@ -861,7 +861,7 @@ static void veth_disable_xdp(struct net_device *dev) struct veth_rq *rq = &priv->rq[i]; rq->xdp_rxq.mem = rq->xdp_mem; - xdp_rxq_info_unreg(&rq->xdp_rxq); + xdp_rxq_info_unreg(dev, i); } } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 559c48e6..f15b3d5 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1469,14 +1469,14 @@ static int virtnet_open(struct net_device *dev) if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); - err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(dev, i, vi->rq[i].napi.napi_id); if (err < 0) return err; - err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq, + err = xdp_rxq_info_reg_mem_model(dev, i, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { - xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); + xdp_rxq_info_unreg(dev, i); return err; } @@ -1817,7 +1817,7 @@ static int virtnet_close(struct net_device *dev) cancel_delayed_work_sync(&vi->refill); for (i = 0; i < vi->max_queue_pairs; i++) { - xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); + xdp_rxq_info_unreg(dev, i); napi_disable(&vi->rq[i].napi); virtnet_napi_tx_disable(&vi->sq[i].napi); } diff --git a/include/net/xdp.h b/include/net/xdp.h index 0f25b36..d5fb5c0 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -60,6 +60,7 @@ struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; + unsigned int napi_id; struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid 
false-sharing */ @@ -129,14 +130,16 @@ void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); -int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index); -void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); +void xdp_rxq_info_init(struct net_device *dev, u32 queue_index); +int xdp_rxq_info_reg(struct net_device *dev, u32 queue_index, + unsigned int napi_id); +void xdp_rxq_info_unreg(struct net_device *net, u32 queue_index); +struct xdp_rxq_info *xdp_rxq_info_get(struct net_device *dev, u32 queue_index); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); -int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, +int xdp_rxq_info_reg_mem_model(struct net_device *dev, u32 queue_index, enum xdp_mem_type type, void *allocator); -void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); +void xdp_rxq_info_unreg_mem_model(struct net_device *dev, u32 queue_index); /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. diff --git a/net/core/dev.c b/net/core/dev.c index e82fc44..0d6b3ed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4383,6 +4383,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, rxqueue = netif_get_rxqueue(skb); xdp->rxq = &rxqueue->xdp_rxq; + xdp->rxq->napi_id = skb->napi_id; act = bpf_prog_run_xdp(xdp_prog, xdp); @@ -8530,7 +8531,6 @@ static int netif_alloc_rx_queues(struct net_device *dev) unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); - int err = 0; BUG_ON(count < 1); @@ -8544,32 +8544,17 @@ static int netif_alloc_rx_queues(struct net_device *dev) rx[i].dev = dev; /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); - if (err < 0) - goto err_rxq_info; + xdp_rxq_info_init(dev, i); } return 0; - -err_rxq_info: - /* Rollback successful reg's and free other resources */ - while (i--) - xdp_rxq_info_unreg(&rx[i].xdp_rxq); - kvfree(dev->_rx); - dev->_rx = NULL; - return err; } static void netif_free_rx_queues(struct net_device *dev) { - unsigned int i, count = dev->num_rx_queues; - /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ if (!dev->_rx) return; - for (i = 0; i < count; i++) - xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); - kvfree(dev->_rx); } diff --git a/net/core/xdp.c b/net/core/xdp.c index 4b2b194..ed691f9 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -94,8 +94,9 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) kfree(xa); } -void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +void xdp_rxq_info_unreg_mem_model(struct net_device *dev, u32 queue_index) { + struct xdp_rxq_info *xdp_rxq = &dev->_rx[queue_index].xdp_rxq; struct xdp_mem_allocator *xa; int id = xdp_rxq->mem.id; @@ -122,18 +123,33 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); -void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) +static void _xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) { + memset(xdp_rxq, 0, sizeof(*xdp_rxq)); +} + +void xdp_rxq_info_init(struct net_device *dev, u32 queue_index) +{ + struct xdp_rxq_info *xdp_rxq = &dev->_rx[queue_index].xdp_rxq; + + _xdp_rxq_info_init(xdp_rxq); + xdp_rxq->dev = dev; + xdp_rxq->queue_index = queue_index; +} + +void xdp_rxq_info_unreg(struct net_device *dev, u32 queue_index) +{ + 
struct xdp_rxq_info *xdp_rxq = &dev->_rx[queue_index].xdp_rxq; + /* Simplify driver cleanup code paths, allow unreg "unused" */ if (xdp_rxq->reg_state == REG_STATE_UNUSED) return; WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); - xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq_info_unreg_mem_model(dev, queue_index); xdp_rxq->reg_state = REG_STATE_UNREGISTERED; - xdp_rxq->dev = NULL; /* Reset mem info to defaults */ xdp_rxq->mem.id = 0; @@ -141,15 +157,12 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); -static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) -{ - memset(xdp_rxq, 0, sizeof(*xdp_rxq)); -} - /* Returns 0 on success, negative on failure */ -int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index) +int xdp_rxq_info_reg(struct net_device *dev, u32 queue_index, + unsigned int napi_id) { + struct xdp_rxq_info *xdp_rxq = &dev->_rx[queue_index].xdp_rxq; + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); return -EINVAL; @@ -157,7 +170,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { WARN(1, "Missing unregister, handled but fix driver"); - xdp_rxq_info_unreg(xdp_rxq); + xdp_rxq_info_unreg(dev, queue_index); } if (!dev) { @@ -166,15 +179,18 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, } /* State either UNREGISTERED or NEW */ - xdp_rxq_info_init(xdp_rxq); - xdp_rxq->dev = dev; - xdp_rxq->queue_index = queue_index; - + xdp_rxq->napi_id = napi_id; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); +struct xdp_rxq_info *xdp_rxq_info_get(struct net_device *dev, u32 queue_index) +{ + return &dev->_rx[queue_index].xdp_rxq; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_get); + void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) { xdp_rxq->reg_state = REG_STATE_UNUSED; @@ -249,9 +265,10 @@ static bool __is_supported_mem_type(enum xdp_mem_type type) return true; } -int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, +int xdp_rxq_info_reg_mem_model(struct net_device *dev, u32 queue_index, enum xdp_mem_type type, void *allocator) { + struct xdp_rxq_info *xdp_rxq = &dev->_rx[queue_index].xdp_rxq; struct xdp_mem_allocator *xdp_alloc; gfp_t gfp = GFP_KERNEL; int id, errno, ret; From patchwork Thu May 2 08:39:20 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Magnus Karlsson X-Patchwork-Id: 1094089 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Original-To: incoming-bpf@patchwork.ozlabs.org Delivered-To: patchwork-incoming-bpf@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=bpf-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=intel.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 44vpbx0lMDz9sBV for ; Thu, 2 May 2019 18:39:57 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726486AbfEBIj4 (ORCPT ); Thu, 2 May 2019 04:39:56 -0400 Received: from mga02.intel.com ([134.134.136.20]:64537 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726482AbfEBIj4 (ORCPT ); Thu, 2 May 2019 04:39:56 -0400 X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: 
False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga101.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 02 May 2019 01:39:56 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.60,421,1549958400"; d="scan'208";a="296322421" Received: from mkarlsso-mobl.ger.corp.intel.com (HELO VM.isw.intel.com) ([10.103.211.43]) by orsmga004.jf.intel.com with ESMTP; 02 May 2019 01:39:52 -0700 From: Magnus Karlsson To: magnus.karlsson@intel.com, bjorn.topel@intel.com, ast@kernel.org, daniel@iogearbox.net, netdev@vger.kernel.org, brouer@redhat.com Cc: bpf@vger.kernel.org, bruce.richardson@intel.com, ciara.loftus@intel.com, jakub.kicinski@netronome.com, xiaolong.ye@intel.com, qi.z.zhang@intel.com, maximmi@mellanox.com, sridhar.samudrala@intel.com, kevin.laatz@intel.com Subject: [RFC bpf-next 4/7] netdevice: introduce busy-poll setsockopt for AF_XDP Date: Thu, 2 May 2019 10:39:20 +0200 Message-Id: <1556786363-28743-5-git-send-email-magnus.karlsson@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> References: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> Sender: bpf-owner@vger.kernel.org Precedence: bulk List-Id: netdev.vger.kernel.org This patch introduces a new setsockopt that enables busy-poll for XDP sockets. It is called XDP_BUSY_POLL_BATCH_SIZE and takes batch size as an argument. A value between 1 and NAPI_WEIGHT (64) will turn it on, 0 will turn it off and any other value will return an error. There is also a corresponding getsockopt implementation. Signed-off-by: Magnus Karlsson --- include/uapi/linux/if_xdp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index caed8b1..be28a78 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -46,6 +46,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_FILL_RING 5 #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 +#define XDP_BUSY_POLL_BATCH_SIZE 8 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ From patchwork Thu May 2 08:39:21 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Magnus Karlsson X-Patchwork-Id: 1094097 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Original-To: incoming-bpf@patchwork.ozlabs.org Delivered-To: patchwork-incoming-bpf@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=bpf-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=intel.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 44vphg6k14z9sBV for ; Thu, 2 May 2019 18:44:03 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1725951AbfEBIoD (ORCPT ); Thu, 2 May 2019 04:44:03 -0400 Received: from mga02.intel.com ([134.134.136.20]:64537 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726482AbfEBIoC (ORCPT ); Thu, 2 May 2019 04:44:02 -0400 X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga101.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 02 May 2019 01:40:00 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.60,421,1549958400"; d="scan'208";a="296322445" Received: from mkarlsso-mobl.ger.corp.intel.com (HELO 
VM.isw.intel.com) ([10.103.211.43]) by orsmga004.jf.intel.com with ESMTP; 02 May 2019 01:39:56 -0700 From: Magnus Karlsson To: magnus.karlsson@intel.com, bjorn.topel@intel.com, ast@kernel.org, daniel@iogearbox.net, netdev@vger.kernel.org, brouer@redhat.com Cc: bpf@vger.kernel.org, bruce.richardson@intel.com, ciara.loftus@intel.com, jakub.kicinski@netronome.com, xiaolong.ye@intel.com, qi.z.zhang@intel.com, maximmi@mellanox.com, sridhar.samudrala@intel.com, kevin.laatz@intel.com Subject: [RFC bpf-next 5/7] net: add busy-poll support for XDP sockets Date: Thu, 2 May 2019 10:39:21 +0200 Message-Id: <1556786363-28743-6-git-send-email-magnus.karlsson@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> References: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> Sender: bpf-owner@vger.kernel.org Precedence: bulk List-Id: netdev.vger.kernel.org This patch adds busy-poll support for XDP sockets (AF_XDP). With busy-poll, the driver is executed in process context by calling the poll() syscall. The main advantage of this is that all processing occurs on a single core. This eliminates the core-to-core cache transfers that otherwise occur between the application and the ksoftirqd processing on another core. From a systems point of view, it also has the advantage that we do not have to provision extra cores in the system to handle ksoftirqd/softirq processing, as all processing is done on the single core that executes the application. The drawback of busy-poll is that the max throughput seen from a single application will be lower (due to the syscall), but on a per-core basis it will often be higher, as the normal mode runs on two cores and busy-poll on a single one. The semantics of busy-poll from the application point of view are the following:
* The application is required to call poll() to drive rx and tx processing. There is no guarantee that softirqs and interrupts will do this for you.
* It should be enabled on a per-socket basis. There is no global enablement, i.e. XDP socket busy-poll does not care about the /proc/sys/net/core/busy_poll and busy_read global enablement mechanisms.
* The batch size (how many packets are processed every time the napi function in the driver is called, i.e. the weight parameter) should be configurable. Currently, the busy-poll budget of AF_INET sockets is set to 8, but for AF_XDP sockets this is too small as the amount of processing per packet is much smaller with AF_XDP. This should be configurable on a per-socket basis.
* If you put multiple AF_XDP busy-poll enabled sockets into a poll() call, the napi contexts of all of them should be executed. This is in contrast to AF_INET busy-poll, which quits after the first socket that finds any packets. We need all napi contexts to be executed due to the first requirement in this list. The behaviour we want is much more like regular sockets in that all of them are checked in the poll call.
* It should be possible to mix AF_XDP busy-poll sockets with any other sockets, including busy-poll AF_INET ones, in a single poll() call without any change to the semantics or behaviour of any of those socket types.
* As suggested by Maxim Mikityanskiy, poll() will in busy-poll mode return POLLERR if the fill ring is empty or the completion queue is full.
Busy-poll support is enabled by calling a new setsockopt called XDP_BUSY_POLL_BATCH_SIZE that takes the batch size as an argument.
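In raw syscall form this amounts to roughly the following (a hedged sketch; xsk_enable_busy_poll is a made-up helper name, and the libbpf patch later in this series issues the same call from xsk_socket__create()):

#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

static int xsk_enable_busy_poll(int xsk_fd, __u16 batch_size)
{
	/* batch_size is the napi budget used when this socket busy-polls;
	 * passing 0 turns busy-poll off again.
	 */
	return setsockopt(xsk_fd, SOL_XDP, XDP_BUSY_POLL_BATCH_SIZE,
			  &batch_size, sizeof(batch_size));
}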
A value between 1 and NAPI_WEIGHT (64) will turn it on, 0 will turn it off and any other value will return an error. A typical packet processing rxdrop loop with busy-poll will look something like this: for (i = 0; i < num_socks; i++) { fds[i].fd = xsk_socket__fd(xsks[i]->xsk); fds[i].events = POLLIN; } for (;;) { ret = poll(fds, num_socks, 0); if (ret <= 0) continue; for (i = 0; i < num_socks; i++) rx_drop(xsks[i], fds); /* The actual application */ } Signed-off-by: Magnus Karlsson --- include/net/xdp_sock.h | 3 ++ net/xdp/Kconfig | 1 + net/xdp/xsk.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.h | 18 ++++++-- 4 files changed, 138 insertions(+), 6 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d074b6d..2e956b37 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -57,7 +57,10 @@ struct xdp_sock { struct net_device *dev; struct xdp_umem *umem; struct list_head flush_node; + unsigned int napi_id_rx; + unsigned int napi_id_tx; u16 queue_id; + u16 bp_batch_size; struct xsk_queue *tx ____cacheline_aligned_in_smp; struct list_head list; bool zc; diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig index 0255b33..219baaa 100644 --- a/net/xdp/Kconfig +++ b/net/xdp/Kconfig @@ -1,6 +1,7 @@ config XDP_SOCKETS bool "XDP sockets" depends on BPF_SYSCALL + select NET_RX_BUSY_POLL default n help XDP sockets allows a channel between XDP programs and diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a14e886..bd3d0fe 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -302,16 +303,107 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } +static unsigned int xsk_check_rx_poll_err(struct xdp_sock *xs) +{ + return xskq_consumer_empty(xs->umem->fq) ? POLLERR : 0; +} + +static unsigned int xsk_check_tx_poll_err(struct xdp_sock *xs) +{ + return xskq_producer_full(xs->umem->cq) ? POLLERR : 0; +} + +static bool xsk_busy_loop_end(void *p, unsigned long start_time) +{ + return true; +} + +static unsigned int xsk_get_napi_id_rx(struct xdp_sock *xs) +{ + if (xs->napi_id_rx) + return xs->napi_id_rx; + if (xs->dev->_rx[xs->queue_id].xdp_rxq.napi_id) { + xs->napi_id_rx = xdp_rxq_info_get(xs->dev, + xs->queue_id)->napi_id; + return xs->napi_id_rx; + } + + WARN_ON_ONCE(true); + return 0; +} + +static unsigned int xsk_get_napi_id_tx(struct xdp_sock *xs) +{ + if (xs->napi_id_tx) + return xs->napi_id_tx; + if (xs->dev->_tx[xs->queue_id].xdp_txq.napi_id) { + xs->napi_id_tx = xdp_txq_info_get(xs->dev, + xs->queue_id)->napi_id; + return xs->napi_id_tx; + } + + WARN_ON_ONCE(true); + return 0; +} + +static void xsk_exec_poll_generic(struct sock *sk, struct xdp_sock *xs, + __poll_t events) +{ + if (events & (POLLIN | POLLRDNORM)) + /* NAPI id filled in by the generic XDP code */ + napi_busy_loop(xsk_get_napi_id_rx(xs), xsk_busy_loop_end, NULL, + xs->bp_batch_size); + if (events & (POLLOUT | POLLWRNORM)) + /* Use the regular send path as we do not have any + * NAPI id for the Tx path. It is only in the driver + * and not communicated upwards in the skb case. 
+ */ + xsk_generic_xmit(sk, NULL, 0); +} + +static void xsk_exec_poll_zc(struct xdp_sock *xs, __poll_t events) +{ + unsigned int napi_id_rx = xsk_get_napi_id_rx(xs); + unsigned int napi_id_tx = xsk_get_napi_id_tx(xs); + + if (events & (POLLIN | POLLRDNORM)) + napi_busy_loop(xs->napi_id_rx, xsk_busy_loop_end, NULL, + xs->bp_batch_size); + if (napi_id_rx != napi_id_tx) + if (events & (POLLOUT | POLLWRNORM)) + /* Tx has its own napi so we need to call it too */ + napi_busy_loop(xs->napi_id_tx, xsk_busy_loop_end, NULL, + xs->bp_batch_size); +} + static unsigned int xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + __poll_t events = poll_requested_events(wait); + + if (xs->bp_batch_size) { + if (xs->zc) + xsk_exec_poll_zc(xs, events); + else + xsk_exec_poll_generic(sk, xs, events); + + if (events & (POLLIN | POLLRDNORM)) + mask |= xsk_check_rx_poll_err(xs); + if (events & (POLLOUT | POLLWRNORM)) + mask |= xsk_check_tx_poll_err(xs); + + /* Clear the busy_loop flag so that any further fds in + * the pollfd struct will have their napis scheduled. + */ + mask &= ~POLL_BUSY_LOOP; + } - if (xs->rx && !xskq_empty_desc(xs->rx)) + if (xs->rx && !xskq_producer_empty(xs->rx)) mask |= POLLIN | POLLRDNORM; - if (xs->tx && !xskq_full_desc(xs->tx)) + if (xs->tx && !xskq_consumer_full(xs->tx)) mask |= POLLOUT | POLLWRNORM; return mask; @@ -572,6 +664,21 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return err; } + case XDP_BUSY_POLL_BATCH_SIZE: + { + u16 batch_size; + + if (copy_from_user(&batch_size, optval, sizeof(batch_size))) + return -EFAULT; + + if (batch_size == 0 || batch_size > NAPI_POLL_WEIGHT) + return -EINVAL; + + mutex_lock(&xs->mutex); + xs->bp_batch_size = batch_size; + mutex_unlock(&xs->mutex); + return 0; + } default: break; } @@ -644,6 +751,17 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, return 0; } + case XDP_BUSY_POLL_BATCH_SIZE: + if (len < sizeof(xs->bp_batch_size)) + return -EINVAL; + + if (copy_to_user(optval, &xs->bp_batch_size, + sizeof(xs->bp_batch_size))) + return -EFAULT; + if (put_user(sizeof(xs->bp_batch_size), optlen)) + return -EFAULT; + + return 0; default: break; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 88b9ae2..ebbd996 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -292,14 +292,24 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) WRITE_ONCE(q->ring->producer, q->prod_tail); } -static inline bool xskq_full_desc(struct xsk_queue *q) +static inline bool xskq_consumer_full(struct xsk_queue *q) { - return xskq_nb_avail(q, q->nentries) == q->nentries; + return READ_ONCE(q->ring->producer) - q->cons_tail == q->nentries; } -static inline bool xskq_empty_desc(struct xsk_queue *q) +static inline bool xskq_producer_empty(struct xsk_queue *q) { - return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; + return READ_ONCE(q->ring->consumer) == q->prod_tail; +} + +static inline bool xskq_consumer_empty(struct xsk_queue *q) +{ + return READ_ONCE(q->ring->producer) == q->cons_tail; +} + +static inline bool xskq_producer_full(struct xsk_queue *q) +{ + return q->prod_tail - READ_ONCE(q->ring->consumer) == q->nentries; } void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); From patchwork Thu May 2 08:39:22 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 
Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Magnus Karlsson X-Patchwork-Id: 1094091 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Original-To: incoming-bpf@patchwork.ozlabs.org Delivered-To: patchwork-incoming-bpf@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=bpf-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=intel.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 44vpc51Hmtz9sBV for ; Thu, 2 May 2019 18:40:05 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726510AbfEBIkE (ORCPT ); Thu, 2 May 2019 04:40:04 -0400 Received: from mga02.intel.com ([134.134.136.20]:64550 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1725944AbfEBIkE (ORCPT ); Thu, 2 May 2019 04:40:04 -0400 X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga101.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 02 May 2019 01:40:04 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.60,421,1549958400"; d="scan'208";a="296322500" Received: from mkarlsso-mobl.ger.corp.intel.com (HELO VM.isw.intel.com) ([10.103.211.43]) by orsmga004.jf.intel.com with ESMTP; 02 May 2019 01:40:00 -0700 From: Magnus Karlsson To: magnus.karlsson@intel.com, bjorn.topel@intel.com, ast@kernel.org, daniel@iogearbox.net, netdev@vger.kernel.org, brouer@redhat.com Cc: bpf@vger.kernel.org, bruce.richardson@intel.com, ciara.loftus@intel.com, jakub.kicinski@netronome.com, xiaolong.ye@intel.com, qi.z.zhang@intel.com, maximmi@mellanox.com, sridhar.samudrala@intel.com, kevin.laatz@intel.com Subject: [RFC bpf-next 6/7] libbpf: add busy-poll support to XDP sockets Date: Thu, 2 May 2019 10:39:22 +0200 Message-Id: <1556786363-28743-7-git-send-email-magnus.karlsson@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> References: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> Sender: bpf-owner@vger.kernel.org Precedence: bulk List-Id: netdev.vger.kernel.org This patch adds busy-poll support for XDP sockets to libbpf. A new option is provided in the xsk_socket_config struct called busy_poll. The value of it is the desired batch size. A value between 1 and NAPI_WEIGHT (64) will turn it on, 0 will turn it off and any other value will return an error. 
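From an application's point of view, this is used roughly as in the sketch below (a hedged illustration; the interface name, queue id and the already created umem and rings are placeholders, 16 is just an example batch size, and the header path may differ depending on how libbpf is installed):

#include <bpf/xsk.h>

static int create_busy_poll_socket(struct xsk_umem *umem,
				   struct xsk_ring_cons *rx,
				   struct xsk_ring_prod *tx,
				   struct xsk_socket **xsk)
{
	struct xsk_socket_config cfg = {
		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		.busy_poll = 16,	/* 1..NAPI_WEIGHT enables busy-poll */
	};

	/* Remaining config fields are left at their zero defaults. */
	return xsk_socket__create(xsk, "eth0", 0, umem, rx, tx, &cfg);
}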
Signed-off-by: Magnus Karlsson --- tools/include/uapi/linux/if_xdp.h | 1 + tools/lib/bpf/xsk.c | 23 +++++++++++++---------- tools/lib/bpf/xsk.h | 1 + 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index caed8b1..be28a78 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -46,6 +46,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_FILL_RING 5 #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 +#define XDP_BUSY_POLL_BATCH_SIZE 8 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 557ef8d..b5538f1 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -120,10 +120,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg, return; } - cfg->fill_size = usr_cfg->fill_size; - cfg->comp_size = usr_cfg->comp_size; - cfg->frame_size = usr_cfg->frame_size; - cfg->frame_headroom = usr_cfg->frame_headroom; + memcpy(cfg, usr_cfg, sizeof(*usr_cfg)); } static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, @@ -135,18 +132,14 @@ static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, cfg->libbpf_flags = 0; cfg->xdp_flags = 0; cfg->bind_flags = 0; + cfg->busy_poll = 0; return 0; } if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD) return -EINVAL; - cfg->rx_size = usr_cfg->rx_size; - cfg->tx_size = usr_cfg->tx_size; - cfg->libbpf_flags = usr_cfg->libbpf_flags; - cfg->xdp_flags = usr_cfg->xdp_flags; - cfg->bind_flags = usr_cfg->bind_flags; - + memcpy(cfg, usr_cfg, sizeof(*usr_cfg)); return 0; } @@ -632,6 +625,16 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, } xsk->tx = tx; + if (xsk->config.busy_poll) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_BUSY_POLL_BATCH_SIZE, + &xsk->config.busy_poll, + sizeof(xsk->config.busy_poll)); + if (err) { + err = -errno; + goto out_mmap_tx; + } + } + sxdp.sxdp_family = PF_XDP; sxdp.sxdp_ifindex = xsk->ifindex; sxdp.sxdp_queue_id = xsk->queue_id; diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index 82ea71a..517a56a 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -187,6 +187,7 @@ struct xsk_socket_config { __u32 libbpf_flags; __u32 xdp_flags; __u16 bind_flags; + __u16 busy_poll; }; /* Set config to NULL to get the default configuration. 
*/ From patchwork Thu May 2 08:39:23 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Magnus Karlsson X-Patchwork-Id: 1094093 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Original-To: incoming-bpf@patchwork.ozlabs.org Delivered-To: patchwork-incoming-bpf@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=bpf-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=intel.com Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 44vpc91ww4z9sBV for ; Thu, 2 May 2019 18:40:09 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726264AbfEBIkJ (ORCPT ); Thu, 2 May 2019 04:40:09 -0400 Received: from mga02.intel.com ([134.134.136.20]:64550 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1725944AbfEBIkI (ORCPT ); Thu, 2 May 2019 04:40:08 -0400 X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga101.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 02 May 2019 01:40:08 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.60,421,1549958400"; d="scan'208";a="296322531" Received: from mkarlsso-mobl.ger.corp.intel.com (HELO VM.isw.intel.com) ([10.103.211.43]) by orsmga004.jf.intel.com with ESMTP; 02 May 2019 01:40:04 -0700 From: Magnus Karlsson To: magnus.karlsson@intel.com, bjorn.topel@intel.com, ast@kernel.org, daniel@iogearbox.net, netdev@vger.kernel.org, brouer@redhat.com Cc: bpf@vger.kernel.org, bruce.richardson@intel.com, ciara.loftus@intel.com, jakub.kicinski@netronome.com, xiaolong.ye@intel.com, qi.z.zhang@intel.com, maximmi@mellanox.com, sridhar.samudrala@intel.com, kevin.laatz@intel.com Subject: [RFC bpf-next 7/7] samples/bpf: add busy-poll support to xdpsock sample Date: Thu, 2 May 2019 10:39:23 +0200 Message-Id: <1556786363-28743-8-git-send-email-magnus.karlsson@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> References: <1556786363-28743-1-git-send-email-magnus.karlsson@intel.com> Sender: bpf-owner@vger.kernel.org Precedence: bulk List-Id: netdev.vger.kernel.org This patch adds busy-poll support to the xdpsock sample application. It is enabled by the "-b" or the "--busy-poll" command line options. 
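For example, the rxdrop benchmark with busy-poll on queue 0 of an interface can then be started roughly as "./xdpsock -i <interface> -q 0 -r -b" (assuming the pre-existing -r/--rxdrop option; as the option parsing below shows, -b also implies -p, i.e. the poll() syscall path).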
Signed-off-by: Magnus Karlsson --- samples/bpf/xdpsock_user.c | 203 ++++++++++++++++++++++++++++----------------- 1 file changed, 125 insertions(+), 78 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index d08ee1a..1272edf 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -66,6 +66,7 @@ static const char *opt_if = ""; static int opt_ifindex; static int opt_queue; static int opt_poll; +static int opt_busy_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags; static __u32 prog_id; @@ -119,8 +120,11 @@ static void print_benchmark(bool running) else printf(" "); - if (opt_poll) + if (opt_poll) { + if (opt_busy_poll) + printf("busy-"); printf("poll() "); + } if (running) { printf("running..."); @@ -306,7 +310,7 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem) xsk->umem = umem; cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - cfg.libbpf_flags = 0; + cfg.busy_poll = (opt_busy_poll ? BATCH_SIZE : 0); cfg.xdp_flags = opt_xdp_flags; cfg.bind_flags = opt_xdp_bind_flags; ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, @@ -319,17 +323,17 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem) exit_with_error(-ret); ret = xsk_ring_prod__reserve(&xsk->umem->fq, - XSK_RING_PROD__DEFAULT_NUM_DESCS, + 1024, &idx); - if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) + if (ret != 1024) exit_with_error(-ret); for (i = 0; - i < XSK_RING_PROD__DEFAULT_NUM_DESCS * + i < 1024 * XSK_UMEM__DEFAULT_FRAME_SIZE; i += XSK_UMEM__DEFAULT_FRAME_SIZE) *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i; xsk_ring_prod__submit(&xsk->umem->fq, - XSK_RING_PROD__DEFAULT_NUM_DESCS); + 1024); return xsk; } @@ -341,6 +345,7 @@ static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"queue", required_argument, 0, 'q'}, {"poll", no_argument, 0, 'p'}, + {"busy-poll", no_argument, 0, 'b'}, {"xdp-skb", no_argument, 0, 'S'}, {"xdp-native", no_argument, 0, 'N'}, {"interval", required_argument, 0, 'n'}, @@ -360,6 +365,7 @@ static void usage(const char *prog) " -i, --interface=n Run on interface n\n" " -q, --queue=n Use queue n (default 0)\n" " -p, --poll Use poll syscall\n" + " -b, --busy-poll Use poll syscall with busy poll\n" " -S, --xdp-skb=n Use XDP skb-mod\n" " -N, --xdp-native=n Enfore XDP native mode\n" " -n, --interval=n Specify statistics update interval (default 1 sec).\n" @@ -377,7 +383,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:psSNn:cz", long_options, + c = getopt_long(argc, argv, "Frtli:q:pbsSNn:cz", long_options, &option_index); if (c == -1) break; @@ -401,6 +407,10 @@ static void parse_command_line(int argc, char **argv) case 'p': opt_poll = 1; break; + case 'b': + opt_busy_poll = 1; + opt_poll = 1; + break; case 'S': opt_xdp_flags |= XDP_FLAGS_SKB_MODE; opt_xdp_bind_flags |= XDP_COPY; @@ -444,7 +454,8 @@ static void kick_tx(struct xsk_socket_info *xsk) exit_with_error(errno); } -static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) +static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, + struct pollfd *fds) { u32 idx_cq = 0, idx_fq = 0; unsigned int rcvd; @@ -453,7 +464,8 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) if (!xsk->outstanding_tx) return; - kick_tx(xsk); + if (!opt_poll) + kick_tx(xsk); ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? 
BATCH_SIZE : xsk->outstanding_tx; @@ -467,6 +479,8 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); + if (opt_busy_poll) + ret = poll(fds, num_socks, 0); ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } @@ -490,7 +504,8 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk) if (!xsk->outstanding_tx) return; - kick_tx(xsk); + if (!opt_busy_poll) + kick_tx(xsk); rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx); if (rcvd > 0) { @@ -500,10 +515,10 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk) } } -static void rx_drop(struct xsk_socket_info *xsk) +static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) { - unsigned int rcvd, i; u32 idx_rx = 0, idx_fq = 0; + unsigned int rcvd, i; int ret; rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); @@ -514,6 +529,8 @@ static void rx_drop(struct xsk_socket_info *xsk) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); + if (opt_busy_poll) + ret = poll(fds, num_socks, 0); ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } @@ -533,43 +550,68 @@ static void rx_drop(struct xsk_socket_info *xsk) static void rx_drop_all(void) { - struct pollfd fds[MAX_SOCKS + 1]; - int i, ret, timeout, nfds = 1; + struct pollfd fds[MAX_SOCKS]; + int i, ret; memset(fds, 0, sizeof(fds)); for (i = 0; i < num_socks; i++) { fds[i].fd = xsk_socket__fd(xsks[i]->xsk); fds[i].events = POLLIN; - timeout = 1000; /* 1sn */ } for (;;) { if (opt_poll) { - ret = poll(fds, nfds, timeout); + ret = poll(fds, num_socks, 0); if (ret <= 0) continue; } for (i = 0; i < num_socks; i++) - rx_drop(xsks[i]); + rx_drop(xsks[i], fds); + } +} + +static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb) +{ + u32 idx; + + if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == + BATCH_SIZE) { + unsigned int i; + + for (i = 0; i < BATCH_SIZE; i++) { + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr + = (frame_nb + i) << + XSK_UMEM__DEFAULT_FRAME_SHIFT; + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = + sizeof(pkt_data) - 1; + } + + xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE); + xsk->outstanding_tx += BATCH_SIZE; + frame_nb += BATCH_SIZE; + frame_nb %= NUM_FRAMES; } + + complete_tx_only(xsk); } -static void tx_only(struct xsk_socket_info *xsk) +static void tx_only_all(void) { - int timeout, ret, nfds = 1; - struct pollfd fds[nfds + 1]; - u32 idx, frame_nb = 0; + struct pollfd fds[MAX_SOCKS]; + u32 frame_nb[MAX_SOCKS] = {}; + int i, ret; memset(fds, 0, sizeof(fds)); - fds[0].fd = xsk_socket__fd(xsk->xsk); - fds[0].events = POLLOUT; - timeout = 1000; /* 1sn */ + for (i = 0; i < num_socks; i++) { + fds[0].fd = xsk_socket__fd(xsks[i]->xsk); + fds[0].events = POLLOUT; + } for (;;) { if (opt_poll) { - ret = poll(fds, nfds, timeout); + ret = poll(fds, num_socks, 0); if (ret <= 0) continue; @@ -577,70 +619,75 @@ static void tx_only(struct xsk_socket_info *xsk) continue; } - if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == - BATCH_SIZE) { - unsigned int i; - - for (i = 0; i < BATCH_SIZE; i++) { - xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr - = (frame_nb + i) << - XSK_UMEM__DEFAULT_FRAME_SHIFT; - xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = - sizeof(pkt_data) - 1; - } - - xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE); - xsk->outstanding_tx += BATCH_SIZE; - frame_nb += BATCH_SIZE; - frame_nb %= NUM_FRAMES; - } - - complete_tx_only(xsk); + for (i = 0; i < num_socks; i++) + tx_only(xsks[i], frame_nb[i]); } } -static void 
l2fwd(struct xsk_socket_info *xsk) +static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) { - for (;;) { - unsigned int rcvd, i; - u32 idx_rx = 0, idx_tx = 0; - int ret; + unsigned int rcvd, i; + u32 idx_rx = 0, idx_tx = 0; + int ret; - for (;;) { - complete_tx_l2fwd(xsk); + complete_tx_l2fwd(xsk, fds); - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, - &idx_rx); - if (rcvd > 0) - break; - } + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, + &idx_rx); + if (!rcvd) + return; + ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + if (opt_busy_poll) + ret = poll(fds, num_socks, 0); ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); - } + } + + for (i = 0; i < rcvd; i++) { + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, + idx_rx)->addr; + u32 len = xsk_ring_cons__rx_desc(&xsk->rx, + idx_rx++)->len; + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - for (i = 0; i < rcvd; i++) { - u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, - idx_rx)->addr; - u32 len = xsk_ring_cons__rx_desc(&xsk->rx, - idx_rx++)->len; - char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); + swap_mac_addresses(pkt); - swap_mac_addresses(pkt); + hex_dump(pkt, len, addr); + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr; + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; + } - hex_dump(pkt, len, addr); - xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr; - xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; - } + xsk_ring_prod__submit(&xsk->tx, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); - xsk_ring_prod__submit(&xsk->tx, rcvd); - xsk_ring_cons__release(&xsk->rx, rcvd); + xsk->rx_npkts += rcvd; + xsk->outstanding_tx += rcvd; +} - xsk->rx_npkts += rcvd; - xsk->outstanding_tx += rcvd; +static void l2fwd_all(void) +{ + struct pollfd fds[MAX_SOCKS]; + int i, ret; + + memset(fds, 0, sizeof(fds)); + + for (i = 0; i < num_socks; i++) { + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); + fds[i].events = POLLOUT | POLLIN; + } + + for (;;) { + if (opt_poll) { + ret = poll(fds, num_socks, 0); + if (ret <= 0) + continue; + } + + for (i = 0; i < num_socks; i++) + l2fwd(xsks[i], fds); } } @@ -693,9 +740,9 @@ int main(int argc, char **argv) if (opt_bench == BENCH_RXDROP) rx_drop_all(); else if (opt_bench == BENCH_TXONLY) - tx_only(xsks[0]); + tx_only_all(); else - l2fwd(xsks[0]); + l2fwd_all(); return 0; }