@@ -1,5 +1,5 @@
OVS DPDK ADVANCED INSTALL GUIDE
-=================================
+===============================
## Contents
@@ -12,7 +12,8 @@ OVS DPDK ADVANCED INSTALL GUIDE
7. [QOS](#qos)
8. [Rate Limiting](#rl)
9. [Flow Control](#fc)
-10. [Vsperf](#vsperf)
+10. [Jumbo Frames](#jumbo)
+11. [Vsperf](#vsperf)
## <a name="overview"></a> 1. Overview
@@ -862,7 +863,58 @@ respective parameter. To disable the flow control at tx side,
`ovs-vsctl set Interface dpdk0 options:tx-flow-ctrl=false`
-## <a name="vsperf"></a> 10. Vsperf
+## <a name="jumbo"></a> 10. Jumbo Frames
+
+By default, DPDK ports are configured with standard Ethernet MTU (1500B). To
+enable Jumbo Frames support for a DPDK port, change the Interface's `mtu_request`
+attribute to a sufficiently large value.
+
+e.g. Add a DPDK Phy port with MTU of 9000:
+
+`ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk -- set Interface dpdk0 mtu_request=9000`
+
+e.g. Change the MTU of an existing port to 6200:
+
+`ovs-vsctl set Interface dpdk0 mtu_request=6200`
+
+When Jumbo Frames are enabled, the size of a DPDK port's mbuf segments are
+increased, such that a full Jumbo Frame of a specific size may be accommodated
+within a single mbuf segment.
+
+Jumbo frame support has been validated against 9728B frames (largest frame size
+supported by Fortville NIC), using the DPDK `i40e` driver, but larger frames
+(particularly in use cases involving East-West traffic only), and other DPDK NIC
+drivers may be supported.
+
+### 10.1 vHost Ports and Jumbo Frames
+
+Some additional configuration is needed to take advantage of jumbo frames with
+vhost ports:
+
+ 1. `mergeable buffers` must be enabled for vHost ports, as demonstrated in
+ the QEMU command line snippet below:
+
+ ```
+ '-netdev type=vhost-user,id=mynet1,chardev=char0,vhostforce \'
+ '-device virtio-net-pci,mac=00:00:00:00:00:01,netdev=mynet1,mrg_rxbuf=on'
+ ```
+
+ 2. Where virtio devices are bound to the Linux kernel driver in a guest
+ environment (i.e. interfaces are not bound to an in-guest DPDK driver),
+ the MTU of those logical network interfaces must also be increased to a
+ sufficiently large value. This avoids segmentation of Jumbo Frames
+ received in the guest. Note that 'MTU' refers to the length of the IP
+ packet only, and not that of the entire frame.
+
+ To calculate the exact MTU of a standard IPv4 frame, subtract the L2
+ header and CRC lengths (i.e. 18B) from the max supported frame size.
+ So, to set the MTU for a 9018B Jumbo Frame:
+
+ ```
+ ifconfig eth1 mtu 9000
+ ```
+
+## <a name="vsperf"></a> 11. Vsperf
Vsperf project goal is to develop vSwitch test framework that can be used to
validate the suitability of different vSwitch implementations in a Telco deployment
@@ -590,7 +590,6 @@ can be found in [Vhost Walkthrough].
## <a name="ovslimits"></a> 6. Limitations
- - Supports MTU size 1500, MTU setting for DPDK netdevs will be in future OVS release.
- Currently DPDK ports does not use HW offload functionality.
- Network Interface Firmware requirements:
Each release of DPDK is validated against a specific firmware version for
@@ -69,6 +69,7 @@ Post-v2.5.0
* Basic connection tracking for the userspace datapath (no ALG,
fragmentation or NAT support yet)
* Support for DPDK 16.07
+ * Jumbo frame support
- Increase number of registers to 16.
- ovs-benchmark: This utility has been removed due to lack of use and
bitrot.
@@ -81,6 +81,7 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+ sizeof(struct dp_packet) \
+ RTE_PKTMBUF_HEADROOM)
#define NETDEV_DPDK_MBUF_ALIGN 1024
+#define NETDEV_DPDK_MAX_PKT_LEN 9728
/* Max and min number of packets in the mempool. OVS tries to allocate a
* mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
@@ -368,6 +369,7 @@ struct netdev_dpdk {
/* The following properties cannot be changed when a device is running,
* so we remember the request and update them next time
* netdev_dpdk*_reconfigure() is called */
+ int requested_mtu;
int requested_n_txq;
int requested_n_rxq;
@@ -477,10 +479,19 @@ dpdk_mp_get(int socket_id, int mtu) OVS_REQUIRES(dpdk_mutex)
dmp->mtu = mtu;
dmp->refcount = 1;
mbp_priv.mbuf_data_room_size = MBUF_SIZE(mtu) - sizeof(struct dp_packet);
- mbp_priv.mbuf_priv_size = sizeof (struct dp_packet) -
- sizeof (struct rte_mbuf);
+ mbp_priv.mbuf_priv_size = sizeof (struct dp_packet)
+ - sizeof (struct rte_mbuf);
+ /* XXX: this is a really rough method of provisioning memory.
+ * It's impossible to determine what the exact memory requirements are when
+ * the number of ports and rxqs that utilize a particular mempool can change
+ * dynamically at runtime. For the moment, use this rough heuristic.
+ */
+ if (mtu >= ETHER_MTU) {
+ mp_size = MAX_NB_MBUF;
+ } else {
+ mp_size = MIN_NB_MBUF;
+ }
- mp_size = MAX_NB_MBUF;
do {
if (snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs_mp_%d_%d_%u",
dmp->mtu, dmp->socket_id, mp_size) < 0) {
@@ -520,6 +531,35 @@ dpdk_mp_put(struct dpdk_mp *dmp) OVS_REQUIRES(dpdk_mutex)
}
}
+/* Tries to allocate new mempool on requested_socket_id with
+ * mbuf size corresponding to requested_mtu.
+ * On success new configuration will be applied.
+ * On error, device will be left unchanged. */
+static int
+netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
+ OVS_REQUIRES(dpdk_mutex)
+ OVS_REQUIRES(dev->mutex)
+{
+ uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
+ struct dpdk_mp *mp;
+
+ mp = dpdk_mp_get(dev->requested_socket_id, FRAME_LEN_TO_MTU(buf_size));
+ if (!mp) {
+ VLOG_ERR("Insufficient memory to create memory pool for netdev "
+ "%s, with MTU %d on socket %d\n",
+ dev->up.name, dev->requested_mtu, dev->requested_socket_id);
+ return ENOMEM;
+ } else {
+ dpdk_mp_put(dev->dpdk_mp);
+ dev->dpdk_mp = mp;
+ dev->mtu = dev->requested_mtu;
+ dev->socket_id = dev->requested_socket_id;
+ dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
+ }
+
+ return 0;
+}
+
static void
check_link_status(struct netdev_dpdk *dev)
{
@@ -571,7 +611,15 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
{
int diag = 0;
int i;
+ struct rte_eth_conf conf = port_conf;
+ if (dev->mtu > ETHER_MTU) {
+ conf.rxmode.jumbo_frame = 1;
+ conf.rxmode.max_rx_pkt_len = dev->max_packet_len;
+ } else {
+ conf.rxmode.jumbo_frame = 0;
+ conf.rxmode.max_rx_pkt_len = 0;
+ }
/* A device may report more queues than it makes available (this has
* been observed for Intel xl710, which reserves some of them for
* SRIOV): rte_eth_*_queue_setup will fail if a queue is not
@@ -582,8 +630,10 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
}
- diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);
+ diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
if (diag) {
+ VLOG_WARN("Interface %s eth_dev setup error %s\n",
+ dev->up.name, rte_strerror(-diag));
break;
}
@@ -736,7 +786,6 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int sid;
int err = 0;
- uint32_t buf_size;
ovs_mutex_init(&dev->mutex);
ovs_mutex_lock(&dev->mutex);
@@ -757,15 +806,13 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
dev->port_id = port_no;
dev->type = type;
dev->flags = 0;
- dev->mtu = ETHER_MTU;
+ dev->requested_mtu = dev->mtu = ETHER_MTU;
dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
ovsrcu_index_init(&dev->vid, -1);
dev->vhost_reconfigured = false;
- buf_size = dpdk_buf_size(dev->mtu);
- dev->dpdk_mp = dpdk_mp_get(dev->socket_id, FRAME_LEN_TO_MTU(buf_size));
- if (!dev->dpdk_mp) {
- err = ENOMEM;
+ err = netdev_dpdk_mempool_configure(dev);
+ if (err) {
goto unlock;
}
@@ -991,6 +1038,7 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
+ smap_add_format(args, "mtu", "%d", dev->mtu);
ovs_mutex_unlock(&dev->mutex);
return 0;
@@ -1366,6 +1414,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
unsigned int total_pkts = cnt;
unsigned int qos_pkts = cnt;
+ unsigned int mtu_dropped = 0;
int retries = 0;
qid = dev->tx_q[qid % netdev->n_txq].map;
@@ -1387,25 +1436,41 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
do {
int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
unsigned int tx_pkts;
+ unsigned int try_tx_pkts = cnt;
+ for (unsigned int i = 0; i < cnt; i++) {
+ if (cur_pkts[i]->pkt_len > dev->max_packet_len) {
+ try_tx_pkts = i;
+ break;
+ }
+ }
+ if (!try_tx_pkts) {
+ cur_pkts++;
+ mtu_dropped++;
+ cnt--;
+ continue;
+ }
tx_pkts = rte_vhost_enqueue_burst(netdev_dpdk_get_vid(dev),
- vhost_qid, cur_pkts, cnt);
+ vhost_qid, cur_pkts, try_tx_pkts);
if (OVS_LIKELY(tx_pkts)) {
/* Packets have been sent.*/
cnt -= tx_pkts;
/* Prepare for possible retry.*/
cur_pkts = &cur_pkts[tx_pkts];
+ if (tx_pkts != try_tx_pkts) {
+ retries++;
+ }
} else {
/* No packets sent - do not retry.*/
break;
}
- } while (cnt && (retries++ < VHOST_ENQ_RETRY_NUM));
+ } while (cnt && (retries <= VHOST_ENQ_RETRY_NUM));
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
rte_spinlock_lock(&dev->stats_lock);
- cnt += qos_pkts;
- netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
+ netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
+ cnt + mtu_dropped + qos_pkts);
rte_spinlock_unlock(&dev->stats_lock);
out:
@@ -1639,6 +1704,27 @@ netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
}
static int
+netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
+{
+ struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+ if (MTU_TO_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
+ || mtu < ETHER_MIN_MTU) {
+ VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
+ return EINVAL;
+ }
+
+ ovs_mutex_lock(&dev->mutex);
+ if (dev->requested_mtu != mtu) {
+ dev->requested_mtu = mtu;
+ netdev_request_reconfigure(netdev);
+ }
+ ovs_mutex_unlock(&dev->mutex);
+
+ return 0;
+}
+
+static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
static int
@@ -2803,7 +2889,8 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
ovs_mutex_lock(&dev->mutex);
if (netdev->n_txq == dev->requested_n_txq
- && netdev->n_rxq == dev->requested_n_rxq) {
+ && netdev->n_rxq == dev->requested_n_rxq
+ && dev->mtu == dev->requested_mtu) {
/* Reconfiguration is unnecessary */
goto out;
@@ -2811,6 +2898,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
rte_eth_dev_stop(dev->port_id);
+ if (dev->mtu != dev->requested_mtu) {
+ netdev_dpdk_mempool_configure(dev);
+ }
+
netdev->n_txq = dev->requested_n_txq;
netdev->n_rxq = dev->requested_n_rxq;
@@ -2818,6 +2909,8 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
err = dpdk_eth_dev_init(dev);
netdev_dpdk_alloc_txq(dev, netdev->n_txq);
+ netdev_change_seq_changed(netdev);
+
out:
ovs_mutex_unlock(&dev->mutex);
@@ -2830,7 +2923,6 @@ static int
netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
- int err = 0;
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
@@ -2845,13 +2937,10 @@ netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev)
netdev_dpdk_remap_txqs(dev);
- if (dev->requested_socket_id != dev->socket_id) {
- dev->socket_id = dev->requested_socket_id;
- /* Change mempool to new NUMA Node */
- dpdk_mp_put(dev->dpdk_mp);
- dev->dpdk_mp = dpdk_mp_get(dev->socket_id, dev->mtu);
- if (!dev->dpdk_mp) {
- err = ENOMEM;
+ if (dev->requested_socket_id != dev->socket_id
+ || dev->requested_mtu != dev->mtu) {
+ if (!netdev_dpdk_mempool_configure(dev)) {
+ netdev_change_seq_changed(netdev);
}
}
@@ -2862,7 +2951,7 @@ netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev)
ovs_mutex_unlock(&dev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
- return err;
+ return 0;
}
static int
@@ -2876,6 +2965,12 @@ netdev_dpdk_vhost_cuse_reconfigure(struct netdev *netdev)
netdev->n_txq = dev->requested_n_txq;
netdev->n_rxq = 1;
+ if (dev->requested_mtu != dev->mtu) {
+ if (!netdev_dpdk_mempool_configure(dev)) {
+ netdev_change_seq_changed(netdev);
+ }
+ }
+
ovs_mutex_unlock(&dev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
@@ -2913,7 +3008,7 @@ netdev_dpdk_vhost_cuse_reconfigure(struct netdev *netdev)
netdev_dpdk_set_etheraddr, \
netdev_dpdk_get_etheraddr, \
netdev_dpdk_get_mtu, \
- NULL, /* set_mtu */ \
+ netdev_dpdk_set_mtu, \
netdev_dpdk_get_ifindex, \
GET_CARRIER, \
netdev_dpdk_get_carrier_resets, \