
[ovs-dev,RFC] tunneling: Improving vxlan performance using DPDK flow director feature.

Message ID 1458252015-52459-1-git-send-email-sugesh.chandran@intel.com
State RFC

Commit Message

Chandran, Sugesh March 17, 2016, 10 p.m. UTC
Optimizing vxlan tunneling performance in the userspace datapath using
the flow director feature of Fortville NIC DPDK ports. OVS uses metadata
reported by the NIC to improve flow lookup performance on VxLAN
packets.

Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
---
 lib/automake.mk      |   2 +
 lib/dpdk-i40e-ofld.c | 266 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/dpdk-i40e-ofld.h |  59 ++++++++++++
 lib/dpif-netdev.c    | 118 ++++++++++++++++++++++-
 lib/netdev-dpdk.c    |  41 +++++++-
 5 files changed, 481 insertions(+), 5 deletions(-)
 create mode 100644 lib/dpdk-i40e-ofld.c
 create mode 100644 lib/dpdk-i40e-ofld.h

Comments

Chandran, Sugesh March 17, 2016, 10:43 p.m. UTC | #1
Hi,

This patch proposes an approach that uses the Flow Director feature on Intel Fortville NICs to boost VxLAN tunneling performance. In our testing we verified that VxLAN performance is almost doubled with this patch.
The solution programs the NIC to report a flow ID along with VxLAN packets, and OVS matches that ID in software. There are corner cases that still need to be addressed in this approach. For example, there is a possible race condition in which the NIC reports a flow ID that matches a different flow in OVS: this happens when a rule is evicted by a new rule with the same flow ID and hash in the OVS software, so packets may hit the wrong (new) rule in OVS until the flow is also deleted in the hardware.
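
To make that check concrete, the sketch below shows roughly how the hardware-reported ID is validated before it is trusted (simplified, with an illustrative function name; the actual implementation is lookup_hw_offload_flow_for_fdirid() in the patch below). The flow ID programmed into the NIC is the hash of the OVS flow's ufid, so the ID on a received packet is trusted only when it matches the flow found in the EMC; otherwise the packet falls back to the normal software lookup:

    static const struct dp_netdev_flow *
    hw_id_guarded_emc_lookup(const struct dp_netdev_pmd_thread *pmd,
                             const struct rte_mbuf *mbuf, uint32_t hw_flow_id)
    {
        const struct emc_cache *cache = &pmd->flow_cache;
        struct emc_entry *entry;

        if (!(mbuf->ol_flags & PKT_RX_RSS_HASH)) {
            return NULL;                 /* No hash from the NIC: software path. */
        }
        EMC_FOR_EACH_POS_WITH_HASH (cache, entry, mbuf->hash.rss) {
            if (entry->key.hash == mbuf->hash.rss && emc_entry_alive(entry)) {
                if (hw_flow_id
                    && dp_netdev_flow_hash(&entry->flow->ufid) != hw_flow_id) {
                    /* Stale hardware rule: the NIC's ID refers to a flow that
                     * has been replaced in software, so ignore the hint. */
                    return NULL;
                }
                return entry->flow;
            }
        }
        return NULL;                     /* EMC miss: fall back to software. */
    }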

For now this is a hardware-specific implementation (it only works with Intel Fortville NICs), but the proposal applies to any programmable NIC. This RFC shows that OVS can offer very high tunneling performance by using flow programmability in NICs. I am looking for comments/suggestions on adding this support (configuration, enabling it for all programmable NICs, etc.) to the OVS userspace datapath to improve performance.

Regards
_Sugesh


> -----Original Message-----
> From: Chandran, Sugesh
> Sent: Thursday, March 17, 2016 10:00 PM
> To: dev@openvswitch.org
> Cc: Chandran, Sugesh <sugesh.chandran@intel.com>
> Subject: [RFC PATCH] tunneling: Improving vxlan performance using DPDK
> flow director feature.
> 
> Optimizing vxlan tunneling performance in userspace datapath using flow
> director feature in Fortville NIC DPDK ports. OVS uses metadata  reported by
> NIC to improve the flow lookup performance on VxLAN  packets.
> 
> Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
> ---
>  lib/automake.mk      |   2 +
>  lib/dpdk-i40e-ofld.c | 266
> +++++++++++++++++++++++++++++++++++++++++++++++++++
>  lib/dpdk-i40e-ofld.h |  59 ++++++++++++
>  lib/dpif-netdev.c    | 118 ++++++++++++++++++++++-
>  lib/netdev-dpdk.c    |  41 +++++++-
>  5 files changed, 481 insertions(+), 5 deletions(-)  create mode 100644
> lib/dpdk-i40e-ofld.c  create mode 100644 lib/dpdk-i40e-ofld.h
> 
> diff --git a/lib/automake.mk b/lib/automake.mk index 27a1669..da48479
> 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -366,6 +366,8 @@ endif
> 
>  if DPDK_NETDEV
>  lib_libopenvswitch_la_SOURCES += \
> +       lib/dpdk-i40e-ofld.c \
> +       lib/dpdk-i40e-ofld.h \
>         lib/netdev-dpdk.c \
>         lib/netdev-dpdk.h
>  endif
> diff --git a/lib/dpdk-i40e-ofld.c b/lib/dpdk-i40e-ofld.c new file mode 100644
> index 0000000..3ea7084
> --- /dev/null
> +++ b/lib/dpdk-i40e-ofld.c
> @@ -0,0 +1,266 @@
> +/*
> + * Copyright (c) 2016 Intel Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +
> +#include "dpdk-i40e-ofld.h"
> +#include "errno.h"
> +#include "ovs-thread.h"
> +#include "openvswitch/vlog.h"
> +#include "netdev-provider.h"
> +#include "rte_pci_dev_ids.h"
> +#include "rte_ethdev.h"
> +
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +VLOG_DEFINE_THIS_MODULE(dpdk_hw_ofld);
> +
> +#define VXLAN_DST_PORT          4789
> +#define VXLAN_HLEN                  50
> +#define MAX_FDIR_RULES          8000
> +
> +static uint32_t total_fdir_ids;
> +static struct ovs_mutex hw_ofld_mutex = OVS_MUTEX_INITIALIZER;
> +
> +/*
> + * Returns '0' if FDIR IDs reaches max limit. Only 8000 entries are
> + * supported in FVL.
> + */
> +static inline uint32_t
> +i40e_fdir_entry_cnt_inc(void)
> +{
> +    if (total_fdir_ids < MAX_FDIR_RULES) {
> +        ovs_mutex_lock(&hw_ofld_mutex);
> +        total_fdir_ids++;
> +        ovs_mutex_unlock(&hw_ofld_mutex);
> +        return (total_fdir_ids);
> +    }
> +    return 0;
> +}
> +
> +static inline void
> +i40e_fdir_entry_cnt_decr(void)
> +{
> +    ovs_mutex_lock(&hw_ofld_mutex);
> +    total_fdir_ids ? total_fdir_ids-- : 0;
> +    ovs_mutex_unlock(&hw_ofld_mutex);
> +}
> +
> +/*
> + * Release the hardware offloading functionality from the dpdk-port.
> + */
> +int
> +dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port) {
> +    ovs_mutex_lock(&hw_ofld_mutex);
> +    set_i40e_ofld_flag(dpdk_port, 0);
> +    ovs_mutex_unlock(&hw_ofld_mutex);
> +    return 0;
> +}
> +
> +int
> +dpdk_eth_dev_hw_ofld_init(struct netdev_dpdk *dev,
> +                                        int n_rxq, int n_txq,
> +                                        struct rte_eth_conf *port_conf)
> +{
> +    int err = 0;
> +    struct rte_eth_dev_info info;
> +    uint16_t vendor_id, device_id;
> +
> +    rte_eth_dev_info_get(get_dpdk_port_id(dev), &info);
> +    vendor_id = info.pci_dev->id.vendor_id;
> +    device_id = info.pci_dev->id.device_id;
> +    /* Configure vxlan offload only if its FVL NIC */
> +    if (vendor_id != PCI_VENDOR_ID_INTEL || device_id !=
> +                                            I40E_DEV_ID_SFP_XL710) {
> +        ovs_mutex_lock(&hw_ofld_mutex);
> +        set_i40e_ofld_flag(dev, 0);
> +        ovs_mutex_unlock(&hw_ofld_mutex);
> +        err = rte_eth_dev_configure(get_dpdk_port_id(dev),
> +                                    n_rxq, n_txq, port_conf);
> +        return err;
> +    }
> +    ovs_mutex_lock(&hw_ofld_mutex);
> +    set_i40e_ofld_flag(dev, 1);
> +    ovs_mutex_unlock(&hw_ofld_mutex);
> +    /* Configure FVL FDIR VxLAN tunnel handing */
> +    port_conf->fdir_conf.mode = RTE_FDIR_MODE_PERFECT;
> +    port_conf->fdir_conf.flex_conf.nb_payloads = 1;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].type =
> RTE_ETH_L4_PAYLOAD;
> +    /* Need to initilize all the 16 flex bytes,no matter;
> +     * what we really using, possibly a DPDK bug?? */
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[0] = 0;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[1] = 1;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[2] = 2;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[3] = 3;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[4] = 4;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[5] = 5;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[6] = 6;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[7] = 7;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[8] = 8;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[9] = 9;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[10] = 10;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[11] = 11;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[12] = 12;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[13] = 13;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[14] = 14;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[15] = 15;
> +    err = rte_eth_dev_configure(get_dpdk_port_id(dev),
> +                                n_rxq, n_txq, port_conf);
> +    if (err) {
> +        VLOG_ERR("Failed to configure DPDK port with hardware offload");
> +        return err;
> +    }
> +    /*Clean all FDIR entries if any */
> +    err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(dev),
> +            RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_FLUSH, NULL);
> +    return err;
> +}
> +
> +/*
> + * Install rules for VxLAN packets in hardware  */ int
> +set_up_hw_offload_port_rule(struct netdev *netdev__,
> +                                const struct flow *flow,
> +                                const uint32_t hw_flow_id,
> +                                const bool is_add_rule) {
> +    int err = 0;
> +    uint8_t flexbytes[RTE_ETH_FDIR_MAX_FLEXLEN] = { 0 };
> +    uint32_t *vni;
> +    enum rte_filter_op filter_op;
> +    struct rte_eth_fdir_filter entry = { 0 };
> +    struct netdev_dpdk *netdev;
> +
> +    netdev = netdev_dpdk_cast(netdev__);
> +    if (is_i40e_ofld_enable(netdev)) {
> +        entry.soft_id = hw_flow_id;
> +        if (!entry.soft_id) {
> +            VLOG_DBG("Invalid flow ID, Cant install rule in the NIC for "
> +                             "hardware offload");
> +            err = ECANCELED;
> +            return err;
> +        }
> +        /* Install rules in NIC only for VxLAN flows */
> +        if (ntohs(flow->tp_dst) != VXLAN_DST_PORT) {
> +            return 0;
> +        }
> +        entry.action.flex_off = 0;  /* use 0 by default */
> +        entry.input.flow_ext.vlan_tci = 0; //! ignored by i40e fdir
> +        entry.action.behavior = RTE_ETH_FDIR_PASSTHRU;
> +        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID_FLEX_4;
> +        entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
> +        entry.input.flow.ip4_flow.src_ip = flow->nw_src;
> +        entry.input.flow.ip4_flow.dst_ip = flow->nw_dst;
> +        entry.input.flow.udp4_flow.dst_port = htons(VXLAN_DST_PORT);
> +        entry.input.flow.udp4_flow.src_port = flow->tp_src;
> +        vni = (uint32_t *)&flexbytes[4];
> +        *vni = flow->tunnel.tun_id << 8;
> +        memcpy(entry.input.flow_ext.flexbytes, flexbytes,
> +                      RTE_ETH_FDIR_MAX_FLEXLEN);
> +        entry.action.rx_queue = 0;
> +        filter_op = is_add_rule ? RTE_ETH_FILTER_ADD :
> +                                              RTE_ETH_FILTER_DELETE;
> +        err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(netdev),
> +                 RTE_ETH_FILTER_FDIR, filter_op, &entry);
> +
> +        /*
> +         * XXX : Delayed the max limit check for flow director entries after
> +         * the configuration. Anyway the rte_eth_dev_filter_ctrl will fail if
> +         * max limit reaches. This can be used for tracking.
> +         */
> +        if (is_add_rule) {
> +            if (!i40e_fdir_entry_cnt_inc()) {
> +                VLOG_DBG("Cant configure rule on NIC, Flow director "
> +                        "entries hits max limit");
> +            }
> +        }
> +        else {
> +            i40e_fdir_entry_cnt_decr();
> +        }
> +        if (err < 0) {
> +            VLOG_DBG("flow director programming error in NIC: (%d)\n", err);
> +            return err;
> +        }
> +    }
> +    return err;
> +}
> +
> +static int
> +i40e_dpdk_port_get_hw_ofld_pkts(struct
> +                 dp_netdev_pmd_thread *pmd, struct dp_packet
> +                 **in_packets, struct dp_packet **hw_packets,
> +                 struct dp_packet **non_hw_packets,
> +                 uint32_t cnt)
> +{
> +    int i, hw_pkt_cnt = 0, norm_pkt_cnt = 0;
> +    const struct dp_netdev_flow *flow;
> +    struct rte_mbuf *mbuf;
> +
> +    for (i = 0; i < cnt; i++) {
> +        mbuf = (struct rte_mbuf *)in_packets[i];
> +        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
> +            flow = lookup_hw_offload_flow_for_fdirid(pmd, mbuf,
> +                                                     mbuf->hash.fdir.hi);
> +            if (!flow) {
> +                /* Bogus flow in hw, cannot find it in OVS EMC */
> +                mbuf->ol_flags &= ~PKT_RX_FDIR_ID;
> +                non_hw_packets[norm_pkt_cnt++] = in_packets[i];
> +                continue;
> +            }
> +            dp_packet_reset_packet(in_packets[i], VXLAN_HLEN);
> +            mbuf->ol_flags |= PKT_RX_RSS_HASH;
> +            mbuf->hash.rss = hash_finish(mbuf->hash.rss, 1);
> +            hw_packets[hw_pkt_cnt++] = in_packets[i];
> +        }
> +        else {
> +            non_hw_packets[norm_pkt_cnt++] = in_packets[i];
> +        }
> +    }
> +    return hw_pkt_cnt;
> +}
> +
> +/*
> + * Process the packets based on hardware offload configuration  */ void
> +hw_ofld_dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> +                             struct netdev_rxq *netdev_rxq,
> +                             struct dp_packet **packets, int cnt,
> +                             odp_port_t port_no) {
> +    int hw_pkt_cnt;
> +    struct dp_packet *hw_ofld_packets[NETDEV_MAX_BURST] = { 0 };
> +    struct dp_packet *orig_packets[NETDEV_MAX_BURST] = { 0 };
> +    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_rxq->netdev);
> +
> +    if (is_i40e_ofld_enable(netdev)) {
> +        hw_pkt_cnt = i40e_dpdk_port_get_hw_ofld_pkts(pmd, packets,
> +                                                          hw_ofld_packets,
> +                                                          orig_packets, cnt);
> +        /* Process packet streams separately. */
> +        if (hw_pkt_cnt) {
> +            dp_netdev_input(pmd, hw_ofld_packets, hw_pkt_cnt, port_no);
> +        }
> +        if (cnt - hw_pkt_cnt) {
> +            dp_netdev_input(pmd, orig_packets, (cnt - hw_pkt_cnt), port_no);
> +        }
> +    }
> +    else {
> +        dp_netdev_input(pmd, packets, cnt, port_no);
> +    }
> +}
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> diff --git a/lib/dpdk-i40e-ofld.h b/lib/dpdk-i40e-ofld.h new file mode 100644
> index 0000000..1aad246
> --- /dev/null
> +++ b/lib/dpdk-i40e-ofld.h
> @@ -0,0 +1,59 @@
> +/*
> + * Copyright (c) 2016 Intel Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef DPDK_I40E_OFLD_H_
> +#define DPDK_I40E_OFLD_H_
> +
> +#include <config.h>
> +
> +#include "dp-packet.h"
> +#include "netdev.h"
> +
> +/*
> + * Macro to enable/disable HW OFFLOAD feature for DPDK.
> + * 1 :- Enable HW_OFFLOAD support in OVS
> + * 0 :- Disable HW_OFFLOAD support in OVS  */
> +#define DPDK_I40E_TNL_OFFLOAD_ENABLE        1
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +
> +struct netdev_dpdk;
> +struct dp_netdev_pmd_thread;
> +struct dp_netdev_flow;
> +
> +struct netdev_dpdk *netdev_dpdk_cast(const struct netdev *netdev);
> +extern inline bool is_i40e_ofld_enable(const struct netdev_dpdk
> +*netdev); extern inline void set_i40e_ofld_flag(struct netdev_dpdk
> +*netdev, bool flag); extern inline int get_dpdk_port_id(struct
> +netdev_dpdk *dpdk_port); int dpdk_eth_dev_hw_ofld_init(struct
> netdev_dpdk *dev, int n_rxq, int n_txq,
> +                              struct rte_eth_conf *port_conf); int
> +dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port); int
> +set_up_hw_offload_port_rule(struct netdev *netdev__,
> +                                const struct flow *flow,
> +                                const uint32_t hw_flow_id,
> +                                const bool is_add_rule); void
> +hw_ofld_dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> +                             struct netdev_rxq *netdev_rxq,
> +                             struct dp_packet **packets, int cnt,
> +                             odp_port_t port_no); const struct
> +dp_netdev_flow *lookup_hw_offload_flow_for_fdirid(
> +                            const struct dp_netdev_pmd_thread *pmd,
> +                            struct rte_mbuf *mbuf, uint32_t flow_id);
> +void dp_netdev_input(struct dp_netdev_pmd_thread *, struct dp_packet
> **,
> +                     int cnt, odp_port_t port_no);
> +
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> +#endif /* DPDK_I40E_OFLD_H_ */
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index cf574ad..d79b239
> 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -70,6 +70,7 @@
>  #include "util.h"
> 
>  #include "openvswitch/vlog.h"
> +#include "dpdk-i40e-ofld.h"
> 
>  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
> 
> @@ -478,7 +479,7 @@ static void dp_netdev_execute_actions(struct
> dp_netdev_pmd_thread *pmd,
>                                        bool may_steal,
>                                        const struct nlattr *actions,
>                                        size_t actions_len); -static void dp_netdev_input(struct
> dp_netdev_pmd_thread *,
> +void dp_netdev_input(struct dp_netdev_pmd_thread *,
>                              struct dp_packet **, int cnt, odp_port_t port_no);  static void
> dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
>                                    struct dp_packet **, int cnt); @@ -1455,6 +1456,28 @@
> dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
>      flow->dead = true;
> 
>      dp_netdev_flow_unref(flow);
> +
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +    struct dp_netdev_port *dp_port;
> +    int err;
> +    odp_port_t in_port = flow->flow.in_port.odp_port;
> +    err = get_port_by_number(pmd->dp, in_port, &dp_port);
> +    if (err) {
> +        VLOG_WARN("Cannot get the port information, hardware offload may
> "
> +                "not be functional");
> +        return;
> +    }
> +    if(strcmp(dp_port->type, "dpdk")) {
> +        /* No hardware offload on a non-DPDK port") */
> +        return;
> +    }
> +    /* Remove the hardware offload rule if exists.*/
> +    if(set_up_hw_offload_port_rule(dp_port->netdev, &flow->flow,
> +            dp_netdev_flow_hash(&(flow->ufid)), 0)) {
> +        VLOG_DBG("Failed to delete the hardware offload rule");
> +        return;
> +    }
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
>  }
> 
>  static void
> @@ -2059,6 +2082,32 @@ dp_netdev_flow_add(struct
> dp_netdev_pmd_thread *pmd,
>          ds_destroy(&ds);
>      }
> 
> +    /*
> +     * Configure the hardware offload for tunnel while flows are getting
> +     * inserted in OVS.
> +     */
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +    struct dp_netdev_port *dp_port;
> +    int err;
> +    odp_port_t in_port = flow->flow.in_port.odp_port;
> +    err = get_port_by_number(pmd->dp, in_port, &dp_port);
> +    if (err) {
> +        VLOG_WARN("Cannot get the port information, Failed to configure "
> +                            "hardware offload");
> +        goto out;
> +    }
> +    if (strcmp(dp_port->type, "dpdk")) {
> +        /* No hardware offload on a non-DPDK port */
> +        goto out;
> +    }
> +    /* install the rule in hw, reduntant might overwrite if it exists*/
> +    if (set_up_hw_offload_port_rule(dp_port->netdev, &flow->flow,
> +            dp_netdev_flow_hash(&flow->ufid), 1)) {
> +        VLOG_ERR("Failed to install the hardware offload rule");
> +        goto out;
> +    }
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> +out:
>      return flow;
>  }
> 
> @@ -2575,7 +2624,19 @@ dp_netdev_process_rxq_port(struct
> dp_netdev_pmd_thread *pmd,
>          *recirc_depth_get() = 0;
> 
>          cycles_count_start(pmd);
> +
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +        /* Check if the source port is DPDK */
> +        if (packets[0]->source == DPBUF_DPDK) {
> +            hw_ofld_dp_netdev_input(pmd, rxq, packets, cnt, port->port_no);
> +        }
> +        else {
> +            dp_netdev_input(pmd, packets, cnt, port->port_no);
> +        }
> +#else
>          dp_netdev_input(pmd, packets, cnt, port->port_no);
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> +
>          cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
>      } else if (error != EAGAIN && error != EOPNOTSUPP) {
>          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); @@ -
> 3321,7 +3382,6 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread
> *pmd, struct dp_packet *packet_,
>          flow->tunnel.metadata.present.len =
> orig_tunnel.metadata.present.len;
>          flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
>      }
> -
>      return err;
>  }
> 
> @@ -3430,6 +3490,7 @@ emc_processing(struct dp_netdev_pmd_thread
> *pmd, struct dp_packet **packets,
>      struct emc_cache *flow_cache = &pmd->flow_cache;
>      struct netdev_flow_key *key = &keys[0];
>      size_t i, n_missed = 0, n_dropped = 0;
> +    struct rte_mbuf *mbuf;
> 
>      for (i = 0; i < cnt; i++) {
>          struct dp_netdev_flow *flow;
> @@ -3454,7 +3515,18 @@ emc_processing(struct dp_netdev_pmd_thread
> *pmd, struct dp_packet **packets,
>          key->len = 0; /* Not computed yet. */
>          key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
> 
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +        mbuf = (struct rte_mbuf *)packet;
> +        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
> +            flow = lookup_hw_offload_flow_for_fdirid(pmd, mbuf, 0);
> +        }
> +        else {
> +            flow = emc_lookup(flow_cache, key);
> +        }
> +#else
>          flow = emc_lookup(flow_cache, key);
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> +
>          if (OVS_LIKELY(flow)) {
>              dp_netdev_queue_batches(packet, flow, &key->mf, batches,
>                                      n_batches); @@ -3651,7 +3723,7 @@
> dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>      }
>  }
> 
> -static void
> +void
>  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
>                  struct dp_packet **packets, int cnt,
>                  odp_port_t port_no)
> @@ -4290,3 +4362,43 @@ dpcls_lookup(const struct dpcls *cls, const struct
> netdev_flow_key keys[],
>      }
>      return false;                     /* Some misses. */
>  }
> +
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +/*
> + * EMC lookup function on 'flow id' reported by NIC.
> + */
> +const struct dp_netdev_flow *
> +lookup_hw_offload_flow_for_fdirid(const struct
> +                 dp_netdev_pmd_thread *pmd, struct rte_mbuf *mbuf,
> +                 uint32_t flow_id)
> +{
> +    const struct emc_cache *flow_cache = &pmd->flow_cache;
> +    struct netdev_flow_key key;
> +    struct emc_entry *current_entry;
> +
> +    key.len = 0;
> +    if (OVS_LIKELY(mbuf->ol_flags & PKT_RX_RSS_HASH)) {
> +        key.hash = mbuf->hash.rss;
> +    }
> +    else {
> +        return NULL;
> +    }
> +    EMC_FOR_EACH_POS_WITH_HASH(flow_cache, current_entry,
> key.hash) {
> +        if (current_entry->key.hash == key.hash
> +            && emc_entry_alive(current_entry)) {
> +            if (OVS_UNLIKELY(flow_id && dp_netdev_flow_hash(
> +                                       &current_entry->flow->ufid) !=
> +                                       flow_id)) {
> +                /* Hash collision in emc, fallback to software path */
> +                return NULL;
> +            }
> +            return current_entry->flow;
> +        }
> +    }
> +    /* XXX :: An improved classifier lookup needed here without any miniflow
> +     * extract to keep it performant.Until then fallback to software based
> +     * packet forwarding on EMC miss.
> +     */
> +     return NULL;
> +}
> +#endif /* DPDK_I40E_TNL_OFFLOAD_ENABLE */
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index f402354..2954f83
> 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -56,6 +56,7 @@
>  #include "rte_mbuf.h"
>  #include "rte_meter.h"
>  #include "rte_virtio_net.h"
> +#include "dpdk-i40e-ofld.h"
> 
>  VLOG_DEFINE_THIS_MODULE(dpdk);
>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> @@ -112,7 +113,7 @@ static char *vhost_sock_dir = NULL;   /* Location of
> vhost-user sockets */
>   */
>  #define VHOST_ENQ_RETRY_USECS 100
> 
> -static const struct rte_eth_conf port_conf = {
> +static struct rte_eth_conf port_conf = {
>      .rxmode = {
>          .mq_mode = ETH_MQ_RX_RSS,
>          .split_hdr_size = 0,
> @@ -331,6 +332,9 @@ struct netdev_dpdk {
> 
>      /* Identifier used to distinguish vhost devices from each other */
>      char vhost_id[PATH_MAX];
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +    bool i40e_ofld_enable; /* hardware/NIC offload flag*/ #endif
> +//DPDK_I40E_TNL_OFFLOAD_ENABLE
> 
>      /* In dpdk_list. */
>      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex); @@ -346,6
> +350,24 @@ struct netdev_rxq_dpdk {
>      int port_id;
>  };
> 
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +inline bool is_i40e_ofld_enable(const struct netdev_dpdk *netdev) {
> +    return netdev->i40e_ofld_enable;
> +}
> +
> +inline void set_i40e_ofld_flag(struct netdev_dpdk *netdev,
> +                                                bool flag) {
> +    netdev->i40e_ofld_enable = flag;
> +}
> +
> +inline int get_dpdk_port_id(struct netdev_dpdk *dpdk_port) {
> +    return dpdk_port->port_id;
> +}
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> +
>  static bool dpdk_thread_is_pmd(void);
> 
>  static int netdev_dpdk_construct(struct netdev *); @@ -539,10 +561,21 @@
> dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int
> n_txq)
>              VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
>          }
> 
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +        diag = (!dev->i40e_ofld_enable && dev->type == DPDK_DEV_ETH) ?
> +                    dpdk_eth_dev_hw_ofld_init(dev, n_rxq, n_txq, &port_conf) :
> +                    rte_eth_dev_configure(dev->port_id,
> +                    n_rxq, n_txq, &port_conf);
> +        if (diag) {
> +            /* rte_dev_configure error */
> +            break;
> +        }
> +#else
>          diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);
>          if (diag) {
>              break;
>          }
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> 
>          for (i = 0; i < n_txq; i++) {
>              diag = rte_eth_tx_queue_setup(dev->port_id, i,
> NIC_PORT_TX_Q_SIZE, @@ -637,7 +670,7 @@ dpdk_eth_dev_init(struct
> netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
>      return 0;
>  }
> 
> -static struct netdev_dpdk *
> +struct netdev_dpdk *
>  netdev_dpdk_cast(const struct netdev *netdev)  {
>      return CONTAINER_OF(netdev, struct netdev_dpdk, up); @@ -861,6
> +894,10 @@ netdev_dpdk_destruct(struct netdev *netdev_)
>      rte_free(dev->tx_q);
>      list_remove(&dev->list_node);
>      dpdk_mp_put(dev->dpdk_mp);
> +
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +        dpdk_hw_ofld_port_release(dev); #endif /*
> +DPDK_I40E_TNL_OFFLOAD_ENABLE */
>      ovs_mutex_unlock(&dpdk_mutex);
>  }
> 
> --
> 1.9.1
Jesse Gross March 17, 2016, 11:49 p.m. UTC | #2
On Thu, Mar 17, 2016 at 3:43 PM, Chandran, Sugesh
<sugesh.chandran@intel.com> wrote:
> Hi,
>
> This patch proposes an approach that uses Flow director feature on the Intel Fortville NICs to boost the VxLAN tunneling performance. In our testing we verified that the VxLAN performance is almost doubled with this patch.
> The solution programs the NIC to report the flow ID along with the VxLAN packets, and it is matched by OVS in software. There may be corner cases that needs to addressed in the approach, For eg:  There is a possibility of race condition where NIC reports flow ID that may match on different flow in OVS. This happen when a rule is evicted by a new rule with same flowID+ hash in the OVS software. The packets may hit on wrong new rule in OVS until the flow get deleted in the hardware too.
>
> It is a hardware specific implementation (Only work with Intel Fortville NICs) for now, however the proposal works with any programmable NICs.This RFC proves that the OVS can offer very high speed tunneling performance using flow programmability in NICs. I am looking for comments/suggestions on adding this support(such as configuring, enable it for all the programmable NICs and etc) in OVS userspace datapath for improving the performance.

This is definitely very interesting to see. Can you post some more
specific performance numbers?

Is this really specific to VXLAN? I'm sure that it could be
generalized to other tunneling protocols (Geneve would be nice given
that OVN is using it and I know Fortville supports it). But shouldn't
it apply to non-tunneled traffic as well?

It looks like this is adding a hardware flow when a new flow is added
to the datapath. How does this affect flow setup performance?

Otherwise, I agree that it will be important to generalize this for
the final version. The obvious things that I noticed were supporting
different types of NICs, using the configured tunnel UDP port(s)
rather than a hardcoded one, and integrating the code so that #ifdefs
aren't necessary.
Ben Pfaff March 18, 2016, 2:53 a.m. UTC | #3
This seems really, really specific to the particular NIC.  Can you add a
generic tunnel offload interface to DPDK?  What would that look like?

On Thu, Mar 17, 2016 at 10:43:42PM +0000, Chandran, Sugesh wrote:
> Hi,
> 
> This patch proposes an approach that uses Flow director feature on the Intel Fortville NICs to boost the VxLAN tunneling performance. In our testing we verified that the VxLAN performance is almost doubled with this patch. 
> The solution programs the NIC to report the flow ID along with the VxLAN packets, and it is matched by OVS in software. There may be corner cases that needs to addressed in the approach, For eg:  There is a possibility of race condition where NIC reports flow ID that may match on different flow in OVS. This happen when a rule is evicted by a new rule with same flowID+ hash in the OVS software. The packets may hit on wrong new rule in OVS until the flow get deleted in the hardware too.
> 
> It is a hardware specific implementation (Only work with Intel Fortville NICs) for now, however the proposal works with any programmable NICs.This RFC proves that the OVS can offer very high speed tunneling performance using flow programmability in NICs. I am looking for comments/suggestions on adding this support(such as configuring, enable it for all the programmable NICs and etc) in OVS userspace datapath for improving the performance.
> 
> Regards
> _Sugesh
Chandran, Sugesh March 18, 2016, 3:50 p.m. UTC | #4
Hi Jesse,
Please find my answers inline.

Regards
_Sugesh


> -----Original Message-----

> From: Jesse Gross [mailto:jesse@kernel.org]

> Sent: Thursday, March 17, 2016 11:50 PM

> To: Chandran, Sugesh <sugesh.chandran@intel.com>

> Cc: dev@openvswitch.org

> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance

> using DPDK flow director feature.

> 

> On Thu, Mar 17, 2016 at 3:43 PM, Chandran, Sugesh

> <sugesh.chandran@intel.com> wrote:

> > Hi,

> >

> > This patch proposes an approach that uses Flow director feature on the

> Intel Fortville NICs to boost the VxLAN tunneling performance. In our testing

> we verified that the VxLAN performance is almost doubled with this patch.

> > The solution programs the NIC to report the flow ID along with the VxLAN

> packets, and it is matched by OVS in software. There may be corner cases

> that needs to addressed in the approach, For eg:  There is a possibility of race

> condition where NIC reports flow ID that may match on different flow in

> OVS. This happen when a rule is evicted by a new rule with same flowID+

> hash in the OVS software. The packets may hit on wrong new rule in OVS

> until the flow get deleted in the hardware too.

> >

> > It is a hardware specific implementation (Only work with Intel Fortville

> NICs) for now, however the proposal works with any programmable

> NICs.This RFC proves that the OVS can offer very high speed tunneling

> performance using flow programmability in NICs. I am looking for

> comments/suggestions on adding this support(such as configuring, enable it

> for all the programmable NICs and etc) in OVS userspace datapath for

> improving the performance.

> 

> This is definitely very interesting to see. Can you post some more specific

> performance numbers?

[Sugesh] 
VxLAN DECAP performance (unidirectional, single flow, single CPU core)
-------------------------------------------------------------------
PKT-IN  - 9.3 Mpps
Pkt size - 114-byte VxLAN packets (64-byte payload)
PKT-OUT - 5.6 Mpps (without the optimization)
PKT-OUT - 9.3 Mpps (with the optimization; it hits the input line rate)

VxLAN ENCAP-DECAP performance (bidirectional, single CPU core)
---------------------------------------------------------------------------------
PKT-IN - 9.3 Mpps, pkt size - 114-byte VxLAN packets (64-byte payload) -->
PKT-IN - 14 Mpps,  pkt size - 64-byte UDP packets <--

PKT-OUT - 3.6 Mpps (without the optimization)
PKT-OUT - 5.3 Mpps (with the patch)
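
A quick sanity check on the DECAP number, assuming a 10GbE port and the usual
20 bytes of preamble + inter-frame gap per frame: a 114-byte frame occupies
(114 + 20) * 8 = 1072 bits on the wire, and 10^10 bits/s / 1072 bits ~= 9.33 Mpps,
so 9.3 Mpps PKT-OUT is indeed the input line rate.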

> 

> Is this really specific to VXLAN? I'm sure that it could be generalized to other

> tunneling protocols (Geneve would be nice given that OVN is using it and I

> know Fortville supports it). But shouldn't it apply to non-tunneled traffic as

> well?

Yes, this can be applied to any tunneling protocol, provided the NIC
hardware is programmed to handle those packets.
We haven't tested it with non-tunneled packets. The performance improvement for
non-tunneled packets is uncertain, because there is a limit on the number of
hardware flows (8K on FVL) and the software still has to spend cycles matching
the flow IDs reported by the hardware. Tunneling performance improves in all
cases, because a tunneled packet needs two lookups instead of one.
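
To make the "two lookups vs. one" point concrete, here is a rough,
self-contained toy sketch (not OVS code -- every name in it is made up): when
the NIC reports a valid flow ID with the packet, the datapath can resolve the
flow with a single ID match and only falls back to the normal software path
(post-decap miniflow extract plus a second cache lookup) on a miss. The real
patch instead keys the EMC position on the RSS hash and compares
dp_netdev_flow_hash(&flow->ufid) against mbuf->hash.fdir.hi.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct toy_flow { uint32_t id; const char *actions; };

    /* Pretend flow cache with a couple of installed flows. */
    static struct toy_flow cache[] = {
        { 0x1234, "decap,output:vm1" },
        { 0x5678, "decap,output:vm2" },
    };

    /* Stand-in for the normal software path: re-parse the inner packet and
     * do a full lookup after decapsulation (two passes in total). */
    static const struct toy_flow *slow_path_lookup(void) { return &cache[0]; }

    static const struct toy_flow *
    lookup(bool nic_id_valid, uint32_t nic_flow_id)
    {
        if (nic_id_valid) {
            for (size_t i = 0; i < sizeof cache / sizeof cache[0]; i++) {
                if (cache[i].id == nic_flow_id) {
                    return &cache[i];   /* one match instead of two lookups */
                }
            }
        }
        return slow_path_lookup();      /* bogus or missing ID: fall back */
    }

    int main(void)
    {
        printf("%s\n", lookup(true, 0x5678)->actions);
        return 0;
    }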

> 

> It looks like this is adding a hardware flow when a new flow is added to the

> datapath. How does this affect flow setup performance?

> 

We haven't performed any stress tests with a large number of flows to verify
flow setup performance. What is the expectation here? How many rules can
currently be set up per second in OVS?
> Otherwise, I agree that it will be important to generalize this for the final

> version. The obvious things that I noticed were supporting different types of

> NICs, using the configured tunnel UDP port(s) rather than a hardcoded one,

> and integrating the code so that #ifdefs aren't necessary.

We are looking into possible options for a generic API set to support this, and
more updates will follow.
Chandran, Sugesh March 18, 2016, 3:51 p.m. UTC | #5
Hi Ben
Thank you for looking into this.
We are working on a generic tunnel offloading API, and more updates will follow.

Regards
_Sugesh


> -----Original Message-----
> From: Ben Pfaff [mailto:blp@ovn.org]
> Sent: Friday, March 18, 2016 2:53 AM
> To: Chandran, Sugesh <sugesh.chandran@intel.com>
> Cc: dev@openvswitch.org
> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance
> using DPDK flow director feature.
> 
> This seems really, really specific to the particular NIC.  Can you add a generic
> tunnel offload interface to DPDK?  What would that look like?
> 
> On Thu, Mar 17, 2016 at 10:43:42PM +0000, Chandran, Sugesh wrote:
> > Hi,
> >
> > This patch proposes an approach that uses Flow director feature on the
> Intel Fortville NICs to boost the VxLAN tunneling performance. In our testing
> we verified that the VxLAN performance is almost doubled with this patch.
> > The solution programs the NIC to report the flow ID along with the VxLAN
> packets, and it is matched by OVS in software. There may be corner cases
> that needs to addressed in the approach, For eg:  There is a possibility of race
> condition where NIC reports flow ID that may match on different flow in
> OVS. This happen when a rule is evicted by a new rule with same flowID+
> hash in the OVS software. The packets may hit on wrong new rule in OVS
> until the flow get deleted in the hardware too.
> >
> > It is a hardware specific implementation (Only work with Intel Fortville
> NICs) for now, however the proposal works with any programmable
> NICs.This RFC proves that the OVS can offer very high speed tunneling
> performance using flow programmability in NICs. I am looking for
> comments/suggestions on adding this support(such as configuring, enable it
> for all the programmable NICs and etc) in OVS userspace datapath for
> improving the performance.
> >
> > Regards
> > _Sugesh
> >
> >
> > > -----Original Message-----
> > > From: Chandran, Sugesh
> > > Sent: Thursday, March 17, 2016 10:00 PM
> > > To: dev@openvswitch.org
> > > Cc: Chandran, Sugesh <sugesh.chandran@intel.com>
> > > Subject: [RFC PATCH] tunneling: Improving vxlan performance using
> > > DPDK flow director feature.
> > >
> > > Optimizing vxlan tunneling performance in userspace datapath using
> > > flow director feature in Fortville NIC DPDK ports. OVS uses metadata
> > > reported by NIC to improve the flow lookup performance on VxLAN
> packets.
> > >
> > > Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
> > > ---
> > >  lib/automake.mk      |   2 +
> > >  lib/dpdk-i40e-ofld.c | 266
> > > +++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  lib/dpdk-i40e-ofld.h |  59 ++++++++++++
> > >  lib/dpif-netdev.c    | 118 ++++++++++++++++++++++-
> > >  lib/netdev-dpdk.c    |  41 +++++++-
> > >  5 files changed, 481 insertions(+), 5 deletions(-)  create mode
> > > 100644 lib/dpdk-i40e-ofld.c  create mode 100644 lib/dpdk-i40e-ofld.h
> > >
> > > diff --git a/lib/automake.mk b/lib/automake.mk index
> > > 27a1669..da48479
> > > 100644
> > > --- a/lib/automake.mk
> > > +++ b/lib/automake.mk
> > > @@ -366,6 +366,8 @@ endif
> > >
> > >  if DPDK_NETDEV
> > >  lib_libopenvswitch_la_SOURCES += \
> > > +       lib/dpdk-i40e-ofld.c \
> > > +       lib/dpdk-i40e-ofld.h \
> > >         lib/netdev-dpdk.c \
> > >         lib/netdev-dpdk.h
> > >  endif
> > > diff --git a/lib/dpdk-i40e-ofld.c b/lib/dpdk-i40e-ofld.c new file
> > > mode 100644 index 0000000..3ea7084
> > > --- /dev/null
> > > +++ b/lib/dpdk-i40e-ofld.c
> > > @@ -0,0 +1,266 @@
> > > +/*
> > > + * Copyright (c) 2016 Intel Corp.
> > > + *
> > > + * Licensed under the Apache License, Version 2.0 (the "License");
> > > + * you may not use this file except in compliance with the License.
> > > + * You may obtain a copy of the License at:
> > > + *
> > > + *     http://www.apache.org/licenses/LICENSE-2.0
> > > + *
> > > + * Unless required by applicable law or agreed to in writing,
> > > +software
> > > + * distributed under the License is distributed on an "AS IS"
> > > +BASIS,
> > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
> or
> > > implied.
> > > + * See the License for the specific language governing permissions
> > > + and
> > > + * limitations under the License.
> > > + */
> > > +
> > > +#include <config.h>
> > > +
> > > +#include "dpdk-i40e-ofld.h"
> > > +#include "errno.h"
> > > +#include "ovs-thread.h"
> > > +#include "openvswitch/vlog.h"
> > > +#include "netdev-provider.h"
> > > +#include "rte_pci_dev_ids.h"
> > > +#include "rte_ethdev.h"
> > > +
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +VLOG_DEFINE_THIS_MODULE(dpdk_hw_ofld);
> > > +
> > > +#define VXLAN_DST_PORT          4789
> > > +#define VXLAN_HLEN                  50
> > > +#define MAX_FDIR_RULES          8000
> > > +
> > > +static uint32_t total_fdir_ids;
> > > +static struct ovs_mutex hw_ofld_mutex = OVS_MUTEX_INITIALIZER;
> > > +
> > > +/*
> > > + * Returns '0' if FDIR IDs reaches max limit. Only 8000 entries are
> > > + * supported in FVL.
> > > + */
> > > +static inline uint32_t
> > > +i40e_fdir_entry_cnt_inc(void)
> > > +{
> > > +    if (total_fdir_ids < MAX_FDIR_RULES) {
> > > +        ovs_mutex_lock(&hw_ofld_mutex);
> > > +        total_fdir_ids++;
> > > +        ovs_mutex_unlock(&hw_ofld_mutex);
> > > +        return (total_fdir_ids);
> > > +    }
> > > +    return 0;
> > > +}
> > > +
> > > +static inline void
> > > +i40e_fdir_entry_cnt_decr(void)
> > > +{
> > > +    ovs_mutex_lock(&hw_ofld_mutex);
> > > +    total_fdir_ids ? total_fdir_ids-- : 0;
> > > +    ovs_mutex_unlock(&hw_ofld_mutex); }
> > > +
> > > +/*
> > > + * Release the hardware offloading functionality from the dpdk-port.
> > > + */
> > > +int
> > > +dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port) {
> > > +    ovs_mutex_lock(&hw_ofld_mutex);
> > > +    set_i40e_ofld_flag(dpdk_port, 0);
> > > +    ovs_mutex_unlock(&hw_ofld_mutex);
> > > +    return 0;
> > > +}
> > > +
> > > +int
> > > +dpdk_eth_dev_hw_ofld_init(struct netdev_dpdk *dev,
> > > +                                        int n_rxq, int n_txq,
> > > +                                        struct rte_eth_conf
> > > +*port_conf) {
> > > +    int err = 0;
> > > +    struct rte_eth_dev_info info;
> > > +    uint16_t vendor_id, device_id;
> > > +
> > > +    rte_eth_dev_info_get(get_dpdk_port_id(dev), &info);
> > > +    vendor_id = info.pci_dev->id.vendor_id;
> > > +    device_id = info.pci_dev->id.device_id;
> > > +    /* Configure vxlan offload only if its FVL NIC */
> > > +    if (vendor_id != PCI_VENDOR_ID_INTEL || device_id !=
> > > +                                            I40E_DEV_ID_SFP_XL710) {
> > > +        ovs_mutex_lock(&hw_ofld_mutex);
> > > +        set_i40e_ofld_flag(dev, 0);
> > > +        ovs_mutex_unlock(&hw_ofld_mutex);
> > > +        err = rte_eth_dev_configure(get_dpdk_port_id(dev),
> > > +                                    n_rxq, n_txq, port_conf);
> > > +        return err;
> > > +    }
> > > +    ovs_mutex_lock(&hw_ofld_mutex);
> > > +    set_i40e_ofld_flag(dev, 1);
> > > +    ovs_mutex_unlock(&hw_ofld_mutex);
> > > +    /* Configure FVL FDIR VxLAN tunnel handing */
> > > +    port_conf->fdir_conf.mode = RTE_FDIR_MODE_PERFECT;
> > > +    port_conf->fdir_conf.flex_conf.nb_payloads = 1;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].type =
> > > RTE_ETH_L4_PAYLOAD;
> > > +    /* Need to initilize all the 16 flex bytes,no matter;
> > > +     * what we really using, possibly a DPDK bug?? */
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[0] = 0;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[1] = 1;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[2] = 2;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[3] = 3;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[4] = 4;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[5] = 5;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[6] = 6;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[7] = 7;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[8] = 8;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[9] = 9;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[10] = 10;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[11] = 11;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[12] = 12;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[13] = 13;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[14] = 14;
> > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[15] = 15;
> > > +    err = rte_eth_dev_configure(get_dpdk_port_id(dev),
> > > +                                n_rxq, n_txq, port_conf);
> > > +    if (err) {
> > > +        VLOG_ERR("Failed to configure DPDK port with hardware offload");
> > > +        return err;
> > > +    }
> > > +    /*Clean all FDIR entries if any */
> > > +    err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(dev),
> > > +            RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_FLUSH, NULL);
> > > +    return err;
> > > +}
> > > +
> > > +/*
> > > + * Install rules for VxLAN packets in hardware  */ int
> > > +set_up_hw_offload_port_rule(struct netdev *netdev__,
> > > +                                const struct flow *flow,
> > > +                                const uint32_t hw_flow_id,
> > > +                                const bool is_add_rule) {
> > > +    int err = 0;
> > > +    uint8_t flexbytes[RTE_ETH_FDIR_MAX_FLEXLEN] = { 0 };
> > > +    uint32_t *vni;
> > > +    enum rte_filter_op filter_op;
> > > +    struct rte_eth_fdir_filter entry = { 0 };
> > > +    struct netdev_dpdk *netdev;
> > > +
> > > +    netdev = netdev_dpdk_cast(netdev__);
> > > +    if (is_i40e_ofld_enable(netdev)) {
> > > +        entry.soft_id = hw_flow_id;
> > > +        if (!entry.soft_id) {
> > > +            VLOG_DBG("Invalid flow ID, Cant install rule in the NIC for "
> > > +                             "hardware offload");
> > > +            err = ECANCELED;
> > > +            return err;
> > > +        }
> > > +        /* Install rules in NIC only for VxLAN flows */
> > > +        if (ntohs(flow->tp_dst) != VXLAN_DST_PORT) {
> > > +            return 0;
> > > +        }
> > > +        entry.action.flex_off = 0;  /* use 0 by default */
> > > +        entry.input.flow_ext.vlan_tci = 0; //! ignored by i40e fdir
> > > +        entry.action.behavior = RTE_ETH_FDIR_PASSTHRU;
> > > +        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID_FLEX_4;
> > > +        entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
> > > +        entry.input.flow.ip4_flow.src_ip = flow->nw_src;
> > > +        entry.input.flow.ip4_flow.dst_ip = flow->nw_dst;
> > > +        entry.input.flow.udp4_flow.dst_port = htons(VXLAN_DST_PORT);
> > > +        entry.input.flow.udp4_flow.src_port = flow->tp_src;
> > > +        vni = (uint32_t *)&flexbytes[4];
> > > +        *vni = flow->tunnel.tun_id << 8;
> > > +        memcpy(entry.input.flow_ext.flexbytes, flexbytes,
> > > +                      RTE_ETH_FDIR_MAX_FLEXLEN);
> > > +        entry.action.rx_queue = 0;
> > > +        filter_op = is_add_rule ? RTE_ETH_FILTER_ADD :
> > > +                                              RTE_ETH_FILTER_DELETE;
> > > +        err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(netdev),
> > > +                 RTE_ETH_FILTER_FDIR, filter_op, &entry);
> > > +
> > > +        /*
> > > +         * XXX : Delayed the max limit check for flow director entries after
> > > +         * the configuration. Anyway the rte_eth_dev_filter_ctrl will fail if
> > > +         * max limit reaches. This can be used for tracking.
> > > +         */
> > > +        if (is_add_rule) {
> > > +            if (!i40e_fdir_entry_cnt_inc()) {
> > > +                VLOG_DBG("Cant configure rule on NIC, Flow director "
> > > +                        "entries hits max limit");
> > > +            }
> > > +        }
> > > +        else {
> > > +            i40e_fdir_entry_cnt_decr();
> > > +        }
> > > +        if (err < 0) {
> > > +            VLOG_DBG("flow director programming error in NIC: (%d)\n",
> err);
> > > +            return err;
> > > +        }
> > > +    }
> > > +    return err;
> > > +}
> > > +
> > > +static int
> > > +i40e_dpdk_port_get_hw_ofld_pkts(struct
> > > +                 dp_netdev_pmd_thread *pmd, struct dp_packet
> > > +                 **in_packets, struct dp_packet **hw_packets,
> > > +                 struct dp_packet **non_hw_packets,
> > > +                 uint32_t cnt)
> > > +{
> > > +    int i, hw_pkt_cnt = 0, norm_pkt_cnt = 0;
> > > +    const struct dp_netdev_flow *flow;
> > > +    struct rte_mbuf *mbuf;
> > > +
> > > +    for (i = 0; i < cnt; i++) {
> > > +        mbuf = (struct rte_mbuf *)in_packets[i];
> > > +        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
> > > +            flow = lookup_hw_offload_flow_for_fdirid(pmd, mbuf,
> > > +                                                     mbuf->hash.fdir.hi);
> > > +            if (!flow) {
> > > +                /* Bogus flow in hw, cannot find it in OVS EMC */
> > > +                mbuf->ol_flags &= ~PKT_RX_FDIR_ID;
> > > +                non_hw_packets[norm_pkt_cnt++] = in_packets[i];
> > > +                continue;
> > > +            }
> > > +            dp_packet_reset_packet(in_packets[i], VXLAN_HLEN);
> > > +            mbuf->ol_flags |= PKT_RX_RSS_HASH;
> > > +            mbuf->hash.rss = hash_finish(mbuf->hash.rss, 1);
> > > +            hw_packets[hw_pkt_cnt++] = in_packets[i];
> > > +        }
> > > +        else {
> > > +            non_hw_packets[norm_pkt_cnt++] = in_packets[i];
> > > +        }
> > > +    }
> > > +    return hw_pkt_cnt;
> > > +}
> > > +
> > > +/*
> > > + * Process the packets based on hardware offload configuration  */
> > > +void hw_ofld_dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> > > +                             struct netdev_rxq *netdev_rxq,
> > > +                             struct dp_packet **packets, int cnt,
> > > +                             odp_port_t port_no) {
> > > +    int hw_pkt_cnt;
> > > +    struct dp_packet *hw_ofld_packets[NETDEV_MAX_BURST] = { 0 };
> > > +    struct dp_packet *orig_packets[NETDEV_MAX_BURST] = { 0 };
> > > +    struct netdev_dpdk *netdev =
> > > +netdev_dpdk_cast(netdev_rxq->netdev);
> > > +
> > > +    if (is_i40e_ofld_enable(netdev)) {
> > > +        hw_pkt_cnt = i40e_dpdk_port_get_hw_ofld_pkts(pmd, packets,
> > > +                                                          hw_ofld_packets,
> > > +                                                          orig_packets, cnt);
> > > +        /* Process packet streams separately. */
> > > +        if (hw_pkt_cnt) {
> > > +            dp_netdev_input(pmd, hw_ofld_packets, hw_pkt_cnt, port_no);
> > > +        }
> > > +        if (cnt - hw_pkt_cnt) {
> > > +            dp_netdev_input(pmd, orig_packets, (cnt - hw_pkt_cnt),
> port_no);
> > > +        }
> > > +    }
> > > +    else {
> > > +        dp_netdev_input(pmd, packets, cnt, port_no);
> > > +    }
> > > +}
> > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > diff --git a/lib/dpdk-i40e-ofld.h b/lib/dpdk-i40e-ofld.h new file
> > > mode 100644 index 0000000..1aad246
> > > --- /dev/null
> > > +++ b/lib/dpdk-i40e-ofld.h
> > > @@ -0,0 +1,59 @@
> > > +/*
> > > + * Copyright (c) 2016 Intel Corp.
> > > + *
> > > + * Licensed under the Apache License, Version 2.0 (the "License");
> > > + * you may not use this file except in compliance with the License.
> > > + * You may obtain a copy of the License at:
> > > + *
> > > + *     http://www.apache.org/licenses/LICENSE-2.0
> > > + *
> > > + * Unless required by applicable law or agreed to in writing,
> > > +software
> > > + * distributed under the License is distributed on an "AS IS"
> > > +BASIS,
> > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
> or
> > > implied.
> > > + * See the License for the specific language governing permissions
> > > + and
> > > + * limitations under the License.
> > > + */
> > > +
> > > +#ifndef DPDK_I40E_OFLD_H_
> > > +#define DPDK_I40E_OFLD_H_
> > > +
> > > +#include <config.h>
> > > +
> > > +#include "dp-packet.h"
> > > +#include "netdev.h"
> > > +
> > > +/*
> > > + * Macro to enable/disable HW OFFLOAD feature for DPDK.
> > > + * 1 :- Enable HW_OFFLOAD support in OVS
> > > + * 0 :- Disable HW_OFFLOAD support in OVS  */
> > > +#define DPDK_I40E_TNL_OFFLOAD_ENABLE        1
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +
> > > +struct netdev_dpdk;
> > > +struct dp_netdev_pmd_thread;
> > > +struct dp_netdev_flow;
> > > +
> > > +struct netdev_dpdk *netdev_dpdk_cast(const struct netdev *netdev);
> > > +extern inline bool is_i40e_ofld_enable(const struct netdev_dpdk
> > > +*netdev); extern inline void set_i40e_ofld_flag(struct netdev_dpdk
> > > +*netdev, bool flag); extern inline int get_dpdk_port_id(struct
> > > +netdev_dpdk *dpdk_port); int dpdk_eth_dev_hw_ofld_init(struct
> > > netdev_dpdk *dev, int n_rxq, int n_txq,
> > > +                              struct rte_eth_conf *port_conf); int
> > > +dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port); int
> > > +set_up_hw_offload_port_rule(struct netdev *netdev__,
> > > +                                const struct flow *flow,
> > > +                                const uint32_t hw_flow_id,
> > > +                                const bool is_add_rule); void
> > > +hw_ofld_dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> > > +                             struct netdev_rxq *netdev_rxq,
> > > +                             struct dp_packet **packets, int cnt,
> > > +                             odp_port_t port_no); const struct
> > > +dp_netdev_flow *lookup_hw_offload_flow_for_fdirid(
> > > +                            const struct dp_netdev_pmd_thread *pmd,
> > > +                            struct rte_mbuf *mbuf, uint32_t
> > > +flow_id); void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > > +struct dp_packet
> > > **,
> > > +                     int cnt, odp_port_t port_no);
> > > +
> > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE #endif /*
> DPDK_I40E_OFLD_H_
> > > +*/
> > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
> > > cf574ad..d79b239
> > > 100644
> > > --- a/lib/dpif-netdev.c
> > > +++ b/lib/dpif-netdev.c
> > > @@ -70,6 +70,7 @@
> > >  #include "util.h"
> > >
> > >  #include "openvswitch/vlog.h"
> > > +#include "dpdk-i40e-ofld.h"
> > >
> > >  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
> > >
> > > @@ -478,7 +479,7 @@ static void dp_netdev_execute_actions(struct
> > > dp_netdev_pmd_thread *pmd,
> > >                                        bool may_steal,
> > >                                        const struct nlattr *actions,
> > >                                        size_t actions_len); -static
> > > void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > > +void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > >                              struct dp_packet **, int cnt,
> > > odp_port_t port_no);  static void dp_netdev_recirculate(struct
> dp_netdev_pmd_thread *,
> > >                                    struct dp_packet **, int cnt); @@
> > > -1455,6 +1456,28 @@ dp_netdev_pmd_remove_flow(struct
> dp_netdev_pmd_thread *pmd,
> > >      flow->dead = true;
> > >
> > >      dp_netdev_flow_unref(flow);
> > > +
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +    struct dp_netdev_port *dp_port;
> > > +    int err;
> > > +    odp_port_t in_port = flow->flow.in_port.odp_port;
> > > +    err = get_port_by_number(pmd->dp, in_port, &dp_port);
> > > +    if (err) {
> > > +        VLOG_WARN("Cannot get the port information, hardware
> > > +offload may
> > > "
> > > +                "not be functional");
> > > +        return;
> > > +    }
> > > +    if(strcmp(dp_port->type, "dpdk")) {
> > > +        /* No hardware offload on a non-DPDK port") */
> > > +        return;
> > > +    }
> > > +    /* Remove the hardware offload rule if exists.*/
> > > +    if(set_up_hw_offload_port_rule(dp_port->netdev, &flow->flow,
> > > +            dp_netdev_flow_hash(&(flow->ufid)), 0)) {
> > > +        VLOG_DBG("Failed to delete the hardware offload rule");
> > > +        return;
> > > +    }
> > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > >  }
> > >
> > >  static void
> > > @@ -2059,6 +2082,32 @@ dp_netdev_flow_add(struct
> > > dp_netdev_pmd_thread *pmd,
> > >          ds_destroy(&ds);
> > >      }
> > >
> > > +    /*
> > > +     * Configure the hardware offload for tunnel while flows are getting
> > > +     * inserted in OVS.
> > > +     */
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +    struct dp_netdev_port *dp_port;
> > > +    int err;
> > > +    odp_port_t in_port = flow->flow.in_port.odp_port;
> > > +    err = get_port_by_number(pmd->dp, in_port, &dp_port);
> > > +    if (err) {
> > > +        VLOG_WARN("Cannot get the port information, Failed to configure
> "
> > > +                            "hardware offload");
> > > +        goto out;
> > > +    }
> > > +    if (strcmp(dp_port->type, "dpdk")) {
> > > +        /* No hardware offload on a non-DPDK port */
> > > +        goto out;
> > > +    }
> > > +    /* install the rule in hw, reduntant might overwrite if it exists*/
> > > +    if (set_up_hw_offload_port_rule(dp_port->netdev, &flow->flow,
> > > +            dp_netdev_flow_hash(&flow->ufid), 1)) {
> > > +        VLOG_ERR("Failed to install the hardware offload rule");
> > > +        goto out;
> > > +    }
> > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +out:
> > >      return flow;
> > >  }
> > >
> > > @@ -2575,7 +2624,19 @@ dp_netdev_process_rxq_port(struct
> > > dp_netdev_pmd_thread *pmd,
> > >          *recirc_depth_get() = 0;
> > >
> > >          cycles_count_start(pmd);
> > > +
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +        /* Check if the source port is DPDK */
> > > +        if (packets[0]->source == DPBUF_DPDK) {
> > > +            hw_ofld_dp_netdev_input(pmd, rxq, packets, cnt, port-
> >port_no);
> > > +        }
> > > +        else {
> > > +            dp_netdev_input(pmd, packets, cnt, port->port_no);
> > > +        }
> > > +#else
> > >          dp_netdev_input(pmd, packets, cnt, port->port_no);
> > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +
> > >          cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
> > >      } else if (error != EAGAIN && error != EOPNOTSUPP) {
> > >          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1,
> > > 5); @@ -
> > > 3321,7 +3382,6 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread
> *pmd,
> > > struct dp_packet *packet_,
> > >          flow->tunnel.metadata.present.len =
> > > orig_tunnel.metadata.present.len;
> > >          flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
> > >      }
> > > -
> > >      return err;
> > >  }
> > >
> > > @@ -3430,6 +3490,7 @@ emc_processing(struct
> dp_netdev_pmd_thread
> > > *pmd, struct dp_packet **packets,
> > >      struct emc_cache *flow_cache = &pmd->flow_cache;
> > >      struct netdev_flow_key *key = &keys[0];
> > >      size_t i, n_missed = 0, n_dropped = 0;
> > > +    struct rte_mbuf *mbuf;
> > >
> > >      for (i = 0; i < cnt; i++) {
> > >          struct dp_netdev_flow *flow; @@ -3454,7 +3515,18 @@
> > > emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet
> > > **packets,
> > >          key->len = 0; /* Not computed yet. */
> > >          key->hash = dpif_netdev_packet_get_rss_hash(packet,
> > > &key->mf);
> > >
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +        mbuf = (struct rte_mbuf *)packet;
> > > +        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
> > > +            flow = lookup_hw_offload_flow_for_fdirid(pmd, mbuf, 0);
> > > +        }
> > > +        else {
> > > +            flow = emc_lookup(flow_cache, key);
> > > +        }
> > > +#else
> > >          flow = emc_lookup(flow_cache, key);
> > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +
> > >          if (OVS_LIKELY(flow)) {
> > >              dp_netdev_queue_batches(packet, flow, &key->mf, batches,
> > >                                      n_batches); @@ -3651,7 +3723,7
> > > @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
> > >      }
> > >  }
> > >
> > > -static void
> > > +void
> > >  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> > >                  struct dp_packet **packets, int cnt,
> > >                  odp_port_t port_no) @@ -4290,3 +4362,43 @@
> > > dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key
> > > keys[],
> > >      }
> > >      return false;                     /* Some misses. */
> > >  }
> > > +
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +/*
> > > + * EMC lookup function on 'flow id' reported by NIC.
> > > + */
> > > +const struct dp_netdev_flow *
> > > +lookup_hw_offload_flow_for_fdirid(const struct
> > > +                 dp_netdev_pmd_thread *pmd, struct rte_mbuf *mbuf,
> > > +                 uint32_t flow_id)
> > > +{
> > > +    const struct emc_cache *flow_cache = &pmd->flow_cache;
> > > +    struct netdev_flow_key key;
> > > +    struct emc_entry *current_entry;
> > > +
> > > +    key.len = 0;
> > > +    if (OVS_LIKELY(mbuf->ol_flags & PKT_RX_RSS_HASH)) {
> > > +        key.hash = mbuf->hash.rss;
> > > +    }
> > > +    else {
> > > +        return NULL;
> > > +    }
> > > +    EMC_FOR_EACH_POS_WITH_HASH(flow_cache, current_entry,
> > > key.hash) {
> > > +        if (current_entry->key.hash == key.hash
> > > +            && emc_entry_alive(current_entry)) {
> > > +            if (OVS_UNLIKELY(flow_id && dp_netdev_flow_hash(
> > > +                                       &current_entry->flow->ufid) !=
> > > +                                       flow_id)) {
> > > +                /* Hash collision in emc, fallback to software path */
> > > +                return NULL;
> > > +            }
> > > +            return current_entry->flow;
> > > +        }
> > > +    }
> > > +    /* XXX :: An improved classifier lookup needed here without any
> miniflow
> > > +     * extract to keep it performant.Until then fallback to software based
> > > +     * packet forwarding on EMC miss.
> > > +     */
> > > +     return NULL;
> > > +}
> > > +#endif /* DPDK_I40E_TNL_OFFLOAD_ENABLE */
> > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> > > f402354..2954f83
> > > 100644
> > > --- a/lib/netdev-dpdk.c
> > > +++ b/lib/netdev-dpdk.c
> > > @@ -56,6 +56,7 @@
> > >  #include "rte_mbuf.h"
> > >  #include "rte_meter.h"
> > >  #include "rte_virtio_net.h"
> > > +#include "dpdk-i40e-ofld.h"
> > >
> > >  VLOG_DEFINE_THIS_MODULE(dpdk);
> > >  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> > > @@ -112,7 +113,7 @@ static char *vhost_sock_dir = NULL;   /* Location
> of
> > > vhost-user sockets */
> > >   */
> > >  #define VHOST_ENQ_RETRY_USECS 100
> > >
> > > -static const struct rte_eth_conf port_conf = {
> > > +static struct rte_eth_conf port_conf = {
> > >      .rxmode = {
> > >          .mq_mode = ETH_MQ_RX_RSS,
> > >          .split_hdr_size = 0,
> > > @@ -331,6 +332,9 @@ struct netdev_dpdk {
> > >
> > >      /* Identifier used to distinguish vhost devices from each other */
> > >      char vhost_id[PATH_MAX];
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +    bool i40e_ofld_enable; /* hardware/NIC offload flag*/ #endif
> > > +//DPDK_I40E_TNL_OFFLOAD_ENABLE
> > >
> > >      /* In dpdk_list. */
> > >      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex); @@ -346,6
> > > +350,24 @@ struct netdev_rxq_dpdk {
> > >      int port_id;
> > >  };
> > >
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE inline bool
> > > +is_i40e_ofld_enable(const struct netdev_dpdk *netdev) {
> > > +    return netdev->i40e_ofld_enable; }
> > > +
> > > +inline void set_i40e_ofld_flag(struct netdev_dpdk *netdev,
> > > +                                                bool flag) {
> > > +    netdev->i40e_ofld_enable = flag; }
> > > +
> > > +inline int get_dpdk_port_id(struct netdev_dpdk *dpdk_port) {
> > > +    return dpdk_port->port_id;
> > > +}
> > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +
> > >  static bool dpdk_thread_is_pmd(void);
> > >
> > >  static int netdev_dpdk_construct(struct netdev *); @@ -539,10
> > > +561,21 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
> > > n_rxq, int
> > > n_txq)
> > >              VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
> > >          }
> > >
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +        diag = (!dev->i40e_ofld_enable && dev->type == DPDK_DEV_ETH)
> ?
> > > +                    dpdk_eth_dev_hw_ofld_init(dev, n_rxq, n_txq, &port_conf)
> :
> > > +                    rte_eth_dev_configure(dev->port_id,
> > > +                    n_rxq, n_txq, &port_conf);
> > > +        if (diag) {
> > > +            /* rte_dev_configure error */
> > > +            break;
> > > +        }
> > > +#else
> > >          diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq,
> &port_conf);
> > >          if (diag) {
> > >              break;
> > >          }
> > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > >
> > >          for (i = 0; i < n_txq; i++) {
> > >              diag = rte_eth_tx_queue_setup(dev->port_id, i,
> > > NIC_PORT_TX_Q_SIZE, @@ -637,7 +670,7 @@ dpdk_eth_dev_init(struct
> > > netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
> > >      return 0;
> > >  }
> > >
> > > -static struct netdev_dpdk *
> > > +struct netdev_dpdk *
> > >  netdev_dpdk_cast(const struct netdev *netdev)  {
> > >      return CONTAINER_OF(netdev, struct netdev_dpdk, up); @@ -861,6
> > > +894,10 @@ netdev_dpdk_destruct(struct netdev *netdev_)
> > >      rte_free(dev->tx_q);
> > >      list_remove(&dev->list_node);
> > >      dpdk_mp_put(dev->dpdk_mp);
> > > +
> > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > +        dpdk_hw_ofld_port_release(dev); #endif /*
> > > +DPDK_I40E_TNL_OFFLOAD_ENABLE */
> > >      ovs_mutex_unlock(&dpdk_mutex);
> > >  }
> > >
> > > --
> > > 1.9.1
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > http://openvswitch.org/mailman/listinfo/dev
William Tu March 19, 2016, 1:49 a.m. UTC | #6
Hi Sugesh,

I saw that many Intel NICs have flow director support, such as the Intel 10G 82599.
Apart from i40e, do you think other NICs could also benefit from this
hardware offload patch? Thanks.

Regards,
William

On Fri, Mar 18, 2016 at 8:51 AM, Chandran, Sugesh <sugesh.chandran@intel.com
> wrote:

> Hi Ben
> Thank you for looking into this.
> We are working on a generic tunneling offloading APIs and more updates
> will follow.
>
> Regards
> _Sugesh
>
>
> > -----Original Message-----
> > From: Ben Pfaff [mailto:blp@ovn.org]
> > Sent: Friday, March 18, 2016 2:53 AM
> > To: Chandran, Sugesh <sugesh.chandran@intel.com>
> > Cc: dev@openvswitch.org
> > Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance
> > using DPDK flow director feature.
> >
> > This seems really, really specific to the particular NIC.  Can you add a
> generic
> > tunnel offload interface to DPDK?  What would that look like?
> >
> > On Thu, Mar 17, 2016 at 10:43:42PM +0000, Chandran, Sugesh wrote:
> > > Hi,
> > >
> > > This patch proposes an approach that uses Flow director feature on the
> > Intel Fortville NICs to boost the VxLAN tunneling performance. In our
> testing
> > we verified that the VxLAN performance is almost doubled with this patch.
> > > The solution programs the NIC to report the flow ID along with the
> VxLAN
> > packets, and it is matched by OVS in software. There may be corner cases
> > that needs to addressed in the approach, For eg:  There is a possibility
> of race
> > condition where NIC reports flow ID that may match on different flow in
> > OVS. This happen when a rule is evicted by a new rule with same flowID+
> > hash in the OVS software. The packets may hit on wrong new rule in OVS
> > until the flow get deleted in the hardware too.
> > >
> > > It is a hardware specific implementation (Only work with Intel
> Fortville
> > NICs) for now, however the proposal works with any programmable
> > NICs.This RFC proves that the OVS can offer very high speed tunneling
> > performance using flow programmability in NICs. I am looking for
> > comments/suggestions on adding this support(such as configuring, enable
> it
> > for all the programmable NICs and etc) in OVS userspace datapath for
> > improving the performance.
> > >
> > > Regards
> > > _Sugesh
> > >
> > >
> > > > -----Original Message-----
> > > > From: Chandran, Sugesh
> > > > Sent: Thursday, March 17, 2016 10:00 PM
> > > > To: dev@openvswitch.org
> > > > Cc: Chandran, Sugesh <sugesh.chandran@intel.com>
> > > > Subject: [RFC PATCH] tunneling: Improving vxlan performance using
> > > > DPDK flow director feature.
> > > >
> > > > Optimizing vxlan tunneling performance in userspace datapath using
> > > > flow director feature in Fortville NIC DPDK ports. OVS uses metadata
> > > > reported by NIC to improve the flow lookup performance on VxLAN
> > packets.
> > > >
> > > > Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
> > > > ---
> > > >  lib/automake.mk      |   2 +
> > > >  lib/dpdk-i40e-ofld.c | 266
> > > > +++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  lib/dpdk-i40e-ofld.h |  59 ++++++++++++
> > > >  lib/dpif-netdev.c    | 118 ++++++++++++++++++++++-
> > > >  lib/netdev-dpdk.c    |  41 +++++++-
> > > >  5 files changed, 481 insertions(+), 5 deletions(-)  create mode
> > > > 100644 lib/dpdk-i40e-ofld.c  create mode 100644 lib/dpdk-i40e-ofld.h
> > > >
> > > > diff --git a/lib/automake.mk b/lib/automake.mk index
> > > > 27a1669..da48479
> > > > 100644
> > > > --- a/lib/automake.mk
> > > > +++ b/lib/automake.mk
> > > > @@ -366,6 +366,8 @@ endif
> > > >
> > > >  if DPDK_NETDEV
> > > >  lib_libopenvswitch_la_SOURCES += \
> > > > +       lib/dpdk-i40e-ofld.c \
> > > > +       lib/dpdk-i40e-ofld.h \
> > > >         lib/netdev-dpdk.c \
> > > >         lib/netdev-dpdk.h
> > > >  endif
> > > > diff --git a/lib/dpdk-i40e-ofld.c b/lib/dpdk-i40e-ofld.c new file
> > > > mode 100644 index 0000000..3ea7084
> > > > --- /dev/null
> > > > +++ b/lib/dpdk-i40e-ofld.c
> > > > @@ -0,0 +1,266 @@
> > > > +/*
> > > > + * Copyright (c) 2016 Intel Corp.
> > > > + *
> > > > + * Licensed under the Apache License, Version 2.0 (the "License");
> > > > + * you may not use this file except in compliance with the License.
> > > > + * You may obtain a copy of the License at:
> > > > + *
> > > > + *     http://www.apache.org/licenses/LICENSE-2.0
> > > > + *
> > > > + * Unless required by applicable law or agreed to in writing,
> > > > +software
> > > > + * distributed under the License is distributed on an "AS IS"
> > > > +BASIS,
> > > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
> > or
> > > > implied.
> > > > + * See the License for the specific language governing permissions
> > > > + and
> > > > + * limitations under the License.
> > > > + */
> > > > +
> > > > +#include <config.h>
> > > > +
> > > > +#include "dpdk-i40e-ofld.h"
> > > > +#include "errno.h"
> > > > +#include "ovs-thread.h"
> > > > +#include "openvswitch/vlog.h"
> > > > +#include "netdev-provider.h"
> > > > +#include "rte_pci_dev_ids.h"
> > > > +#include "rte_ethdev.h"
> > > > +
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +VLOG_DEFINE_THIS_MODULE(dpdk_hw_ofld);
> > > > +
> > > > +#define VXLAN_DST_PORT          4789
> > > > +#define VXLAN_HLEN                  50
> > > > +#define MAX_FDIR_RULES          8000
> > > > +
> > > > +static uint32_t total_fdir_ids;
> > > > +static struct ovs_mutex hw_ofld_mutex = OVS_MUTEX_INITIALIZER;
> > > > +
> > > > +/*
> > > > + * Returns '0' if FDIR IDs reaches max limit. Only 8000 entries are
> > > > + * supported in FVL.
> > > > + */
> > > > +static inline uint32_t
> > > > +i40e_fdir_entry_cnt_inc(void)
> > > > +{
> > > > +    if (total_fdir_ids < MAX_FDIR_RULES) {
> > > > +        ovs_mutex_lock(&hw_ofld_mutex);
> > > > +        total_fdir_ids++;
> > > > +        ovs_mutex_unlock(&hw_ofld_mutex);
> > > > +        return (total_fdir_ids);
> > > > +    }
> > > > +    return 0;
> > > > +}
> > > > +
> > > > +static inline void
> > > > +i40e_fdir_entry_cnt_decr(void)
> > > > +{
> > > > +    ovs_mutex_lock(&hw_ofld_mutex);
> > > > +    total_fdir_ids ? total_fdir_ids-- : 0;
> > > > +    ovs_mutex_unlock(&hw_ofld_mutex); }
> > > > +
> > > > +/*
> > > > + * Release the hardware offloading functionality from the dpdk-port.
> > > > + */
> > > > +int
> > > > +dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port) {
> > > > +    ovs_mutex_lock(&hw_ofld_mutex);
> > > > +    set_i40e_ofld_flag(dpdk_port, 0);
> > > > +    ovs_mutex_unlock(&hw_ofld_mutex);
> > > > +    return 0;
> > > > +}
> > > > +
> > > > +int
> > > > +dpdk_eth_dev_hw_ofld_init(struct netdev_dpdk *dev,
> > > > +                                        int n_rxq, int n_txq,
> > > > +                                        struct rte_eth_conf
> > > > +*port_conf) {
> > > > +    int err = 0;
> > > > +    struct rte_eth_dev_info info;
> > > > +    uint16_t vendor_id, device_id;
> > > > +
> > > > +    rte_eth_dev_info_get(get_dpdk_port_id(dev), &info);
> > > > +    vendor_id = info.pci_dev->id.vendor_id;
> > > > +    device_id = info.pci_dev->id.device_id;
> > > > +    /* Configure vxlan offload only if its FVL NIC */
> > > > +    if (vendor_id != PCI_VENDOR_ID_INTEL || device_id !=
> > > > +                                            I40E_DEV_ID_SFP_XL710) {
> > > > +        ovs_mutex_lock(&hw_ofld_mutex);
> > > > +        set_i40e_ofld_flag(dev, 0);
> > > > +        ovs_mutex_unlock(&hw_ofld_mutex);
> > > > +        err = rte_eth_dev_configure(get_dpdk_port_id(dev),
> > > > +                                    n_rxq, n_txq, port_conf);
> > > > +        return err;
> > > > +    }
> > > > +    ovs_mutex_lock(&hw_ofld_mutex);
> > > > +    set_i40e_ofld_flag(dev, 1);
> > > > +    ovs_mutex_unlock(&hw_ofld_mutex);
> > > > +    /* Configure FVL FDIR VxLAN tunnel handing */
> > > > +    port_conf->fdir_conf.mode = RTE_FDIR_MODE_PERFECT;
> > > > +    port_conf->fdir_conf.flex_conf.nb_payloads = 1;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].type =
> > > > RTE_ETH_L4_PAYLOAD;
> > > > +    /* Need to initilize all the 16 flex bytes,no matter;
> > > > +     * what we really using, possibly a DPDK bug?? */
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[0] = 0;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[1] = 1;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[2] = 2;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[3] = 3;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[4] = 4;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[5] = 5;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[6] = 6;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[7] = 7;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[8] = 8;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[9] = 9;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[10] = 10;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[11] = 11;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[12] = 12;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[13] = 13;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[14] = 14;
> > > > +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[15] = 15;
> > > > +    err = rte_eth_dev_configure(get_dpdk_port_id(dev),
> > > > +                                n_rxq, n_txq, port_conf);
> > > > +    if (err) {
> > > > +        VLOG_ERR("Failed to configure DPDK port with hardware
> offload");
> > > > +        return err;
> > > > +    }
> > > > +    /*Clean all FDIR entries if any */
> > > > +    err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(dev),
> > > > +            RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_FLUSH, NULL);
> > > > +    return err;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Install rules for VxLAN packets in hardware  */ int
> > > > +set_up_hw_offload_port_rule(struct netdev *netdev__,
> > > > +                                const struct flow *flow,
> > > > +                                const uint32_t hw_flow_id,
> > > > +                                const bool is_add_rule) {
> > > > +    int err = 0;
> > > > +    uint8_t flexbytes[RTE_ETH_FDIR_MAX_FLEXLEN] = { 0 };
> > > > +    uint32_t *vni;
> > > > +    enum rte_filter_op filter_op;
> > > > +    struct rte_eth_fdir_filter entry = { 0 };
> > > > +    struct netdev_dpdk *netdev;
> > > > +
> > > > +    netdev = netdev_dpdk_cast(netdev__);
> > > > +    if (is_i40e_ofld_enable(netdev)) {
> > > > +        entry.soft_id = hw_flow_id;
> > > > +        if (!entry.soft_id) {
> > > > +            VLOG_DBG("Invalid flow ID, Cant install rule in the NIC
> for "
> > > > +                             "hardware offload");
> > > > +            err = ECANCELED;
> > > > +            return err;
> > > > +        }
> > > > +        /* Install rules in NIC only for VxLAN flows */
> > > > +        if (ntohs(flow->tp_dst) != VXLAN_DST_PORT) {
> > > > +            return 0;
> > > > +        }
> > > > +        entry.action.flex_off = 0;  /* use 0 by default */
> > > > +        entry.input.flow_ext.vlan_tci = 0; //! ignored by i40e fdir
> > > > +        entry.action.behavior = RTE_ETH_FDIR_PASSTHRU;
> > > > +        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID_FLEX_4;
> > > > +        entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
> > > > +        entry.input.flow.ip4_flow.src_ip = flow->nw_src;
> > > > +        entry.input.flow.ip4_flow.dst_ip = flow->nw_dst;
> > > > +        entry.input.flow.udp4_flow.dst_port = htons(VXLAN_DST_PORT);
> > > > +        entry.input.flow.udp4_flow.src_port = flow->tp_src;
> > > > +        vni = (uint32_t *)&flexbytes[4];
> > > > +        *vni = flow->tunnel.tun_id << 8;
> > > > +        memcpy(entry.input.flow_ext.flexbytes, flexbytes,
> > > > +                      RTE_ETH_FDIR_MAX_FLEXLEN);
> > > > +        entry.action.rx_queue = 0;
> > > > +        filter_op = is_add_rule ? RTE_ETH_FILTER_ADD :
> > > > +                                              RTE_ETH_FILTER_DELETE;
> > > > +        err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(netdev),
> > > > +                 RTE_ETH_FILTER_FDIR, filter_op, &entry);
> > > > +
> > > > +        /*
> > > > +         * XXX : Delayed the max limit check for flow director
> entries after
> > > > +         * the configuration. Anyway the rte_eth_dev_filter_ctrl
> will fail if
> > > > +         * max limit reaches. This can be used for tracking.
> > > > +         */
> > > > +        if (is_add_rule) {
> > > > +            if (!i40e_fdir_entry_cnt_inc()) {
> > > > +                VLOG_DBG("Cant configure rule on NIC, Flow director
> "
> > > > +                        "entries hits max limit");
> > > > +            }
> > > > +        }
> > > > +        else {
> > > > +            i40e_fdir_entry_cnt_decr();
> > > > +        }
> > > > +        if (err < 0) {
> > > > +            VLOG_DBG("flow director programming error in NIC:
> (%d)\n",
> > err);
> > > > +            return err;
> > > > +        }
> > > > +    }
> > > > +    return err;
> > > > +}
> > > > +
> > > > +static int
> > > > +i40e_dpdk_port_get_hw_ofld_pkts(struct
> > > > +                 dp_netdev_pmd_thread *pmd, struct dp_packet
> > > > +                 **in_packets, struct dp_packet **hw_packets,
> > > > +                 struct dp_packet **non_hw_packets,
> > > > +                 uint32_t cnt)
> > > > +{
> > > > +    int i, hw_pkt_cnt = 0, norm_pkt_cnt = 0;
> > > > +    const struct dp_netdev_flow *flow;
> > > > +    struct rte_mbuf *mbuf;
> > > > +
> > > > +    for (i = 0; i < cnt; i++) {
> > > > +        mbuf = (struct rte_mbuf *)in_packets[i];
> > > > +        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
> > > > +            flow = lookup_hw_offload_flow_for_fdirid(pmd, mbuf,
> > > > +
>  mbuf->hash.fdir.hi);
> > > > +            if (!flow) {
> > > > +                /* Bogus flow in hw, cannot find it in OVS EMC */
> > > > +                mbuf->ol_flags &= ~PKT_RX_FDIR_ID;
> > > > +                non_hw_packets[norm_pkt_cnt++] = in_packets[i];
> > > > +                continue;
> > > > +            }
> > > > +            dp_packet_reset_packet(in_packets[i], VXLAN_HLEN);
> > > > +            mbuf->ol_flags |= PKT_RX_RSS_HASH;
> > > > +            mbuf->hash.rss = hash_finish(mbuf->hash.rss, 1);
> > > > +            hw_packets[hw_pkt_cnt++] = in_packets[i];
> > > > +        }
> > > > +        else {
> > > > +            non_hw_packets[norm_pkt_cnt++] = in_packets[i];
> > > > +        }
> > > > +    }
> > > > +    return hw_pkt_cnt;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Process the packets based on hardware offload configuration  */
> > > > +void hw_ofld_dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> > > > +                             struct netdev_rxq *netdev_rxq,
> > > > +                             struct dp_packet **packets, int cnt,
> > > > +                             odp_port_t port_no) {
> > > > +    int hw_pkt_cnt;
> > > > +    struct dp_packet *hw_ofld_packets[NETDEV_MAX_BURST] = { 0 };
> > > > +    struct dp_packet *orig_packets[NETDEV_MAX_BURST] = { 0 };
> > > > +    struct netdev_dpdk *netdev =
> > > > +netdev_dpdk_cast(netdev_rxq->netdev);
> > > > +
> > > > +    if (is_i40e_ofld_enable(netdev)) {
> > > > +        hw_pkt_cnt = i40e_dpdk_port_get_hw_ofld_pkts(pmd, packets,
> > > > +
> hw_ofld_packets,
> > > > +
> orig_packets, cnt);
> > > > +        /* Process packet streams separately. */
> > > > +        if (hw_pkt_cnt) {
> > > > +            dp_netdev_input(pmd, hw_ofld_packets, hw_pkt_cnt,
> port_no);
> > > > +        }
> > > > +        if (cnt - hw_pkt_cnt) {
> > > > +            dp_netdev_input(pmd, orig_packets, (cnt - hw_pkt_cnt),
> > port_no);
> > > > +        }
> > > > +    }
> > > > +    else {
> > > > +        dp_netdev_input(pmd, packets, cnt, port_no);
> > > > +    }
> > > > +}
> > > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > diff --git a/lib/dpdk-i40e-ofld.h b/lib/dpdk-i40e-ofld.h new file
> > > > mode 100644 index 0000000..1aad246
> > > > --- /dev/null
> > > > +++ b/lib/dpdk-i40e-ofld.h
> > > > @@ -0,0 +1,59 @@
> > > > +/*
> > > > + * Copyright (c) 2016 Intel Corp.
> > > > + *
> > > > + * Licensed under the Apache License, Version 2.0 (the "License");
> > > > + * you may not use this file except in compliance with the License.
> > > > + * You may obtain a copy of the License at:
> > > > + *
> > > > + *     http://www.apache.org/licenses/LICENSE-2.0
> > > > + *
> > > > + * Unless required by applicable law or agreed to in writing,
> > > > +software
> > > > + * distributed under the License is distributed on an "AS IS"
> > > > +BASIS,
> > > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
> > or
> > > > implied.
> > > > + * See the License for the specific language governing permissions
> > > > + and
> > > > + * limitations under the License.
> > > > + */
> > > > +
> > > > +#ifndef DPDK_I40E_OFLD_H_
> > > > +#define DPDK_I40E_OFLD_H_
> > > > +
> > > > +#include <config.h>
> > > > +
> > > > +#include "dp-packet.h"
> > > > +#include "netdev.h"
> > > > +
> > > > +/*
> > > > + * Macro to enable/disable HW OFFLOAD feature for DPDK.
> > > > + * 1 :- Enable HW_OFFLOAD support in OVS
> > > > + * 0 :- Disable HW_OFFLOAD support in OVS  */
> > > > +#define DPDK_I40E_TNL_OFFLOAD_ENABLE        1
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +
> > > > +struct netdev_dpdk;
> > > > +struct dp_netdev_pmd_thread;
> > > > +struct dp_netdev_flow;
> > > > +
> > > > +struct netdev_dpdk *netdev_dpdk_cast(const struct netdev *netdev);
> > > > +extern inline bool is_i40e_ofld_enable(const struct netdev_dpdk
> > > > +*netdev); extern inline void set_i40e_ofld_flag(struct netdev_dpdk
> > > > +*netdev, bool flag); extern inline int get_dpdk_port_id(struct
> > > > +netdev_dpdk *dpdk_port); int dpdk_eth_dev_hw_ofld_init(struct
> > > > netdev_dpdk *dev, int n_rxq, int n_txq,
> > > > +                              struct rte_eth_conf *port_conf); int
> > > > +dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port); int
> > > > +set_up_hw_offload_port_rule(struct netdev *netdev__,
> > > > +                                const struct flow *flow,
> > > > +                                const uint32_t hw_flow_id,
> > > > +                                const bool is_add_rule); void
> > > > +hw_ofld_dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> > > > +                             struct netdev_rxq *netdev_rxq,
> > > > +                             struct dp_packet **packets, int cnt,
> > > > +                             odp_port_t port_no); const struct
> > > > +dp_netdev_flow *lookup_hw_offload_flow_for_fdirid(
> > > > +                            const struct dp_netdev_pmd_thread *pmd,
> > > > +                            struct rte_mbuf *mbuf, uint32_t
> > > > +flow_id); void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > > > +struct dp_packet
> > > > **,
> > > > +                     int cnt, odp_port_t port_no);
> > > > +
> > > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE #endif /*
> > DPDK_I40E_OFLD_H_
> > > > +*/
> > > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
> > > > cf574ad..d79b239
> > > > 100644
> > > > --- a/lib/dpif-netdev.c
> > > > +++ b/lib/dpif-netdev.c
> > > > @@ -70,6 +70,7 @@
> > > >  #include "util.h"
> > > >
> > > >  #include "openvswitch/vlog.h"
> > > > +#include "dpdk-i40e-ofld.h"
> > > >
> > > >  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
> > > >
> > > > @@ -478,7 +479,7 @@ static void dp_netdev_execute_actions(struct
> > > > dp_netdev_pmd_thread *pmd,
> > > >                                        bool may_steal,
> > > >                                        const struct nlattr *actions,
> > > >                                        size_t actions_len); -static
> > > > void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > > > +void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > > >                              struct dp_packet **, int cnt,
> > > > odp_port_t port_no);  static void dp_netdev_recirculate(struct
> > dp_netdev_pmd_thread *,
> > > >                                    struct dp_packet **, int cnt); @@
> > > > -1455,6 +1456,28 @@ dp_netdev_pmd_remove_flow(struct
> > dp_netdev_pmd_thread *pmd,
> > > >      flow->dead = true;
> > > >
> > > >      dp_netdev_flow_unref(flow);
> > > > +
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +    struct dp_netdev_port *dp_port;
> > > > +    int err;
> > > > +    odp_port_t in_port = flow->flow.in_port.odp_port;
> > > > +    err = get_port_by_number(pmd->dp, in_port, &dp_port);
> > > > +    if (err) {
> > > > +        VLOG_WARN("Cannot get the port information, hardware
> > > > +offload may
> > > > "
> > > > +                "not be functional");
> > > > +        return;
> > > > +    }
> > > > +    if(strcmp(dp_port->type, "dpdk")) {
> > > > +        /* No hardware offload on a non-DPDK port") */
> > > > +        return;
> > > > +    }
> > > > +    /* Remove the hardware offload rule if exists.*/
> > > > +    if(set_up_hw_offload_port_rule(dp_port->netdev, &flow->flow,
> > > > +            dp_netdev_flow_hash(&(flow->ufid)), 0)) {
> > > > +        VLOG_DBG("Failed to delete the hardware offload rule");
> > > > +        return;
> > > > +    }
> > > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > >  }
> > > >
> > > >  static void
> > > > @@ -2059,6 +2082,32 @@ dp_netdev_flow_add(struct
> > > > dp_netdev_pmd_thread *pmd,
> > > >          ds_destroy(&ds);
> > > >      }
> > > >
> > > > +    /*
> > > > +     * Configure the hardware offload for tunnel while flows are
> getting
> > > > +     * inserted in OVS.
> > > > +     */
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +    struct dp_netdev_port *dp_port;
> > > > +    int err;
> > > > +    odp_port_t in_port = flow->flow.in_port.odp_port;
> > > > +    err = get_port_by_number(pmd->dp, in_port, &dp_port);
> > > > +    if (err) {
> > > > +        VLOG_WARN("Cannot get the port information, Failed to
> configure
> > "
> > > > +                            "hardware offload");
> > > > +        goto out;
> > > > +    }
> > > > +    if (strcmp(dp_port->type, "dpdk")) {
> > > > +        /* No hardware offload on a non-DPDK port */
> > > > +        goto out;
> > > > +    }
> > > > +    /* install the rule in hw, reduntant might overwrite if it
> exists*/
> > > > +    if (set_up_hw_offload_port_rule(dp_port->netdev, &flow->flow,
> > > > +            dp_netdev_flow_hash(&flow->ufid), 1)) {
> > > > +        VLOG_ERR("Failed to install the hardware offload rule");
> > > > +        goto out;
> > > > +    }
> > > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +out:
> > > >      return flow;
> > > >  }
> > > >
> > > > @@ -2575,7 +2624,19 @@ dp_netdev_process_rxq_port(struct
> > > > dp_netdev_pmd_thread *pmd,
> > > >          *recirc_depth_get() = 0;
> > > >
> > > >          cycles_count_start(pmd);
> > > > +
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +        /* Check if the source port is DPDK */
> > > > +        if (packets[0]->source == DPBUF_DPDK) {
> > > > +            hw_ofld_dp_netdev_input(pmd, rxq, packets, cnt, port-
> > >port_no);
> > > > +        }
> > > > +        else {
> > > > +            dp_netdev_input(pmd, packets, cnt, port->port_no);
> > > > +        }
> > > > +#else
> > > >          dp_netdev_input(pmd, packets, cnt, port->port_no);
> > > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +
> > > >          cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
> > > >      } else if (error != EAGAIN && error != EOPNOTSUPP) {
> > > >          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1,
> > > > 5); @@ -
> > > > 3321,7 +3382,6 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread
> > *pmd,
> > > > struct dp_packet *packet_,
> > > >          flow->tunnel.metadata.present.len =
> > > > orig_tunnel.metadata.present.len;
> > > >          flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
> > > >      }
> > > > -
> > > >      return err;
> > > >  }
> > > >
> > > > @@ -3430,6 +3490,7 @@ emc_processing(struct
> > dp_netdev_pmd_thread
> > > > *pmd, struct dp_packet **packets,
> > > >      struct emc_cache *flow_cache = &pmd->flow_cache;
> > > >      struct netdev_flow_key *key = &keys[0];
> > > >      size_t i, n_missed = 0, n_dropped = 0;
> > > > +    struct rte_mbuf *mbuf;
> > > >
> > > >      for (i = 0; i < cnt; i++) {
> > > >          struct dp_netdev_flow *flow; @@ -3454,7 +3515,18 @@
> > > > emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet
> > > > **packets,
> > > >          key->len = 0; /* Not computed yet. */
> > > >          key->hash = dpif_netdev_packet_get_rss_hash(packet,
> > > > &key->mf);
> > > >
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +        mbuf = (struct rte_mbuf *)packet;
> > > > +        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
> > > > +            flow = lookup_hw_offload_flow_for_fdirid(pmd, mbuf, 0);
> > > > +        }
> > > > +        else {
> > > > +            flow = emc_lookup(flow_cache, key);
> > > > +        }
> > > > +#else
> > > >          flow = emc_lookup(flow_cache, key);
> > > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +
> > > >          if (OVS_LIKELY(flow)) {
> > > >              dp_netdev_queue_batches(packet, flow, &key->mf, batches,
> > > >                                      n_batches); @@ -3651,7 +3723,7
> > > > @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
> > > >      }
> > > >  }
> > > >
> > > > -static void
> > > > +void
> > > >  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> > > >                  struct dp_packet **packets, int cnt,
> > > >                  odp_port_t port_no) @@ -4290,3 +4362,43 @@
> > > > dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key
> > > > keys[],
> > > >      }
> > > >      return false;                     /* Some misses. */
> > > >  }
> > > > +
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +/*
> > > > + * EMC lookup function on 'flow id' reported by NIC.
> > > > + */
> > > > +const struct dp_netdev_flow *
> > > > +lookup_hw_offload_flow_for_fdirid(const struct
> > > > +                 dp_netdev_pmd_thread *pmd, struct rte_mbuf *mbuf,
> > > > +                 uint32_t flow_id)
> > > > +{
> > > > +    const struct emc_cache *flow_cache = &pmd->flow_cache;
> > > > +    struct netdev_flow_key key;
> > > > +    struct emc_entry *current_entry;
> > > > +
> > > > +    key.len = 0;
> > > > +    if (OVS_LIKELY(mbuf->ol_flags & PKT_RX_RSS_HASH)) {
> > > > +        key.hash = mbuf->hash.rss;
> > > > +    }
> > > > +    else {
> > > > +        return NULL;
> > > > +    }
> > > > +    EMC_FOR_EACH_POS_WITH_HASH(flow_cache, current_entry,
> > > > key.hash) {
> > > > +        if (current_entry->key.hash == key.hash
> > > > +            && emc_entry_alive(current_entry)) {
> > > > +            if (OVS_UNLIKELY(flow_id && dp_netdev_flow_hash(
> > > > +                                       &current_entry->flow->ufid)
> !=
> > > > +                                       flow_id)) {
> > > > +                /* Hash collision in emc, fallback to software path
> */
> > > > +                return NULL;
> > > > +            }
> > > > +            return current_entry->flow;
> > > > +        }
> > > > +    }
> > > > +    /* XXX :: An improved classifier lookup needed here without any
> > miniflow
> > > > +     * extract to keep it performant.Until then fallback to
> software based
> > > > +     * packet forwarding on EMC miss.
> > > > +     */
> > > > +     return NULL;
> > > > +}
> > > > +#endif /* DPDK_I40E_TNL_OFFLOAD_ENABLE */
> > > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> > > > f402354..2954f83
> > > > 100644
> > > > --- a/lib/netdev-dpdk.c
> > > > +++ b/lib/netdev-dpdk.c
> > > > @@ -56,6 +56,7 @@
> > > >  #include "rte_mbuf.h"
> > > >  #include "rte_meter.h"
> > > >  #include "rte_virtio_net.h"
> > > > +#include "dpdk-i40e-ofld.h"
> > > >
> > > >  VLOG_DEFINE_THIS_MODULE(dpdk);
> > > >  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> > > > @@ -112,7 +113,7 @@ static char *vhost_sock_dir = NULL;   /* Location
> > of
> > > > vhost-user sockets */
> > > >   */
> > > >  #define VHOST_ENQ_RETRY_USECS 100
> > > >
> > > > -static const struct rte_eth_conf port_conf = {
> > > > +static struct rte_eth_conf port_conf = {
> > > >      .rxmode = {
> > > >          .mq_mode = ETH_MQ_RX_RSS,
> > > >          .split_hdr_size = 0,
> > > > @@ -331,6 +332,9 @@ struct netdev_dpdk {
> > > >
> > > >      /* Identifier used to distinguish vhost devices from each other
> */
> > > >      char vhost_id[PATH_MAX];
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +    bool i40e_ofld_enable; /* hardware/NIC offload flag*/ #endif
> > > > +//DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > >
> > > >      /* In dpdk_list. */
> > > >      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex); @@ -346,6
> > > > +350,24 @@ struct netdev_rxq_dpdk {
> > > >      int port_id;
> > > >  };
> > > >
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE inline bool
> > > > +is_i40e_ofld_enable(const struct netdev_dpdk *netdev) {
> > > > +    return netdev->i40e_ofld_enable; }
> > > > +
> > > > +inline void set_i40e_ofld_flag(struct netdev_dpdk *netdev,
> > > > +                                                bool flag) {
> > > > +    netdev->i40e_ofld_enable = flag; }
> > > > +
> > > > +inline int get_dpdk_port_id(struct netdev_dpdk *dpdk_port) {
> > > > +    return dpdk_port->port_id;
> > > > +}
> > > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +
> > > >  static bool dpdk_thread_is_pmd(void);
> > > >
> > > >  static int netdev_dpdk_construct(struct netdev *); @@ -539,10
> > > > +561,21 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
> > > > n_rxq, int
> > > > n_txq)
> > > >              VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq,
> n_txq);
> > > >          }
> > > >
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +        diag = (!dev->i40e_ofld_enable && dev->type == DPDK_DEV_ETH)
> > ?
> > > > +                    dpdk_eth_dev_hw_ofld_init(dev, n_rxq, n_txq,
> &port_conf)
> > :
> > > > +                    rte_eth_dev_configure(dev->port_id,
> > > > +                    n_rxq, n_txq, &port_conf);
> > > > +        if (diag) {
> > > > +            /* rte_dev_configure error */
> > > > +            break;
> > > > +        }
> > > > +#else
> > > >          diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq,
> > &port_conf);
> > > >          if (diag) {
> > > >              break;
> > > >          }
> > > > +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > >
> > > >          for (i = 0; i < n_txq; i++) {
> > > >              diag = rte_eth_tx_queue_setup(dev->port_id, i,
> > > > NIC_PORT_TX_Q_SIZE, @@ -637,7 +670,7 @@ dpdk_eth_dev_init(struct
> > > > netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
> > > >      return 0;
> > > >  }
> > > >
> > > > -static struct netdev_dpdk *
> > > > +struct netdev_dpdk *
> > > >  netdev_dpdk_cast(const struct netdev *netdev)  {
> > > >      return CONTAINER_OF(netdev, struct netdev_dpdk, up); @@ -861,6
> > > > +894,10 @@ netdev_dpdk_destruct(struct netdev *netdev_)
> > > >      rte_free(dev->tx_q);
> > > >      list_remove(&dev->list_node);
> > > >      dpdk_mp_put(dev->dpdk_mp);
> > > > +
> > > > +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> > > > +        dpdk_hw_ofld_port_release(dev); #endif /*
> > > > +DPDK_I40E_TNL_OFFLOAD_ENABLE */
> > > >      ovs_mutex_unlock(&dpdk_mutex);
> > > >  }
> > > >
> > > > --
> > > > 1.9.1
> > >
> > > _______________________________________________
> > > dev mailing list
> > > dev@openvswitch.org
> > > http://openvswitch.org/mailman/listinfo/dev
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>
Chandran, Sugesh March 22, 2016, 6:06 p.m. UTC | #7
Hi William,
Not all NICs with flow director support can benefit from this patch, because of flow matching limitations in the NIC.
The offload patch programs the NIC to match on tunnel + header fields (L2 + L3 + L4 + tunnel header). The XL710 supports filtering on a flexible 16-byte field after the L4 header, which is used here to match VxLAN header fields such as the tenant ID (VNI). The 10G 82599 NIC, however, supports only a 2-byte flex field, which is not enough to match on the tunnel header.
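
To make that concrete, here is roughly how the patch carries the VNI in the fdir flex bytes (a sketch based on set_up_hw_offload_port_rule() in this patch; the VNI sits at bytes 4..6 of the VxLAN header, i.e. of the L4 payload, so a 2-byte flex field cannot reach it):

    /* Sketch: put the 24-bit VNI into the flex bytes of the fdir filter. */
    uint8_t flexbytes[RTE_ETH_FDIR_MAX_FLEXLEN] = { 0 };
    uint32_t *vni = (uint32_t *)&flexbytes[4];   /* VNI starts at payload byte 4. */

    *vni = flow->tunnel.tun_id << 8;             /* 24-bit VNI. */
    memcpy(entry.input.flow_ext.flexbytes, flexbytes, RTE_ETH_FDIR_MAX_FLEXLEN);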


Regards
_Sugesh

From: William Tu [mailto:u9012063@gmail.com]

Sent: Saturday, March 19, 2016 1:50 AM
To: Chandran, Sugesh <sugesh.chandran@intel.com>
Cc: Ben Pfaff <blp@ovn.org>; dev@openvswitch.org
Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance using DPDK flow director feature.

Hi Sugesh,
I saw that many Intel NICs have flow director support, such as the Intel 10G 82599. Besides i40e, do you think other NICs could also benefit from this hardware offload patch? Thanks.
Regards,
William

On Fri, Mar 18, 2016 at 8:51 AM, Chandran, Sugesh <sugesh.chandran@intel.com> wrote:
Hi Ben
Thank you for looking into this.
We are working on generic tunnel offloading APIs and more updates will follow.
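
Purely as a discussion aid, a vendor-neutral hook on the OVS netdev side might look something like the sketch below. Nothing like this exists in DPDK or OVS today; every name and signature here is made up and is only meant to show the shape of the interface we have in mind:

    /* Hypothetical, vendor-neutral tunnel offload hook (illustrative only). */
    struct netdev_tnl_offload_rule {
        struct flow match;      /* Outer/tunnel fields to match in hardware. */
        uint32_t flow_mark;     /* Software-defined ID reported with packets. */
    };

    struct netdev_tnl_offload_ops {
        int (*rule_add)(struct netdev *netdev,
                        const struct netdev_tnl_offload_rule *rule);
        int (*rule_del)(struct netdev *netdev,
                        const struct netdev_tnl_offload_rule *rule);
    };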

Regards
_Sugesh


> -----Original Message-----
> From: Ben Pfaff [mailto:blp@ovn.org]
> Sent: Friday, March 18, 2016 2:53 AM
> To: Chandran, Sugesh <sugesh.chandran@intel.com>
> Cc: dev@openvswitch.org
> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance
> using DPDK flow director feature.
>
> This seems really, really specific to the particular NIC.  Can you add a generic
> tunnel offload interface to DPDK?  What would that look like?
>

Jesse Gross March 25, 2016, 12:38 a.m. UTC | #8
On Fri, Mar 18, 2016 at 8:50 AM, Chandran, Sugesh
<sugesh.chandran@intel.com> wrote:
> Hi Jesse,
> Please find my answers inline.
>
> Regards
> _Sugesh
>
>
>> -----Original Message-----
>> From: Jesse Gross [mailto:jesse@kernel.org]
>> Sent: Thursday, March 17, 2016 11:50 PM
>> To: Chandran, Sugesh <sugesh.chandran@intel.com>
>> Cc: dev@openvswitch.org
>> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance
>> using DPDK flow director feature.
>>
>> On Thu, Mar 17, 2016 at 3:43 PM, Chandran, Sugesh
>> <sugesh.chandran@intel.com> wrote:
>> > Hi,
>> >
>> > This patch proposes an approach that uses Flow director feature on the
>> Intel Fortville NICs to boost the VxLAN tunneling performance. In our testing
>> we verified that the VxLAN performance is almost doubled with this patch.
>> > The solution programs the NIC to report the flow ID along with the VxLAN
>> packets, and it is matched by OVS in software. There may be corner cases
>> that needs to addressed in the approach, For eg:  There is a possibility of race
>> condition where NIC reports flow ID that may match on different flow in
>> OVS. This happen when a rule is evicted by a new rule with same flowID+
>> hash in the OVS software. The packets may hit on wrong new rule in OVS
>> until the flow get deleted in the hardware too.
>> >
>> > It is a hardware specific implementation (Only work with Intel Fortville
>> NICs) for now, however the proposal works with any programmable
>> NICs.This RFC proves that the OVS can offer very high speed tunneling
>> performance using flow programmability in NICs. I am looking for
>> comments/suggestions on adding this support(such as configuring, enable it
>> for all the programmable NICs and etc) in OVS userspace datapath for
>> improving the performance.
>>
>> This is definitely very interesting to see. Can you post some more specific
>> performance numbers?
> [Sugesh]
> VxLAN DECAP performance(Unidirectional, Single flow, Single CPU Core)
> -------------------------------------------------------------------
> PKT-IN - 9.3 Mpps
> Pkt size - 114 byte VxLAN Packets(64 byte payload)
> PKT-OUT - 5.6 Mpps( Without Optimization)
> PKT-OUT - 9.3 Mpps(After the optimization, It hits the Input Line rate)
>
> VxLAN ENCAP-DECAP performance (Bidirectional, Single CPU Core)
> ---------------------------------------------------------------------------------
> PKT-IN - 9.3 Mpps, PKT SIZE - 114 Byte VxLAN Packets (64 Byte payload) -->
> PKT-IN - 14 Mpps, PKT SIZE - 64 Byte UDP packets <--
>
> PKT-OUT - 3.6 Mpps(Without Optimization)
> PKT-OUT - 5.3 Mpps(Using the patch)

Thanks, that is interesting to see, particularly for a gateway-type
use case where an appliance is translating between encapsulated and
non-encapsulated packets.

>> Is this really specific to VXLAN? I'm sure that it could be generalized to other
>> tunneling protocols (Geneve would be nice given that OVN is using it and I
>> know Fortville supports it). But shouldn't it apply to non-tunneled traffic as
>> well?
> Yes, this can be applied for any tunneling protocol provided the NIC
> hardware is programmed to handle those packets.
> We haven’t tested it for non-tunneled packets. The performance improvement on
> non-tunneled packets are subjective due to the fact that there
> is a limitation on number of hardware flows(8K on FVL), and software still has to
> spend cycles on matching the flow IDs reported by hardware.  This improves the
> tunneling performance in all the cases, because it tunnel packets needs two lookup than one.

Looking at the code some more, I think there are basically two sources
of optimization here:
 * Accelerating the EMC by avoiding netdev_flow_key_equal_mf() on the
assumption that the rule you've installed points exactly to the
correct flow. However, I don't think this is legal because the flows
that you are programming the hardware with don't capture the full set
of values in an OVS flow. For example, in the case of tunnels, there
is no match on DMAC.
 * Chaining together the multiple lookups used by tunnels on the
assumption that the outer VXLAN source port distinguishes the inner
flow. This would allow avoiding netdev_flow_key_equal_mf() a second
time. This is definitely not legal because the VXLAN source port is
only capturing a small subset of the total data that OVS is using.

Please correct me if I am wrong.
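
To make the comparison concrete, here are the two EMC hit conditions side by side (just a sketch; the exact netdev_flow_key_equal_mf() arguments are from memory, and the second fragment is condensed from lookup_hw_offload_flow_for_fdirid() in the patch):

    /* Normal EMC hit: a hash match alone is not enough, the full miniflow
     * of the packet must also match the stored key. */
    if (current_entry->key.hash == key->hash
        && emc_entry_alive(current_entry)
        && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
        return current_entry->flow;
    }

    /* Hardware-assisted hit in this patch: only the RSS hash and the
     * NIC-reported flow ID are checked, so any field that is not encoded
     * in the fdir rule (the DMAC, for example) is never verified. */
    if (current_entry->key.hash == mbuf->hash.rss
        && emc_entry_alive(current_entry)
        && dp_netdev_flow_hash(&current_entry->flow->ufid) == flow_id) {
        return current_entry->flow;
    }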

I'm not sure that I really see any advantage in using a Flow Director
perfect filter to return a software defined hash value compared to
just using the RSS hash directly as we are doing today. I think the
main case where it would be useful is if hardware wildcarding was used
to skip the EMC altogether and its size constraints. If that was done
then I think that this would no longer be specialized to VXLAN at all.

>> It looks like this is adding a hardware flow when a new flow is added to the
>> datapath. How does this affect flow setup performance?
>>
> We haven’t performed any stress tests with so many flows to verify the
> flow setup performance. What is the expectation here? Currently how many rules can be
> setup per second in OVS ?

It's hard to give a concrete number here since flow setup performance
depends on the complexity of the flow table and, of course, the
machine. In general, the goal is to avoid needing to do flow setups in
response to traffic but this depends on the use case. At a minimum, it
would be good to understand the difference in performance as a result
of this change and try to minimize any impact. Since this is really
just a hint and we'll need to deal with mismatch between software and
hardware in any case, perhaps it makes sense to program the hardware
flows asynchronously.
Chandran, Sugesh March 29, 2016, 7:43 a.m. UTC | #9
Regards
_Sugesh

> -----Original Message-----
> From: Jesse Gross [mailto:jesse@kernel.org]
> Sent: Friday, March 25, 2016 12:38 AM
> To: Chandran, Sugesh <sugesh.chandran@intel.com>
> Cc: dev@openvswitch.org
> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance
> using DPDK flow director feature.
>


> Looking at the code some more, I think there are basically two sources of
> optimization here:
>  * Accelerating the EMC by avoiding netdev_flow_key_equal_mf() on the
> assumption that the rule you've installed points exactly to the correct flow.
> However, I don't think this is legal because the flows that you are
> programming the hardware with don't capture the full set of values in an OVS
> flow. For example, in the case of tunnels, there is no match on DMAC.

[Sugesh] We can program the hardware to match on all the fields that we want,
including the tunnel fields in the outer header.

>  * Chaining together the multiple lookups used by tunnels on the assumption
> that the outer VXLAN source port distinguishes the inner flow. This would
> allow avoiding netdev_flow_key_equal_mf() a second time. This is definitely
> not legal because the VXLAN source port is only capturing a small subset of
> the total data that OVS is using.

[Sugesh] From our analysis we found that optimizing just one lookup gives no
significant performance boost when compared with the overhead. This is due to the
fact that the second netdev_flow_key_equal_mf() still needs the tunnel information
to match on a flow. We found in our tests that most CPU cycles are spent on
extracting header fields from the packets rather than on the lookup itself.

The proposal is to avoid the header field extraction by using an additional, unique
software flow ID to match on. The two flows for a tunnel are marked with this ID when
they are installed in the EMC. The hardware reports this ID along with the hash (to
mitigate hash collisions in the EMC) for every incoming packet that matches a hardware
rule. This is used in the EMC, together with the hash, to find the flow; currently OVS
compares the hash + key (built from the header fields) to match a flow. The inner flow
matches on the same unique ID and a hardware-flow flag rather than on the source port.
We have modified the code a little bit more, so that it saves the hardware ID in the
matching flow on every emc_insert():


         emc_insert(flow_cache, &keys[i], flow);
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+        struct rte_mbuf *mbuf = (struct rte_mbuf *)packet;
+        flow->hw_rule_id = (mbuf->ol_flags & PKT_RX_FDIR_ID) ?
+                           mbuf->hash.fdir.hi : 0;
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
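
For completeness, the lookup side would then boil down to something like the sketch
below (illustrative only; 'hw_rule_id' is the field added in the snippet above and is
not in the current tree, and the helper name is made up):

    /* Sketch: EMC hit keyed on the NIC-reported flow ID instead of the
     * miniflow, so no header extraction or key comparison is needed. */
    static inline struct dp_netdev_flow *
    emc_lookup_by_fdir_id(const struct emc_cache *cache, uint32_t rss_hash,
                          uint32_t fdir_id)
    {
        struct emc_entry *entry;

        EMC_FOR_EACH_POS_WITH_HASH(cache, entry, rss_hash) {
            if (entry->key.hash == rss_hash && emc_entry_alive(entry)
                && entry->flow->hw_rule_id == fdir_id) {
                return entry->flow;
            }
        }
        return NULL;    /* Fall back to the normal software path. */
    }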
>
> Please correct me if I am wrong.
>
> I'm not sure that I really see any advantage in using a Flow Director perfect
> filter to return a software defined hash value compared to just using the RSS
> hash directly as we are doing today. I think the main case where it would be
> useful is if hardware wildcarding was used to skip the EMC altogether and its
> size constraints. If that was done then I think that this would no longer be
> specialized to VXLAN at all.

[Sugesh] This may give a performance improvement when we have a large set of rules that
overflows the EMC. But a typical use case, where 80-90% of the rules hit the EMC, doesn’t
get any performance benefit out of it. Please correct me if I am wrong here. The intention
here is to optimize the tunneling performance in all the use cases.
>
> >> It looks like this is adding a hardware flow when a new flow is added
> >> to the datapath. How does this affect flow setup performance?
> >>
> > We haven’t performed any stress tests with so many flows to verify the
> > flow setup performance. What is the expectation here? Currently how
> > many rules can be setup per second in OVS ?
>
> It's hard to give a concrete number here since flow setup performance
> depends on the complexity of the flow table and, of course, the machine. In
> general, the goal is to avoid needing to do flow setups in response to traffic
> but this depends on the use case. At a minimum, it would be good to
> understand the difference in performance as a result of this change and try
> to minimize any impact. Since this is really just a hint and we'll need to deal
> with mismatch between software and hardware in any case, perhaps it
> makes sense to program the hardware flows asynchronously.

[Sugesh] Thank you for the input. We will measure the hardware flow programming overhead
and share the results. We will also look into the possibility of programming the hardware
flows asynchronously.
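
For the asynchronous idea, the rough shape we have in mind is something like the sketch
below (illustrative only; the structure and function names are made up, and
set_up_hw_offload_port_rule() is the helper from this patch). Datapath threads would only
enqueue a request, and a helper thread would do the actual NIC programming so that flow
setup never blocks on rte_eth_dev_filter_ctrl():

    /* Hypothetical queue of pending hardware offload requests. */
    struct hw_ofld_request {
        struct hw_ofld_request *next;
        struct netdev *netdev;
        struct flow flow;
        uint32_t hw_flow_id;
        bool is_add;
    };

    static struct ovs_mutex hw_ofld_req_mutex = OVS_MUTEX_INITIALIZER;
    static struct hw_ofld_request *hw_ofld_req_head;

    /* Called from flow add/delete instead of programming the NIC inline. */
    static void
    hw_ofld_enqueue(struct hw_ofld_request *req)
    {
        ovs_mutex_lock(&hw_ofld_req_mutex);
        req->next = hw_ofld_req_head;
        hw_ofld_req_head = req;
        ovs_mutex_unlock(&hw_ofld_req_mutex);
    }

    /* Run periodically from a background thread. */
    static void
    hw_ofld_run_once(void)
    {
        struct hw_ofld_request *req;

        ovs_mutex_lock(&hw_ofld_req_mutex);
        req = hw_ofld_req_head;
        hw_ofld_req_head = NULL;
        ovs_mutex_unlock(&hw_ofld_req_mutex);

        while (req) {
            struct hw_ofld_request *next = req->next;

            set_up_hw_offload_port_rule(req->netdev, &req->flow,
                                        req->hw_flow_id, req->is_add);
            free(req);
            req = next;
        }
    }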
Jesse Gross March 30, 2016, 12:43 a.m. UTC | #10
On Tue, Mar 29, 2016 at 12:43 AM, Chandran, Sugesh
<sugesh.chandran@intel.com> wrote:
>> -----Original Message-----
>> From: Jesse Gross [mailto:jesse@kernel.org]
>> Sent: Friday, March 25, 2016 12:38 AM
>> To: Chandran, Sugesh <sugesh.chandran@intel.com>
>> Cc: dev@openvswitch.org
>> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance
>> using DPDK flow director feature.
>>  * Chaining together the multiple lookups used by tunnels on the assumption
>> that the outer VXLAN source port distinguishes the inner flow. This would
>> allow avoiding netdev_flow_key_equal_mf() a second time. This is definitely
>> not legal because the VXLAN source port is only capturing a small subset of
>> the total data that OVS is using.
>
> [Sugesh] From our analysis we found that optimizing one lookup give no
> significant performance boost when compared with the overhead. This is  due to the
> fact that the second netdev_flow_key_equal_mf() still need the tunnel information
> to match on a flow.  We found in our tests that most CPU cycles spends on extracting
> header fields from the packets than lookup.
>
> The proposal is to avoid the header field extraction by using an additional unique software
> flow ID to match on. The two flows for tunnel are marked with this ID when installing on the
> EMC. The hardware report this ID along with hash(to mitigate the hash collision in EMC)
> for every incoming packets that match on a hardware rule. This used in EMC
> along with hash to find the flow. Currently OVS compares  hash +key(from header fields)
> to match a flow. The inner flow uses the same unique ID and hardware flow flag to match
> on than the source port. We have modified the code little bit more, so that it saves the hardware
> id in the matching flow, for every emc_insert.

I think that the performance improvements look cool but unfortunately,
I just don't see how this can work.

There really isn't a way to avoid extracting the header fields in
software - I don't think that any NIC short of an NPU or other
programmable hardware has the capability to match on all of the fields
that OVS supports. Certainly, the UDP source port used by VXLAN and
other tunnel protocols does not contain all of the information and,
worse, it's controlled by a remote system. We can't trust the
information contained in it without further verification because OVS
flow rules are often used for security checks. I realize that in many
cases this will appear to work because for a flow represented by a
5-tuple many of the other fields will be the same. However, we can't
just make this assumption.

One possible exception to this rule is if we did an analysis on the
flows that are actually being used by OVS and only tried to extract
those fields. This is a pure software optimization that might have
similar effects to what you are observing here. This most likely makes
the most sense in the context of a BPF based datapath where the flow
extractor can be dynamically generated and compiled.
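To make that concrete, a hand-specialized extractor for the common outer IPv4/UDP/VXLAN case
could look roughly like the sketch below; a P4- or BPF-generated extractor would produce the
equivalent automatically from the fields the flow table actually uses. The struct and function
names here are made up for illustration and rely only on existing helpers from OVS's packets.h,
dp-packet.h and unaligned.h:

    #include <netinet/in.h>
    #include "dp-packet.h"
    #include "packets.h"
    #include "unaligned.h"

    /* Only the fields a VXLAN outer lookup actually needs. */
    struct vxlan_outer_tuple {
        ovs_be32 ip_src;
        ovs_be32 ip_dst;
        ovs_be16 udp_src;
        ovs_be16 udp_dst;
        ovs_be32 vni_word;      /* 24-bit VNI in the top three bytes. */
    };

    /* Returns false if the packet is not a plain Ethernet/IPv4/UDP VXLAN
     * frame, in which case the caller falls back to miniflow_extract(). */
    static bool
    extract_vxlan_outer_tuple(const struct dp_packet *packet,
                              struct vxlan_outer_tuple *tuple)
    {
        const struct eth_header *eth = dp_packet_data(packet);
        const struct ip_header *ip;
        const struct udp_header *udp;
        const ovs_16aligned_be32 *vxlan;

        if (dp_packet_size(packet) < ETH_HEADER_LEN + IP_HEADER_LEN
                                     + UDP_HEADER_LEN + 8  /* = VXLAN_HLEN */
            || eth->eth_type != htons(ETH_TYPE_IP)) {
            return false;
        }
        ip = (const struct ip_header *) (eth + 1);
        if (IP_IHL(ip->ip_ihl_ver) != 5 || ip->ip_proto != IPPROTO_UDP) {
            return false;       /* IP options or non-UDP: take the slow path. */
        }
        udp = (const struct udp_header *) (ip + 1);
        vxlan = (const ovs_16aligned_be32 *) (udp + 1);

        tuple->ip_src = get_16aligned_be32(&ip->ip_src);
        tuple->ip_dst = get_16aligned_be32(&ip->ip_dst);
        tuple->udp_src = udp->udp_src;
        tuple->udp_dst = udp->udp_dst;
        tuple->vni_word = get_16aligned_be32(&vxlan[1]);
        return true;
    }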

>> I'm not sure that I really see any advantage in using a Flow Director perfect
>> filter to return a software defined hash value compared to just using the RSS
>> hash directly as we are doing today. I think the main case where it would be
>> useful is if hardware wildcarding was used to skip the EMC altogether and its
>> size constraints. If that was done then I think that this would no longer be
>> specialized to VXLAN at all.
> [Sugesh] This may give performance improvement when we have
> large set of rules that overflows EMC. But for a typical use case where 80-90% rules hits EMC
> doesn’t get any performance benefit out of it. Please correct me if I am wrong here.
> The intention here is to optimize the tunneling performance in all the use cases.

To be honest, I think that last 10-20% may be more interesting. Up to
this point in time, the DPDK implementation in OVS has placed a lot of
emphasis on PPS throughput with a relatively small number of streams.
However, while this looks great on benchmarks, it doesn't necessarily
match real world use cases. Even worse, it tends to fall apart at the
worst possible times - like a DoS attack. If the NIC were able to
effectively enlarge the EMC to handle these cases then I think that
would be a huge boost to the usability of OVS on DPDK.
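For reference, the software side of that hardware-wildcarding idea could be as small as the
sketch below, which would sit inside lib/dpif-netdev.c: the NIC would be programmed with the
megaflow (not an exact 5-tuple) and report an ID for it, and a per-PMD cmap keyed on that ID
would replace both emc_lookup() and dpcls_lookup() on a hit. The 'hw_flow_table' cmap, the
'hw_node' member and this helper are hypothetical; only 'hw_rule_id' (from the emc_insert()
snippet quoted earlier) and the existing cmap/hash helpers are reused:

    /* Hypothetical: bypass EMC and dpcls when the NIC reports a flow ID for a
     * megaflow it was programmed with. */
    static struct dp_netdev_flow *
    hw_wildcard_lookup(struct dp_netdev_pmd_thread *pmd, uint32_t hw_flow_id)
    {
        struct dp_netdev_flow *flow;

        CMAP_FOR_EACH_WITH_HASH (flow, hw_node, hash_int(hw_flow_id, 0),
                                 &pmd->hw_flow_table) {
            if (flow->hw_rule_id == hw_flow_id && !flow->dead) {
                return flow;    /* No miniflow extract, no EMC size limit. */
            }
        }
        return NULL;            /* Unknown ID: fall back to the normal path. */
    }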
Ben Pfaff March 30, 2016, 3:38 p.m. UTC | #11
[adding Shahbaz]

On Tue, Mar 29, 2016 at 05:43:55PM -0700, Jesse Gross wrote:
> There really isn't a way to avoid extracting the header fields in
> software - I don't think that any NIC short of an NPU or other
> programmable hardware has the capability to match on all of the fields
> that OVS supports. Certainly, the UDP source port used by VXLAN and
> other tunnel protocols does not contain all of the information and,
> worse, it's controlled by a remote system. We can't trust the
> information contained in it without further verification because OVS
> flow rules are often used for security checks. I realize that in many
> cases this will appear to work because for a flow represented by a
> 5-tuple many of the other fields will be the same. However, we can't
> just make this assumption.
> 
> One possible exception to this rule is if we did an analysis on the
> flows that are actually being used by OVS and only tried to extract
> those fields. This is a pure software optimization that might have
> similar effects to what you are observing here. This most likely makes
> the most sense in the context of a BPF based datapath where the flow
> extractor can be dynamically generated and compiled.

This is probably going to be a side effect of the P4 support for Open
vSwitch that Shahbaz is working on.  The first iteration will probably
have the fields fixed at OVS build time, though.  Later iterations would
ideally use eBPF for the kernel and possibly a direct JIT for DPDK to
enable fields to be reconfigured at OVS runtime.
Chandran, Sugesh March 30, 2016, 5:27 p.m. UTC | #12
Regards
_Sugesh


> -----Original Message-----
> From: Jesse Gross [mailto:jesse@kernel.org]
> Sent: Wednesday, March 30, 2016 1:44 AM
> To: Chandran, Sugesh <sugesh.chandran@intel.com>
> Cc: dev@openvswitch.org
> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance
> using DPDK flow director feature.
> 
> On Tue, Mar 29, 2016 at 12:43 AM, Chandran, Sugesh
> <sugesh.chandran@intel.com> wrote:
> >> -----Original Message-----
> >> From: Jesse Gross [mailto:jesse@kernel.org]
> >> Sent: Friday, March 25, 2016 12:38 AM
> >> To: Chandran, Sugesh <sugesh.chandran@intel.com>
> >> Cc: dev@openvswitch.org
> >> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan
> >> performance using DPDK flow director feature.
> >>  * Chaining together the multiple lookups used by tunnels on the
> >> assumption that the outer VXLAN source port distinguishes the inner
> >> flow. This would allow avoiding netdev_flow_key_equal_mf() a second
> >> time. This is definitely not legal because the VXLAN source port is
> >> only capturing a small subset of the total data that OVS is using.
> >
> > [Sugesh] From our analysis we found that optimizing one lookup give no
> > significant performance boost when compared with the overhead. This is
> > due to the fact that the second netdev_flow_key_equal_mf() still need
> > the tunnel information to match on a flow.  We found in our tests that
> > most CPU cycles spends on extracting header fields from the packets than
> lookup.
> >
> > The proposal is to avoid the header field extraction by using an
> > additional unique software flow ID to match on. The two flows for
> > tunnel are marked with this ID when installing on the EMC. The
> > hardware report this ID along with hash(to mitigate the hash collision
> > in EMC) for every incoming packets that match on a hardware rule. This
> > used in EMC along with hash to find the flow. Currently OVS compares
> > hash +key(from header fields) to match a flow. The inner flow uses the
> > same unique ID and hardware flow flag to match on than the source port.
> We have modified the code little bit more, so that it saves the hardware id in
> the matching flow, for every emc_insert.
> 
> I think that the performance improvements look cool but unfortunately, I
> just don't see how this can work.
> 
> There really isn't a way to avoid extracting the header fields in software - I
> don't think that any NIC short of an NPU or other programmable hardware
> has the capability to match on all of the fields that OVS supports. Certainly,
> the UDP source port used by VXLAN and other tunnel protocols does not
> contain all of the information and, worse, it's controlled by a remote system.
> We can't trust the information contained in it without further verification
> because OVS flow rules are often used for security checks. I realize that in
> many cases this will appear to work because for a flow represented by a 5-
> tuple many of the other fields will be the same. However, we can't just make
> this assumption.


[Sugesh] Totally agree with you. How about letting OVS program the NIC only if all of the
flow's fields can be supported by the NIC? If the flow matches on more fields than the NIC can
support, the rule will not get programmed into the hardware.
I feel that if we make sure the rule in the NIC can validate all the fields of the corresponding
software flow, it is possible to avoid software header extraction for those packets. Any comments?
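A minimal sketch of such a check, assuming it would sit in lib/dpif-netdev.c next to
dp_netdev_flow_add() and use only the existing struct match / flow_wildcards helpers (the helper
name and the exact list of fields cleared are illustrative; a real version would mirror exactly
what the FDIR perfect filter is programmed to verify):

    #include <string.h>
    #include "flow.h"
    #include "match.h"

    /* Returns true only if the megaflow matches on nothing beyond what the
     * NIC rule will itself verify, so skipping software header extraction on
     * a reported hit cannot weaken the match. */
    static bool
    hw_ofld_rule_is_representable(const struct match *match)
    {
        struct flow_wildcards rest = match->wc;

        /* Clear every field the hardware rule checks: outer IPv4 addresses,
         * UDP ports, protocol, Ethertype, the VNI, and the in_port (the rule
         * is installed on that specific netdev anyway). */
        memset(&rest.masks.nw_src, 0, sizeof rest.masks.nw_src);
        memset(&rest.masks.nw_dst, 0, sizeof rest.masks.nw_dst);
        memset(&rest.masks.tp_src, 0, sizeof rest.masks.tp_src);
        memset(&rest.masks.tp_dst, 0, sizeof rest.masks.tp_dst);
        memset(&rest.masks.nw_proto, 0, sizeof rest.masks.nw_proto);
        memset(&rest.masks.dl_type, 0, sizeof rest.masks.dl_type);
        memset(&rest.masks.in_port, 0, sizeof rest.masks.in_port);
        memset(&rest.masks.tunnel.tun_id, 0, sizeof rest.masks.tunnel.tun_id);

        /* Anything still matched on is something the NIC cannot validate. */
        return flow_wildcards_is_catchall(&rest);
    }

dp_netdev_flow_add() could then call this on the new flow's match and skip
set_up_hw_offload_port_rule() whenever it returns false, keeping those flows on the pure
software path.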

> 
> One possible exception to this rule is if we did an analysis on the flows that
> are actually being used by OVS and only tried to extract those fields. This is a
> pure software optimization that might have similar effects to what you are
> observing here. This most likely makes the most sense in the context of a
> BPF based datapath where the flow extractor can be dynamically generated
> and compiled.

[Sugesh] This is really interesting. We will look at this option as well to see how much it can
improve tunneling.

> 
> >> I'm not sure that I really see any advantage in using a Flow Director
> >> perfect filter to return a software defined hash value compared to
> >> just using the RSS hash directly as we are doing today. I think the
> >> main case where it would be useful is if hardware wildcarding was
> >> used to skip the EMC altogether and its size constraints. If that was
> >> done then I think that this would no longer be specialized to VXLAN at all.
> > [Sugesh] This may give performance improvement when we have large set
> > of rules that overflows EMC. But for a typical use case where 80-90%
> > rules hits EMC doesn’t get any performance benefit out of it. Please correct
> me if I am wrong here.
> > The intention here is to optimize the tunneling performance in all the use
> cases.
> 
> To be honest, I think that last 10-20% may be more interesting. Up to this
> point in time, the DPDK implementation in OVS has placed a lot of emphasis
> on PPS throughput with a relatively small number of streams.
> However, while this looks great on benchmarks, it doesn't necessarily match
> real world use cases. Even worse, it tends to fall apart at the worst possible
> times - like a DoS attack. If the NIC were able to effectively enlarge the EMC
> to handle these cases then I think that would be a huge boost to the usability
> of OVS on DPDK.
Jesse Gross March 30, 2016, 8:59 p.m. UTC | #13
On Wed, Mar 30, 2016 at 10:27 AM, Chandran, Sugesh
<sugesh.chandran@intel.com> wrote:
>> -----Original Message-----
>> From: Jesse Gross [mailto:jesse@kernel.org]
>> Sent: Wednesday, March 30, 2016 1:44 AM
>> To: Chandran, Sugesh <sugesh.chandran@intel.com>
>> Cc: dev@openvswitch.org
>> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan performance
>> using DPDK flow director feature.
>>
>> On Tue, Mar 29, 2016 at 12:43 AM, Chandran, Sugesh
>> <sugesh.chandran@intel.com> wrote:
>> >> -----Original Message-----
>> >> From: Jesse Gross [mailto:jesse@kernel.org]
>> >> Sent: Friday, March 25, 2016 12:38 AM
>> >> To: Chandran, Sugesh <sugesh.chandran@intel.com>
>> >> Cc: dev@openvswitch.org
>> >> Subject: Re: [ovs-dev] [RFC PATCH] tunneling: Improving vxlan
>> >> performance using DPDK flow director feature.
>> >>  * Chaining together the multiple lookups used by tunnels on the
>> >> assumption that the outer VXLAN source port distinguishes the inner
>> >> flow. This would allow avoiding netdev_flow_key_equal_mf() a second
>> >> time. This is definitely not legal because the VXLAN source port is
>> >> only capturing a small subset of the total data that OVS is using.
>> >
>> > [Sugesh] From our analysis we found that optimizing one lookup give no
>> > significant performance boost when compared with the overhead. This is
>> > due to the fact that the second netdev_flow_key_equal_mf() still need
>> > the tunnel information to match on a flow.  We found in our tests that
>> > most CPU cycles spends on extracting header fields from the packets than
>> lookup.
>> >
>> > The proposal is to avoid the header field extraction by using an
>> > additional unique software flow ID to match on. The two flows for
>> > tunnel are marked with this ID when installing on the EMC. The
>> > hardware report this ID along with hash(to mitigate the hash collision
>> > in EMC) for every incoming packets that match on a hardware rule. This
>> > used in EMC along with hash to find the flow. Currently OVS compares
>> > hash +key(from header fields) to match a flow. The inner flow uses the
>> > same unique ID and hardware flow flag to match on than the source port.
>> We have modified the code little bit more, so that it saves the hardware id in
>> the matching flow, for every emc_insert.
>>
>> I think that the performance improvements look cool but unfortunately, I
>> just don't see how this can work.
>>
>> There really isn't a way to avoid extracting the header fields in software - I
>> don't think that any NIC short of an NPU or other programmable hardware
>> has the capability to match on all of the fields that OVS supports. Certainly,
>> the UDP source port used by VXLAN and other tunnel protocols does not
>> contain all of the information and, worse, it's controlled by a remote system.
>> We can't trust the information contained in it without further verification
>> because OVS flow rules are often used for security checks. I realize that in
>> many cases this will appear to work because for a flow represented by a 5-
>> tuple many of the other fields will be the same. However, we can't just make
>> this assumption.
>
> [Sugesh] Totally agree with you. How about let the OVS program the NIC only if all the
> flow fields that can be supported in NIC. If the flow has more fields than the NIC can support,
> the rules will not get programmed on the hardware.
> I feel, If we make sure the rules in NIC can validate all the fields of that corresponding software flow,
> Its possible to avoid software header extraction on those packets. Any comments??

I think that would work although I would definitely start with a pure
software version that extracts only the relevant header fields first.
Once that is done, it would be good to compare that with a NIC-based
version and weigh it against the complexity of the implementation and
whether there are common use cases that can take advantage of a more
restricted set of flows that could be offloaded. However, I really
think that the wildcarded lookups for EMC misses are the most
interesting case for offloading to the NIC and that has the advantage
of not requiring all fields to be supported by the NIC.
diff mbox

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index 27a1669..da48479 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -366,6 +366,8 @@  endif
 
 if DPDK_NETDEV
 lib_libopenvswitch_la_SOURCES += \
+       lib/dpdk-i40e-ofld.c \
+       lib/dpdk-i40e-ofld.h \
        lib/netdev-dpdk.c \
        lib/netdev-dpdk.h
 endif
diff --git a/lib/dpdk-i40e-ofld.c b/lib/dpdk-i40e-ofld.c
new file mode 100644
index 0000000..3ea7084
--- /dev/null
+++ b/lib/dpdk-i40e-ofld.c
@@ -0,0 +1,266 @@ 
+/*
+ * Copyright (c) 2016 Intel Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "dpdk-i40e-ofld.h"
+#include "errno.h"
+#include "ovs-thread.h"
+#include "openvswitch/vlog.h"
+#include "netdev-provider.h"
+#include "rte_pci_dev_ids.h"
+#include "rte_ethdev.h"
+
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+VLOG_DEFINE_THIS_MODULE(dpdk_hw_ofld);
+
+#define VXLAN_DST_PORT          4789
+#define VXLAN_HLEN                  50
+#define MAX_FDIR_RULES          8000
+
+static uint32_t total_fdir_ids;
+static struct ovs_mutex hw_ofld_mutex = OVS_MUTEX_INITIALIZER;
+
+/*
+ * Returns '0' if FDIR IDs reaches max limit. Only 8000 entries are
+ * supported in FVL.
+ */
+static inline uint32_t
+i40e_fdir_entry_cnt_inc(void)
+{
+    if (total_fdir_ids < MAX_FDIR_RULES) {
+        ovs_mutex_lock(&hw_ofld_mutex);
+        total_fdir_ids++;
+        ovs_mutex_unlock(&hw_ofld_mutex);
+        return (total_fdir_ids);
+    }
+    return 0;
+}
+
+static inline void
+i40e_fdir_entry_cnt_decr(void)
+{
+    ovs_mutex_lock(&hw_ofld_mutex);
+    total_fdir_ids ? total_fdir_ids-- : 0;
+    ovs_mutex_unlock(&hw_ofld_mutex);
+}
+
+/*
+ * Release the hardware offloading functionality from the dpdk-port.
+ */
+int
+dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port)
+{
+    ovs_mutex_lock(&hw_ofld_mutex);
+    set_i40e_ofld_flag(dpdk_port, 0);
+    ovs_mutex_unlock(&hw_ofld_mutex);
+    return 0;
+}
+
+int
+dpdk_eth_dev_hw_ofld_init(struct netdev_dpdk *dev,
+                                        int n_rxq, int n_txq,
+                                        struct rte_eth_conf *port_conf)
+{
+    int err = 0;
+    struct rte_eth_dev_info info;
+    uint16_t vendor_id, device_id;
+
+    rte_eth_dev_info_get(get_dpdk_port_id(dev), &info);
+    vendor_id = info.pci_dev->id.vendor_id;
+    device_id = info.pci_dev->id.device_id;
+    /* Configure vxlan offload only if its FVL NIC */
+    if (vendor_id != PCI_VENDOR_ID_INTEL || device_id !=
+                                            I40E_DEV_ID_SFP_XL710) {
+        ovs_mutex_lock(&hw_ofld_mutex);
+        set_i40e_ofld_flag(dev, 0);
+        ovs_mutex_unlock(&hw_ofld_mutex);
+        err = rte_eth_dev_configure(get_dpdk_port_id(dev),
+                                    n_rxq, n_txq, port_conf);
+        return err;
+    }
+    ovs_mutex_lock(&hw_ofld_mutex);
+    set_i40e_ofld_flag(dev, 1);
+    ovs_mutex_unlock(&hw_ofld_mutex);
+    /* Configure FVL FDIR VxLAN tunnel handling */
+    port_conf->fdir_conf.mode = RTE_FDIR_MODE_PERFECT;
+    port_conf->fdir_conf.flex_conf.nb_payloads = 1;
+    port_conf->fdir_conf.flex_conf.flex_set[0].type = RTE_ETH_L4_PAYLOAD;
+    /* Need to initialize all 16 flex bytes, no matter what we are really
+     * using; possibly a DPDK bug? */
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[0] = 0;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[1] = 1;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[2] = 2;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[3] = 3;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[4] = 4;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[5] = 5;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[6] = 6;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[7] = 7;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[8] = 8;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[9] = 9;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[10] = 10;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[11] = 11;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[12] = 12;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[13] = 13;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[14] = 14;
+    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[15] = 15;
+    err = rte_eth_dev_configure(get_dpdk_port_id(dev),
+                                n_rxq, n_txq, port_conf);
+    if (err) {
+        VLOG_ERR("Failed to configure DPDK port with hardware offload");
+        return err;
+    }
+    /*Clean all FDIR entries if any */
+    err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(dev),
+            RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_FLUSH, NULL);
+    return err;
+}
+
+/*
+ * Install rules for VxLAN packets in hardware
+ */
+int
+set_up_hw_offload_port_rule(struct netdev *netdev__,
+                                const struct flow *flow,
+                                const uint32_t hw_flow_id,
+                                const bool is_add_rule)
+{
+    int err = 0;
+    uint8_t flexbytes[RTE_ETH_FDIR_MAX_FLEXLEN] = { 0 };
+    uint32_t *vni;
+    enum rte_filter_op filter_op;
+    struct rte_eth_fdir_filter entry = { 0 };
+    struct netdev_dpdk *netdev;
+
+    netdev = netdev_dpdk_cast(netdev__);
+    if (is_i40e_ofld_enable(netdev)) {
+        entry.soft_id = hw_flow_id;
+        if (!entry.soft_id) {
+            VLOG_DBG("Invalid flow ID, Cant install rule in the NIC for "
+                             "hardware offload");
+            err = ECANCELED;
+            return err;
+        }
+        /* Install rules in NIC only for VxLAN flows */
+        if (ntohs(flow->tp_dst) != VXLAN_DST_PORT) {
+            return 0;
+        }
+        entry.action.flex_off = 0;  /* use 0 by default */
+        entry.input.flow_ext.vlan_tci = 0; //! ignored by i40e fdir
+        entry.action.behavior = RTE_ETH_FDIR_PASSTHRU;
+        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID_FLEX_4;
+        entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
+        entry.input.flow.ip4_flow.src_ip = flow->nw_src;
+        entry.input.flow.ip4_flow.dst_ip = flow->nw_dst;
+        entry.input.flow.udp4_flow.dst_port = htons(VXLAN_DST_PORT);
+        entry.input.flow.udp4_flow.src_port = flow->tp_src;
+        vni = (uint32_t *)&flexbytes[4];
+        *vni = flow->tunnel.tun_id << 8;
+        memcpy(entry.input.flow_ext.flexbytes, flexbytes,
+                      RTE_ETH_FDIR_MAX_FLEXLEN);
+        entry.action.rx_queue = 0;
+        filter_op = is_add_rule ? RTE_ETH_FILTER_ADD :
+                                              RTE_ETH_FILTER_DELETE;
+        err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(netdev),
+                 RTE_ETH_FILTER_FDIR, filter_op, &entry);
+
+        /*
+         * XXX : Delayed the max limit check for flow director entries after
+         * the configuration. Anyway the rte_eth_dev_filter_ctrl will fail if
+         * max limit reaches. This can be used for tracking.
+         */
+        if (is_add_rule) {
+            if (!i40e_fdir_entry_cnt_inc()) {
+                VLOG_DBG("Cant configure rule on NIC, Flow director "
+                        "entries hits max limit");
+            }
+        }
+        else {
+            i40e_fdir_entry_cnt_decr();
+        }
+        if (err < 0) {
+            VLOG_DBG("flow director programming error in NIC: (%d)\n", err);
+            return err;
+        }
+    }
+    return err;
+}
+
+static int
+i40e_dpdk_port_get_hw_ofld_pkts(struct
+                 dp_netdev_pmd_thread *pmd, struct dp_packet
+                 **in_packets, struct dp_packet **hw_packets,
+                 struct dp_packet **non_hw_packets,
+                 uint32_t cnt)
+{
+    int i, hw_pkt_cnt = 0, norm_pkt_cnt = 0;
+    const struct dp_netdev_flow *flow;
+    struct rte_mbuf *mbuf;
+
+    for (i = 0; i < cnt; i++) {
+        mbuf = (struct rte_mbuf *)in_packets[i];
+        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
+            flow = lookup_hw_offload_flow_for_fdirid(pmd, mbuf,
+                                                     mbuf->hash.fdir.hi);
+            if (!flow) {
+                /* Bogus flow in hw, cannot find it in OVS EMC */
+                mbuf->ol_flags &= ~PKT_RX_FDIR_ID;
+                non_hw_packets[norm_pkt_cnt++] = in_packets[i];
+                continue;
+            }
+            dp_packet_reset_packet(in_packets[i], VXLAN_HLEN);
+            mbuf->ol_flags |= PKT_RX_RSS_HASH;
+            mbuf->hash.rss = hash_finish(mbuf->hash.rss, 1);
+            hw_packets[hw_pkt_cnt++] = in_packets[i];
+        }
+        else {
+            non_hw_packets[norm_pkt_cnt++] = in_packets[i];
+        }
+    }
+    return hw_pkt_cnt;
+}
+
+/*
+ * Process the packets based on hardware offload configuration
+ */
+void
+hw_ofld_dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
+                             struct netdev_rxq *netdev_rxq,
+                             struct dp_packet **packets, int cnt,
+                             odp_port_t port_no)
+{
+    int hw_pkt_cnt;
+    struct dp_packet *hw_ofld_packets[NETDEV_MAX_BURST] = { 0 };
+    struct dp_packet *orig_packets[NETDEV_MAX_BURST] = { 0 };
+    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_rxq->netdev);
+
+    if (is_i40e_ofld_enable(netdev)) {
+        hw_pkt_cnt = i40e_dpdk_port_get_hw_ofld_pkts(pmd, packets,
+                                                          hw_ofld_packets,
+                                                          orig_packets, cnt);
+        /* Process packet streams separately. */
+        if (hw_pkt_cnt) {
+            dp_netdev_input(pmd, hw_ofld_packets, hw_pkt_cnt, port_no);
+        }
+        if (cnt - hw_pkt_cnt) {
+            dp_netdev_input(pmd, orig_packets, (cnt - hw_pkt_cnt), port_no);
+        }
+    }
+    else {
+        dp_netdev_input(pmd, packets, cnt, port_no);
+    }
+}
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
diff --git a/lib/dpdk-i40e-ofld.h b/lib/dpdk-i40e-ofld.h
new file mode 100644
index 0000000..1aad246
--- /dev/null
+++ b/lib/dpdk-i40e-ofld.h
@@ -0,0 +1,59 @@ 
+/*
+ * Copyright (c) 2016 Intel Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DPDK_I40E_OFLD_H_
+#define DPDK_I40E_OFLD_H_
+
+#include <config.h>
+
+#include "dp-packet.h"
+#include "netdev.h"
+
+/*
+ * Macro to enable/disable HW OFFLOAD feature for DPDK.
+ * 1 :- Enable HW_OFFLOAD support in OVS
+ * 0 :- Disable HW_OFFLOAD support in OVS
+ */
+#define DPDK_I40E_TNL_OFFLOAD_ENABLE        1
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+
+struct netdev_dpdk;
+struct dp_netdev_pmd_thread;
+struct dp_netdev_flow;
+
+struct netdev_dpdk *netdev_dpdk_cast(const struct netdev *netdev);
+extern inline bool is_i40e_ofld_enable(const struct netdev_dpdk *netdev);
+extern inline void set_i40e_ofld_flag(struct netdev_dpdk *netdev, bool flag);
+extern inline int get_dpdk_port_id(struct netdev_dpdk *dpdk_port);
+int dpdk_eth_dev_hw_ofld_init(struct netdev_dpdk *dev, int n_rxq, int n_txq,
+                              struct rte_eth_conf *port_conf);
+int dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port);
+int set_up_hw_offload_port_rule(struct netdev *netdev__,
+                                const struct flow *flow,
+                                const uint32_t hw_flow_id,
+                                const bool is_add_rule);
+void hw_ofld_dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
+                             struct netdev_rxq *netdev_rxq,
+                             struct dp_packet **packets, int cnt,
+                             odp_port_t port_no);
+const struct dp_netdev_flow *lookup_hw_offload_flow_for_fdirid(
+                            const struct dp_netdev_pmd_thread *pmd,
+                            struct rte_mbuf *mbuf, uint32_t flow_id);
+void dp_netdev_input(struct dp_netdev_pmd_thread *, struct dp_packet **, 
+                     int cnt, odp_port_t port_no);
+
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
+#endif /* DPDK_I40E_OFLD_H_ */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index cf574ad..d79b239 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -70,6 +70,7 @@ 
 #include "util.h"
 
 #include "openvswitch/vlog.h"
+#include "dpdk-i40e-ofld.h"
 
 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 
@@ -478,7 +479,7 @@  static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                       bool may_steal,
                                       const struct nlattr *actions,
                                       size_t actions_len);
-static void dp_netdev_input(struct dp_netdev_pmd_thread *,
+void dp_netdev_input(struct dp_netdev_pmd_thread *,
                             struct dp_packet **, int cnt, odp_port_t port_no);
 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                   struct dp_packet **, int cnt);
@@ -1455,6 +1456,28 @@  dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
     flow->dead = true;
 
     dp_netdev_flow_unref(flow);
+
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+    struct dp_netdev_port *dp_port;
+    int err;
+    odp_port_t in_port = flow->flow.in_port.odp_port;
+    err = get_port_by_number(pmd->dp, in_port, &dp_port);
+    if (err) {
+        VLOG_WARN("Cannot get the port information, hardware offload may "
+                "not be functional");
+        return;
+    }
+    if (strcmp(dp_port->type, "dpdk")) {
+        /* No hardware offload on a non-DPDK port. */
+        return;
+    }
+    /* Remove the hardware offload rule if exists.*/
+    if(set_up_hw_offload_port_rule(dp_port->netdev, &flow->flow,
+            dp_netdev_flow_hash(&(flow->ufid)), 0)) {
+        VLOG_DBG("Failed to delete the hardware offload rule");
+        return;
+    }
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
 }
 
 static void
@@ -2059,6 +2082,32 @@  dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
         ds_destroy(&ds);
     }
 
+    /*
+     * Configure the hardware offload for tunnel while flows are getting
+     * inserted in OVS.
+     */
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+    struct dp_netdev_port *dp_port;
+    int err;
+    odp_port_t in_port = flow->flow.in_port.odp_port;
+    err = get_port_by_number(pmd->dp, in_port, &dp_port);
+    if (err) {
+        VLOG_WARN("Cannot get the port information, Failed to configure "
+                            "hardware offload");
+        goto out;
+    }
+    if (strcmp(dp_port->type, "dpdk")) {
+        /* No hardware offload on a non-DPDK port */
+        goto out;
+    }
+    /* Install the rule in hw; a redundant install may overwrite an existing rule. */
+    if (set_up_hw_offload_port_rule(dp_port->netdev, &flow->flow,
+            dp_netdev_flow_hash(&flow->ufid), 1)) {
+        VLOG_ERR("Failed to install the hardware offload rule");
+        goto out;
+    }
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
+out:
     return flow;
 }
 
@@ -2575,7 +2624,19 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         *recirc_depth_get() = 0;
 
         cycles_count_start(pmd);
+
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+        /* Check if the source port is DPDK */
+        if (packets[0]->source == DPBUF_DPDK) {
+            hw_ofld_dp_netdev_input(pmd, rxq, packets, cnt, port->port_no);
+        }
+        else {
+            dp_netdev_input(pmd, packets, cnt, port->port_no);
+        }
+#else
         dp_netdev_input(pmd, packets, cnt, port->port_no);
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
+
         cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
     } else if (error != EAGAIN && error != EOPNOTSUPP) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
@@ -3321,7 +3382,6 @@  dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
         flow->tunnel.metadata.present.len = orig_tunnel.metadata.present.len;
         flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
     }
-
     return err;
 }
 
@@ -3430,6 +3490,7 @@  emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
     struct emc_cache *flow_cache = &pmd->flow_cache;
     struct netdev_flow_key *key = &keys[0];
     size_t i, n_missed = 0, n_dropped = 0;
+    struct rte_mbuf *mbuf;
 
     for (i = 0; i < cnt; i++) {
         struct dp_netdev_flow *flow;
@@ -3454,7 +3515,18 @@  emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
         key->len = 0; /* Not computed yet. */
         key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
 
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+        mbuf = (struct rte_mbuf *)packet;
+        if (mbuf->ol_flags & PKT_RX_FDIR_ID) {
+            flow = lookup_hw_offload_flow_for_fdirid(pmd, mbuf, 0);
+        }
+        else {
+            flow = emc_lookup(flow_cache, key);
+        }
+#else
         flow = emc_lookup(flow_cache, key);
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
+
         if (OVS_LIKELY(flow)) {
             dp_netdev_queue_batches(packet, flow, &key->mf, batches,
                                     n_batches);
@@ -3651,7 +3723,7 @@  dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
     }
 }
 
-static void
+void
 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
                 struct dp_packet **packets, int cnt,
                 odp_port_t port_no)
@@ -4290,3 +4362,43 @@  dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
     }
     return false;                     /* Some misses. */
 }
+
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+/*
+ * EMC lookup function on 'flow id' reported by NIC.
+ */
+const struct dp_netdev_flow *
+lookup_hw_offload_flow_for_fdirid(const struct
+                 dp_netdev_pmd_thread *pmd, struct rte_mbuf *mbuf,
+                 uint32_t flow_id)
+{
+    const struct emc_cache *flow_cache = &pmd->flow_cache;
+    struct netdev_flow_key key;
+    struct emc_entry *current_entry;
+
+    key.len = 0;
+    if (OVS_LIKELY(mbuf->ol_flags & PKT_RX_RSS_HASH)) {
+        key.hash = mbuf->hash.rss;
+    }
+    else {
+        return NULL;
+    }
+    EMC_FOR_EACH_POS_WITH_HASH(flow_cache, current_entry, key.hash) {
+        if (current_entry->key.hash == key.hash
+            && emc_entry_alive(current_entry)) {
+            if (OVS_UNLIKELY(flow_id && dp_netdev_flow_hash(
+                                       &current_entry->flow->ufid) !=
+                                       flow_id)) {
+                /* Hash collision in emc, fallback to software path */
+                return NULL;
+            }
+            return current_entry->flow;
+        }
+    }
+    /* XXX: An improved classifier lookup is needed here, without any miniflow
+     * extract, to keep it performant. Until then, fall back to software based
+     * packet forwarding on EMC miss.
+     */
+     return NULL;
+}
+#endif /* DPDK_I40E_TNL_OFFLOAD_ENABLE */
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index f402354..2954f83 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -56,6 +56,7 @@ 
 #include "rte_mbuf.h"
 #include "rte_meter.h"
 #include "rte_virtio_net.h"
+#include "dpdk-i40e-ofld.h"
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
@@ -112,7 +113,7 @@  static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
  */
 #define VHOST_ENQ_RETRY_USECS 100
 
-static const struct rte_eth_conf port_conf = {
+static struct rte_eth_conf port_conf = {
     .rxmode = {
         .mq_mode = ETH_MQ_RX_RSS,
         .split_hdr_size = 0,
@@ -331,6 +332,9 @@  struct netdev_dpdk {
 
     /* Identifier used to distinguish vhost devices from each other */
     char vhost_id[PATH_MAX];
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+    bool i40e_ofld_enable; /* hardware/NIC offload flag*/
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
 
     /* In dpdk_list. */
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
@@ -346,6 +350,24 @@  struct netdev_rxq_dpdk {
     int port_id;
 };
 
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+inline bool is_i40e_ofld_enable(const struct netdev_dpdk *netdev)
+{
+    return netdev->i40e_ofld_enable;
+}
+
+inline void set_i40e_ofld_flag(struct netdev_dpdk *netdev,
+                                                bool flag)
+{
+    netdev->i40e_ofld_enable = flag;
+}
+
+inline int get_dpdk_port_id(struct netdev_dpdk *dpdk_port)
+{
+    return dpdk_port->port_id;
+}
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
+
 static bool dpdk_thread_is_pmd(void);
 
 static int netdev_dpdk_construct(struct netdev *);
@@ -539,10 +561,21 @@  dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
             VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
         }
 
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+        diag = (!dev->i40e_ofld_enable && dev->type == DPDK_DEV_ETH) ?
+                    dpdk_eth_dev_hw_ofld_init(dev, n_rxq, n_txq, &port_conf) :
+                    rte_eth_dev_configure(dev->port_id,
+                    n_rxq, n_txq, &port_conf);
+        if (diag) {
+            /* rte_dev_configure error */
+            break;
+        }
+#else
         diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);
         if (diag) {
             break;
         }
+#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
 
         for (i = 0; i < n_txq; i++) {
             diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
@@ -637,7 +670,7 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
     return 0;
 }
 
-static struct netdev_dpdk *
+struct netdev_dpdk *
 netdev_dpdk_cast(const struct netdev *netdev)
 {
     return CONTAINER_OF(netdev, struct netdev_dpdk, up);
@@ -861,6 +894,10 @@  netdev_dpdk_destruct(struct netdev *netdev_)
     rte_free(dev->tx_q);
     list_remove(&dev->list_node);
     dpdk_mp_put(dev->dpdk_mp);
+
+#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
+        dpdk_hw_ofld_port_release(dev);
+#endif /* DPDK_I40E_TNL_OFFLOAD_ENABLE */
     ovs_mutex_unlock(&dpdk_mutex);
 }