diff mbox

[ovs-dev] tunneling: Improving VxLAN tunneling performance using DPDK Rx checksum offloading feature.

Message ID 1460368330-138563-1-git-send-email-sugesh.chandran@intel.com
State Changes Requested
Headers show

Commit Message

Chandran, Sugesh April 11, 2016, 9:52 a.m. UTC
Optimizing VxLAN tunneling performance in userspace datapath by offloading
the rx checksum validation on tunnel packets to the NIC when it is supported.

Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
---
 lib/dpif-netdev.c            | 73 ++++++++++++++++++++++++----------
 lib/dpif.c                   |  5 ++-
 lib/netdev-dpdk.c            | 31 +++++++++++++--
 lib/netdev-dpdk.h            | 17 ++++++++
 lib/netdev-provider.h        |  2 +-
 lib/netdev-vport.c           | 95 ++++++++++++++++++++++++++++++++++++++++++--
 lib/netdev.c                 |  5 ++-
 lib/netdev.h                 |  2 +-
 lib/odp-execute.c            |  6 +--
 lib/odp-execute.h            |  6 ++-
 ofproto/ofproto-dpif-xlate.c |  3 +-
 11 files changed, 205 insertions(+), 40 deletions(-)

Comments

Chandran, Sugesh April 11, 2016, 10:35 a.m. UTC | #1
Hi, 

 This patch uses the Rx checksum offloading feature in the NIC to validate the checksum of tunnel packets. In our testing we found that the DECAP performance is improved by up to 25% and bidirectional performance is improved by almost 10%. This patch also relieves OVS from spending cycles on corrupted packets by discarding them at the input without any further processing.

Regards
_Sugesh

> -----Original Message-----
> From: Chandran, Sugesh
> Sent: Monday, April 11, 2016 10:52 AM
> To: dev@openvswitch.org
> Cc: Chandran, Sugesh <sugesh.chandran@intel.com>
> Subject: [PATCH] tunneling: Improving VxLAN tunneling performance using
> DPDK Rx checksum offloading feature.
> 
> Optimizing VxLAN tunneling performance in userspace datapath by
> offloading the rx checksum validation on tunnel packets to the NIC when it is
> supported.
> 
> Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
> ---
>  lib/dpif-netdev.c            | 73 ++++++++++++++++++++++++----------
>  lib/dpif.c                   |  5 ++-
>  lib/netdev-dpdk.c            | 31 +++++++++++++--
>  lib/netdev-dpdk.h            | 17 ++++++++
>  lib/netdev-provider.h        |  2 +-
>  lib/netdev-vport.c           | 95
> ++++++++++++++++++++++++++++++++++++++++++--
>  lib/netdev.c                 |  5 ++-
>  lib/netdev.h                 |  2 +-
>  lib/odp-execute.c            |  6 +--
>  lib/odp-execute.h            |  6 ++-
>  ofproto/ofproto-dpif-xlate.c |  3 +-
>  11 files changed, 205 insertions(+), 40 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 2870951..d8ab8b5
> 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -477,9 +477,11 @@ static void dp_netdev_execute_actions(struct
> dp_netdev_pmd_thread *pmd,
>                                        struct dp_packet **, int c,
>                                        bool may_steal,
>                                        const struct nlattr *actions,
> -                                      size_t actions_len);
> +                                      size_t actions_len,
> +                                      uint32_t rx_ol_flags);
>  static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> -                            struct dp_packet **, int cnt, odp_port_t port_no);
> +                                      struct dp_packet **, int cnt,
> +                                      struct dp_netdev_port *port);
>  static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
>                                    struct dp_packet **, int cnt);
> 
> @@ -2369,7 +2371,7 @@ dpif_netdev_execute(struct dpif *dpif, struct
> dpif_execute *execute)
> 
>      pp = execute->packet;
>      dp_netdev_execute_actions(pmd, &pp, 1, false, execute->actions,
> -                              execute->actions_len);
> +                              execute->actions_len, 0);
>      if (pmd->core_id == NON_PMD_CORE_ID) {
>          dp_netdev_pmd_unref(pmd);
>          ovs_mutex_unlock(&dp->port_mutex);
> @@ -2572,7 +2574,7 @@ dp_netdev_process_rxq_port(struct
> dp_netdev_pmd_thread *pmd,
>          *recirc_depth_get() = 0;
> 
>          cycles_count_start(pmd);
> -        dp_netdev_input(pmd, packets, cnt, port->port_no);
> +        dp_netdev_input(pmd, packets, cnt, port);
>          cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
>      } else if (error != EAGAIN && error != EOPNOTSUPP) {
>          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); @@ -
> 3365,7 +3367,7 @@ packet_batch_init(struct packet_batch *batch, struct
> dp_netdev_flow *flow)  static inline void  packet_batch_execute(struct
> packet_batch *batch,
>                       struct dp_netdev_pmd_thread *pmd,
> -                     long long now)
> +                     long long now, uint32_t rx_ol_flags)
>  {
>      struct dp_netdev_actions *actions;
>      struct dp_netdev_flow *flow = batch->flow; @@ -3376,7 +3378,7 @@
> packet_batch_execute(struct packet_batch *batch,
>      actions = dp_netdev_flow_get_actions(flow);
> 
>      dp_netdev_execute_actions(pmd, batch->packets, batch->packet_count,
> true,
> -                              actions->actions, actions->size);
> +                              actions->actions, actions->size,
> + rx_ol_flags);
>  }
> 
>  static inline void
> @@ -3394,6 +3396,19 @@ dp_netdev_queue_batches(struct dp_packet
> *pkt,
>      packet_batch_update(batch, pkt, mf);  }
> 
> +static inline bool
> +is_checksum_valid(struct dp_packet *packet) { #ifdef DPDK_NETDEV
> +    if (packet->mbuf.ol_flags & (PKT_RX_IP_CKSUM_BAD |
> +                                 PKT_RX_L4_CKSUM_BAD)) {
> +        return 0;
> +    }
> +    return 1;
> +#else
> +    return 0;
> +#endif
> +}
> +
>  /* Try to process all ('cnt') the 'packets' using only the exact match cache
>   * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
>   * miniflow is copied into 'keys' and the packet pointer is moved at the @@ -
> 3409,7 +3424,8 @@ static inline size_t  emc_processing(struct
> dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
>                 size_t cnt, struct netdev_flow_key *keys,
>                 struct packet_batch batches[], size_t *n_batches,
> -               bool md_is_valid, odp_port_t port_no)
> +               bool md_is_valid, struct dp_netdev_port *port,
> +               uint32_t rx_checksum_ofld_capa)
>  {
>      struct emc_cache *flow_cache = &pmd->flow_cache;
>      struct netdev_flow_key *key = &keys[0]; @@ -3425,6 +3441,13 @@
> emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet
> **packets,
>              continue;
>          }
> 
> +        if (OVS_UNLIKELY(rx_checksum_ofld_capa &&
> +                         !is_checksum_valid(packet))) {
> +            dp_packet_delete(packet);
> +            n_dropped++;
> +            continue;
> +        }
> +
>          if (i != cnt - 1) {
>              /* Prefetch next packet data and metadata. */
>              OVS_PREFETCH(dp_packet_data(packets[i+1]));
> @@ -3432,7 +3455,7 @@ emc_processing(struct dp_netdev_pmd_thread
> *pmd, struct dp_packet **packets,
>          }
> 
>          if (!md_is_valid) {
> -            pkt_metadata_init(&packet->md, port_no);
> +            pkt_metadata_init(&packet->md, port->port_no);
>          }
>          miniflow_extract(packet, &key->mf);
>          key->len = 0; /* Not computed yet. */ @@ -3462,7 +3485,8 @@ static
> inline void  fast_path_processing(struct dp_netdev_pmd_thread *pmd,
>                       struct dp_packet **packets, size_t cnt,
>                       struct netdev_flow_key *keys,
> -                     struct packet_batch batches[], size_t *n_batches)
> +                     struct packet_batch batches[], size_t *n_batches,
> +                     uint32_t rx_ol_flags)
>  {
>  #if !defined(__CHECKER__) && !defined(_WIN32)
>      const size_t PKT_ARRAY_SIZE = cnt;
> @@ -3541,7 +3565,7 @@ fast_path_processing(struct
> dp_netdev_pmd_thread *pmd,
>               * the actions.  Otherwise, if there are any slow path actions,
>               * we'll send the packet up twice. */
>              dp_netdev_execute_actions(pmd, &packets[i], 1, true,
> -                                      actions.data, actions.size);
> +                                      actions.data, actions.size,
> + rx_ol_flags);
> 
>              add_actions = put_actions.size ? &put_actions : &actions;
>              if (OVS_LIKELY(error != ENOSPC)) { @@ -3606,7 +3630,7 @@
> fast_path_processing(struct dp_netdev_pmd_thread *pmd,  static void
> dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>                    struct dp_packet **packets, int cnt,
> -                  bool md_is_valid, odp_port_t port_no)
> +                  bool md_is_valid, struct dp_netdev_port *port)
>  {
>  #if !defined(__CHECKER__) && !defined(_WIN32)
>      const size_t PKT_ARRAY_SIZE = cnt;
> @@ -3618,12 +3642,18 @@ dp_netdev_input__(struct
> dp_netdev_pmd_thread *pmd,
>      struct packet_batch batches[PKT_ARRAY_SIZE];
>      long long now = time_msec();
>      size_t newcnt, n_batches, i;
> +    uint32_t rx_ol_flags;
> 
>      n_batches = 0;
> +    /* Collect the Rx offloading features only if its DPDK port */
> +    rx_ol_flags = port && (packets[0]->source == DPBUF_DPDK) ?
> +                            netdev_get_rx_offload_capa(port->netdev) :
> +                            0;
>      newcnt = emc_processing(pmd, packets, cnt, keys, batches, &n_batches,
> -                            md_is_valid, port_no);
> +                            md_is_valid, port, rx_ol_flags);
>      if (OVS_UNLIKELY(newcnt)) {
> -        fast_path_processing(pmd, packets, newcnt, keys, batches,
> &n_batches);
> +        fast_path_processing(pmd, packets, newcnt, keys, batches,
> +                                           &n_batches, rx_ol_flags);
>      }
> 
>      for (i = 0; i < n_batches; i++) {
> @@ -3631,16 +3661,16 @@ dp_netdev_input__(struct
> dp_netdev_pmd_thread *pmd,
>      }
> 
>      for (i = 0; i < n_batches; i++) {
> -        packet_batch_execute(&batches[i], pmd, now);
> +        packet_batch_execute(&batches[i], pmd, now, rx_ol_flags);
>      }
>  }
> 
>  static void
>  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
>                  struct dp_packet **packets, int cnt,
> -                odp_port_t port_no)
> +                struct dp_netdev_port *port)
>  {
> -     dp_netdev_input__(pmd, packets, cnt, false, port_no);
> +     dp_netdev_input__(pmd, packets, cnt, false, port);
>  }
> 
>  static void
> @@ -3716,7 +3746,7 @@ dp_netdev_clone_pkt_batch(struct dp_packet
> **dst_pkts,
> 
>  static void
>  dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt,
> -              const struct nlattr *a, bool may_steal)
> +              const struct nlattr *a, bool may_steal, uint32_t
> + rx_ol_flags)
>      OVS_NO_THREAD_SAFETY_ANALYSIS
>  {
>      struct dp_netdev_execute_aux *aux = aux_; @@ -3776,7 +3806,7 @@
> dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt,
>                     packets = tnl_pkt;
>                  }
> 
> -                err = netdev_pop_header(p->netdev, packets, cnt);
> +                err = netdev_pop_header(p->netdev, packets, cnt,
> + rx_ol_flags);
>                  if (!err) {
> 
>                      for (i = 0; i < cnt; i++) { @@ -3816,7 +3846,7 @@
> dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt,
>                                           NULL);
>                  if (!error || error == ENOSPC) {
>                      dp_netdev_execute_actions(pmd, &packets[i], 1, may_steal,
> -                                              actions.data, actions.size);
> +                                              actions.data,
> + actions.size, 0);
>                  } else if (may_steal) {
>                      dp_packet_delete(packets[i]);
>                  }
> @@ -3878,12 +3908,13 @@ static void
>  dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
>                            struct dp_packet **packets, int cnt,
>                            bool may_steal,
> -                          const struct nlattr *actions, size_t actions_len)
> +                          const struct nlattr *actions, size_t actions_len,
> +                          uint32_t rx_ol_flags)
>  {
>      struct dp_netdev_execute_aux aux = { pmd };
> 
>      odp_execute_actions(&aux, packets, cnt, may_steal, actions,
> -                        actions_len, dp_execute_cb);
> +                        actions_len, dp_execute_cb, rx_ol_flags);
>  }
> 
>  const struct dpif_class dpif_netdev_class = { diff --git a/lib/dpif.c b/lib/dpif.c
> index a784de7..760719b 100644
> --- a/lib/dpif.c
> +++ b/lib/dpif.c
> @@ -1088,7 +1088,8 @@ struct dpif_execute_helper_aux {
>   * meaningful. */
>  static void
>  dpif_execute_helper_cb(void *aux_, struct dp_packet **packets, int cnt,
> -                       const struct nlattr *action, bool may_steal OVS_UNUSED)
> +                       const struct nlattr *action, bool may_steal OVS_UNUSED,
> +                       uint32_t rx_ol_flags)
>  {
>      struct dpif_execute_helper_aux *aux = aux_;
>      int type = nl_attr_type(action);
> @@ -1167,7 +1168,7 @@ dpif_execute_with_help(struct dpif *dpif, struct
> dpif_execute *execute)
> 
>      pp = execute->packet;
>      odp_execute_actions(&aux, &pp, 1, false, execute->actions,
> -                        execute->actions_len, dpif_execute_helper_cb);
> +                        execute->actions_len, dpif_execute_helper_cb,
> + 0);
>      return aux.error;
>  }
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index c7217ea..5e951c7
> 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -321,6 +321,10 @@ struct netdev_dpdk {
>      struct rte_eth_link link;
>      int link_reset_cnt;
> 
> +    /* RX offload capability bitmask, RX checksum offloading is only
> +     * in use now */
> +    uint32_t rx_ol_flags;
> +
>      /* The user might request more txqs than the NIC has.  We remap those
>       * ('up.n_txq') on these ('real_n_txq').
>       * If the numbers match, 'txq_needs_locking' is false, otherwise it is @@ -
> 527,7 +531,8 @@ dpdk_watchdog(void *dummy OVS_UNUSED)  }
> 
>  static int
> -dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int
> n_txq)
> +dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int
> n_txq,
> +                         const struct rte_eth_conf *new_port_conf)
>  {
>      int diag = 0;
>      int i;
> @@ -542,7 +547,8 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk
> *dev, int n_rxq, int n_txq)
>              VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
>          }
> 
> -        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq,
> &port_conf);
> +        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq,
> +                                     new_port_conf);
>          if (diag) {
>              break;
>          }
> @@ -596,6 +602,7 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> OVS_REQUIRES(dpdk_mutex)
>      struct rte_pktmbuf_pool_private *mbp_priv;
>      struct rte_eth_dev_info info;
>      struct ether_addr eth_addr;
> +    struct rte_eth_conf new_port_conf;
>      int diag;
>      int n_rxq, n_txq;
> 
> @@ -608,7 +615,17 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> OVS_REQUIRES(dpdk_mutex)
>      n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
>      n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> 
> -    diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
> +    new_port_conf = port_conf;
> +    /* Enable rx checksum offload if it is supported by the NIC */
> +    if (info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) {
> +        dev->rx_ol_flags = info.rx_offload_capa &
> +                           (DEV_RX_OFFLOAD_UDP_CKSUM |
> +                            DEV_RX_OFFLOAD_TCP_CKSUM |
> +                            DEV_RX_OFFLOAD_IPV4_CKSUM);
> +        new_port_conf.rxmode.hw_ip_checksum = 1;
> +    }
> +
> +    diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq,
> &new_port_conf);
>      if (diag) {
>          VLOG_ERR("Interface %s(rxq:%d txq:%d) configure error: %s",
>                   dev->up.name, n_rxq, n_txq, rte_strerror(-diag)); @@ -725,6
> +742,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>      netdev->n_rxq = NR_QUEUE;
>      netdev->requested_n_rxq = NR_QUEUE;
>      dev->real_n_txq = NR_QUEUE;
> +    dev->rx_ol_flags = 0;
> 
>      if (type == DPDK_DEV_ETH) {
>          netdev_dpdk_alloc_txq(dev, NR_QUEUE); @@ -2927,3 +2945,10 @@
> dpdk_thread_is_pmd(void)  {
>      return rte_lcore_id() != NON_PMD_CORE_ID;  }
> +
> +uint32_t
> +netdev_get_rx_offload_capa(struct netdev *netdev_) {
> +    struct netdev_dpdk *netdev;
> +    netdev = netdev_dpdk_cast(netdev_);
> +    return netdev->rx_ol_flags;
> +}
> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index 646d3e2..341d560
> 100644
> --- a/lib/netdev-dpdk.h
> +++ b/lib/netdev-dpdk.h
> @@ -19,6 +19,7 @@ struct dp_packet;
>  #include <rte_spinlock.h>
>  #include <rte_launch.h>
>  #include <rte_malloc.h>
> +#include "netdev-provider.h"
> 
>  #define NON_PMD_CORE_ID LCORE_ID_ANY
> 
> @@ -26,6 +27,12 @@ int dpdk_init(int argc, char **argv);  void
> netdev_dpdk_register(void);  void free_dpdk_buf(struct dp_packet *);  int
> pmd_thread_setaffinity_cpu(unsigned cpu);
> +uint32_t netdev_get_rx_offload_capa(struct netdev *netdev_);
> +
> +static inline uint32_t
> +get_checksum_ofld_flags(uint32_t rx_ol_flags) {
> +    return rx_ol_flags & DEV_RX_OFFLOAD_IPV4_CKSUM; }
> 
>  #else
> 
> @@ -60,5 +67,15 @@ pmd_thread_setaffinity_cpu(unsigned cpu
> OVS_UNUSED)
>      return 0;
>  }
> 
> +static uint32_t
> +netdev_get_rx_offload_capa(struct netdev *netdev_) {
> +    return 0;
> +}
> +
> +static inline uint32_t
> +get_checksum_ofld_flags(uint32_t rx_ol_flags) {
> +    return 0;
> +}
> +
>  #endif /* DPDK_NETDEV */
>  #endif
> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index
> cda25eb..8f829e6 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> @@ -276,7 +276,7 @@ struct netdev_class {
> 
>      /* Pop tunnel header from packet, build tunnel metadata and resize
> packet
>       * for further processing. */
> -    int (*pop_header)(struct dp_packet *packet);
> +    int (*pop_header)(struct dp_packet *packet, uint32_t rx_ol_flags);
> 
>      /* Returns the id of the numa node the 'netdev' is on.  If there is no
>       * such info, returns NETDEV_NUMA_UNSPEC. */ diff --git a/lib/netdev-
> vport.c b/lib/netdev-vport.c index e398562..feae11b 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
> @@ -873,6 +873,74 @@ ipv6_hdr(void *eth)  }
> 
>  static void *
> +ip_extract_tnl_md_with_valid_checksum(struct dp_packet *packet,
> +                            struct flow_tnl *tnl, unsigned int *hlen) {
> +    void *nh;
> +    struct ip_header *ip;
> +    struct ovs_16aligned_ip6_hdr *ip6;
> +    void *l4;
> +    int l3_size;
> +
> +    nh = dp_packet_l3(packet);
> +    ip = nh;
> +    ip6 = nh;
> +    l4 = dp_packet_l4(packet);
> +
> +    if (!nh || !l4) {
> +        return NULL;
> +    }
> +
> +    *hlen = sizeof(struct eth_header);
> +
> +    l3_size = dp_packet_size(packet) -
> +                ((char *)nh - (char *)dp_packet_data(packet));
> +
> +    if (IP_VER(ip->ip_ihl_ver) == 4) {
> +
> +        ovs_be32 ip_src, ip_dst;
> +
> +        if (ntohs(ip->ip_tot_len) > l3_size) {
> +            VLOG_WARN_RL(&err_rl,
> +                         "ip packet is truncated (IP length %d, actual %d)",
> +                         ntohs(ip->ip_tot_len), l3_size);
> +            return NULL;
> +        }
> +        if (IP_IHL(ip->ip_ihl_ver) * 4 > sizeof(struct ip_header)) {
> +            VLOG_WARN_RL(&err_rl, "ip options not supported on tunnel
> packets "
> +                                  "(%d bytes)", IP_IHL(ip->ip_ihl_ver) * 4);
> +            return NULL;
> +        }
> +
> +        ip_src = get_16aligned_be32(&ip->ip_src);
> +        ip_dst = get_16aligned_be32(&ip->ip_dst);
> +
> +        tnl->ip_src = ip_src;
> +        tnl->ip_dst = ip_dst;
> +        tnl->ip_tos = ip->ip_tos;
> +        tnl->ip_ttl = ip->ip_ttl;
> +
> +        *hlen += IP_HEADER_LEN;
> +
> +    } else if (IP_VER(ip->ip_ihl_ver) == 6) {
> +
> +        memcpy(tnl->ipv6_src.s6_addr, ip6->ip6_src.be16, sizeof ip6->ip6_src);
> +        memcpy(tnl->ipv6_dst.s6_addr, ip6->ip6_dst.be16, sizeof ip6-
> >ip6_dst);
> +        tnl->ip_tos = 0;
> +        tnl->ip_ttl = ip6->ip6_hlim;
> +
> +        *hlen += IPV6_HEADER_LEN;
> +
> +    } else {
> +        VLOG_WARN_RL(&err_rl, "ipv4 packet has invalid version (%d)",
> +                                              IP_VER(ip->ip_ihl_ver));
> +        return NULL;
> +    }
> +
> +    return l4;
> +}
> +
> +static void *
>  ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
>                    unsigned int *hlen)
>  {
> @@ -989,6 +1057,23 @@ push_ip_header(struct dp_packet *packet,  }
> 
>  static void *
> +udp_extract_tnl_md_with_valid_checksum(struct dp_packet *packet,
> +        struct flow_tnl *tnl, unsigned int *hlen) {
> +    struct udp_header *udp;
> +
> +    udp = ip_extract_tnl_md_with_valid_checksum(packet, tnl, hlen);
> +    if (!udp) {
> +        return NULL;
> +    }
> +    tnl->flags |= FLOW_TNL_F_CSUM;
> +    tnl->tp_src = udp->udp_src;
> +    tnl->tp_dst = udp->udp_dst;
> +
> +    return udp + 1;
> +}
> +
> +static void *
>  udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
>                     unsigned int *hlen)
>  {
> @@ -1183,7 +1268,7 @@ pkt_metadata_init_tnl(struct pkt_metadata *md)  }
> 
>  static int
> -netdev_gre_pop_header(struct dp_packet *packet)
> +netdev_gre_pop_header(struct dp_packet *packet, uint32_t rx_ol_flags)
>  {
>      struct pkt_metadata *md = &packet->md;
>      struct flow_tnl *tnl = &md->tunnel; @@ -1280,19 +1365,21 @@
> netdev_gre_build_header(const struct netdev *netdev,  }
> 
>  static int
> -netdev_vxlan_pop_header(struct dp_packet *packet)
> +netdev_vxlan_pop_header(struct dp_packet *packet, uint32_t rx_ol_flags)
>  {
>      struct pkt_metadata *md = &packet->md;
>      struct flow_tnl *tnl = &md->tunnel;
>      struct vxlanhdr *vxh;
>      unsigned int hlen;
> 
> +    rx_ol_flags = get_checksum_ofld_flags(rx_ol_flags);
>      pkt_metadata_init_tnl(md);
>      if (VXLAN_HLEN > dp_packet_l4_size(packet)) {
>          return EINVAL;
>      }
> 
> -    vxh = udp_extract_tnl_md(packet, tnl, &hlen);
> +    vxh = rx_ol_flags ? udp_extract_tnl_md_with_valid_checksum(packet,
> tnl,
> +                        &hlen) : udp_extract_tnl_md(packet, tnl,
> + &hlen);
>      if (!vxh) {
>          return EINVAL;
>      }
> @@ -1338,7 +1425,7 @@ netdev_vxlan_build_header(const struct netdev
> *netdev,  }
> 
>  static int
> -netdev_geneve_pop_header(struct dp_packet *packet)
> +netdev_geneve_pop_header(struct dp_packet *packet, uint32_t
> +rx_ol_flags)
>  {
>      struct pkt_metadata *md = &packet->md;
>      struct flow_tnl *tnl = &md->tunnel; diff --git a/lib/netdev.c b/lib/netdev.c
> index 3e50694..d1494fd 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -767,7 +767,8 @@ netdev_send(struct netdev *netdev, int qid, struct
> dp_packet **buffers,  }
> 
>  int
> -netdev_pop_header(struct netdev *netdev, struct dp_packet **buffers,
> int cnt)
> +netdev_pop_header(struct netdev *netdev, struct dp_packet **buffers,
> int cnt,
> +                  uint32_t rx_ol_flags)
>  {
>      int i;
> 
> @@ -778,7 +779,7 @@ netdev_pop_header(struct netdev *netdev, struct
> dp_packet **buffers, int cnt)
>      for (i = 0; i < cnt; i++) {
>          int err;
> 
> -        err = netdev->netdev_class->pop_header(buffers[i]);
> +        err = netdev->netdev_class->pop_header(buffers[i],
> + rx_ol_flags);
>          if (err) {
>              dp_packet_clear(buffers[i]);
>          }
> diff --git a/lib/netdev.h b/lib/netdev.h index 05968b2..6de0381 100644
> --- a/lib/netdev.h
> +++ b/lib/netdev.h
> @@ -193,7 +193,7 @@ int netdev_push_header(const struct netdev
> *netdev,
>                         struct dp_packet **buffers, int cnt,
>                         const struct ovs_action_push_tnl *data);  int
> netdev_pop_header(struct netdev *netdev, struct dp_packet **buffers,
> -                      int cnt);
> +                      int cnt, uint32_t rx_ol_flags);
> 
>  /* Hardware address. */
>  int netdev_set_etheraddr(struct netdev *, const struct eth_addr mac); diff -
> -git a/lib/odp-execute.c b/lib/odp-execute.c index b5204b2..06a0c30 100644
> --- a/lib/odp-execute.c
> +++ b/lib/odp-execute.c
> @@ -477,7 +477,7 @@ odp_execute_sample(void *dp, struct dp_packet
> *packet, bool steal,
>      }
> 
>      odp_execute_actions(dp, &packet, 1, steal, nl_attr_get(subactions),
> -                        nl_attr_get_size(subactions), dp_execute_action);
> +                        nl_attr_get_size(subactions),
> + dp_execute_action, 0);
>  }
> 
>  static bool
> @@ -516,7 +516,7 @@ requires_datapath_assistance(const struct nlattr *a)
> void  odp_execute_actions(void *dp, struct dp_packet **packets, int cnt,
> bool steal,
>                      const struct nlattr *actions, size_t actions_len,
> -                    odp_execute_cb dp_execute_action)
> +                    odp_execute_cb dp_execute_action, uint32_t
> + rx_ol_flags)
>  {
>      const struct nlattr *a;
>      unsigned int left;
> @@ -532,7 +532,7 @@ odp_execute_actions(void *dp, struct dp_packet
> **packets, int cnt, bool steal,
>                   * not need it any more. */
>                  bool may_steal = steal && last_action;
> 
> -                dp_execute_action(dp, packets, cnt, a, may_steal);
> +                dp_execute_action(dp, packets, cnt, a, may_steal,
> + rx_ol_flags);
> 
>                  if (last_action) {
>                      /* We do not need to free the packets. dp_execute_actions() diff
> --git a/lib/odp-execute.h b/lib/odp-execute.h index c602bb4..49e587b
> 100644
> --- a/lib/odp-execute.h
> +++ b/lib/odp-execute.h
> @@ -28,7 +28,8 @@ struct dp_packet;
>  struct pkt_metadata;
> 
>  typedef void (*odp_execute_cb)(void *dp, struct dp_packet **packets, int
> cnt,
> -                               const struct nlattr *action, bool may_steal);
> +                               const struct nlattr *action, bool may_steal,
> +                               uint32_t rx_ol_flags);
> 
>  /* Actions that need to be executed in the context of a datapath are handed
>   * to 'dp_execute_action', if non-NULL.  Currently this is called only for @@ -
> 37,5 +38,6 @@ typedef void (*odp_execute_cb)(void *dp, struct dp_packet
> **packets, int cnt,  void odp_execute_actions(void *dp, struct dp_packet
> **packets, int cnt,
>                           bool steal,
>                           const struct nlattr *actions, size_t actions_len,
> -                         odp_execute_cb dp_execute_action);
> +                         odp_execute_cb dp_execute_action,
> +                         uint32_t rx_ol_flags);
>  #endif
> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index
> a02dc24..848a94a 100644
> --- a/ofproto/ofproto-dpif-xlate.c
> +++ b/ofproto/ofproto-dpif-xlate.c
> @@ -3593,7 +3593,8 @@ execute_controller_action(struct xlate_ctx *ctx, int
> len,
>      packet = dp_packet_clone(ctx->xin->packet);
> 
>      odp_execute_actions(NULL, &packet, 1, false,
> -                        ctx->odp_actions->data, ctx->odp_actions->size, NULL);
> +                        ctx->odp_actions->data, ctx->odp_actions->size, NULL,
> +                        0);
> 
>      /* A packet sent by an action in a table-miss rule is considered an
>       * explicit table miss.  OpenFlow before 1.3 doesn't have that concept so
> --
> 2.5.0
Jesse Gross April 11, 2016, 4:32 p.m. UTC | #2
On Mon, Apr 11, 2016 at 2:52 AM, Sugesh Chandran
<sugesh.chandran@intel.com> wrote:
> Optimizing VxLAN tunneling performance in userspace datapath by offloading
> the rx checksum validation on tunnel packets to the NIC when it is supported.
>
> Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>

Thanks for working on this. I think there is a lot of room for OVS
DPDK to take advantage of common NIC offloads.

I have some high level comments:

* I noticed that you've been following up you patches with a second
message with performance numbers. I think it would be better to just
include this in the patch description since that's what will become
the commit message. Even if the numbers might change over time, it's
useful to have a reference to what the expected performance benefit
was at the time the patch was authored. Related to this, it would also
be useful to have more context on the benchmark numbers that you are
providing. For example, I'm guessing that the test was run using VXLAN
with outer UDP checksums? What NIC was used?

* I think we need to have a clearer way to pass around checksum state
rather than as an argument, it's just too invasive. A natural place
that most OSs use is in metadata associate with the packet itself,
which would be struct dp_packet in this case. It's also important to
consider how this will be expanded over time. For example, offloading
of transmit checksums would likely want to use the same fields.

* Please find a way to do this without duplicating the tunnel header
extraction code. Having two copies will surely result in bugs over
time. Potentially the previous point may help with this.

* Please try to generalize your code where possible. In this case it
looks like IPv6, Geneve, and GRE (at least the IP checksum) could
easily be supported using the same mechanism.
Chandran, Sugesh April 13, 2016, 2:38 p.m. UTC | #3
Hi Jesse,
Thank you for looking into the patch. I will send out v2 patch after incorporating your comments.

Regards
_Sugesh

> -----Original Message-----

> From: Jesse Gross [mailto:jesse@kernel.org]

> Sent: Monday, April 11, 2016 5:33 PM

> To: Chandran, Sugesh <sugesh.chandran@intel.com>

> Cc: ovs dev <dev@openvswitch.org>

> Subject: Re: [ovs-dev] [PATCH] tunneling: Improving VxLAN tunneling

> performance using DPDK Rx checksum offloading feature.

> 

> On Mon, Apr 11, 2016 at 2:52 AM, Sugesh Chandran

> <sugesh.chandran@intel.com> wrote:

> > Optimizing VxLAN tunneling performance in userspace datapath by

> > offloading the rx checksum validation on tunnel packets to the NIC when it

> is supported.

> >

> > Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>

> 

> Thanks for working on this. I think there is a lot of room for OVS DPDK to take

> advantage of common NIC offloads.

> 

> I have some high level comments:

> 

> * I noticed that you've been following up you patches with a second message

> with performance numbers. I think it would be better to just include this in

> the patch description since that's what will become the commit message.

> Even if the numbers might change over time, it's useful to have a reference

> to what the expected performance benefit was at the time the patch was

> authored. Related to this, it would also be useful to have more context on

> the benchmark numbers that you are providing. For example, I'm guessing

> that the test was run using VXLAN with outer UDP checksums? What NIC was

> used?

[Sugesh] Yes, it's VxLAN with outer UDP checksums. I have used Intel Niantic 10G NICs.
The test setup is as follows
NIC <--> OVS<-->NIC

64 Byte UDP stream (14 million) -->
114 Byte VxLAN stream(9.3 Million) <--

Results :
(Unidirectional, on DECAP side)
6.53 Million(Before Patch)
8.17 Million(After Patch)

Bidirectional
4.14 Million (Before patch, on each side)
4.43 Million(After the patch, on each side)


> 

> * I think we need to have a clearer way to pass around checksum state rather

> than as an argument, it's just too invasive. A natural place that most OSs use

> is in metadata associate with the packet itself, which would be struct

> dp_packet in this case. It's also important to consider how this will be

> expanded over time. For example, offloading of transmit checksums would

> likely want to use the same fields.

> 

> * Please find a way to do this without duplicating the tunnel header

> extraction code. Having two copies will surely result in bugs over time.

> Potentially the previous point may help with this.

> 

> * Please try to generalize your code where possible. In this case it looks like

> IPv6, Geneve, and GRE (at least the IP checksum) could easily be supported

> using the same mechanism.

[Sugesh] I modified the code to use the packet metadata and the same header
extraction functions. However, we noticed a 1-2% smaller performance improvement
due to the overhead of packet metadata handling and validation in the extraction code.
In both cases we also noticed a ~1% performance drop in the PHY-PHY bidirectional test.
This is because of the new invalid-checksum flag check on every packet
received from the NIC.
diff mbox

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2870951..d8ab8b5 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -477,9 +477,11 @@  static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                       struct dp_packet **, int c,
                                       bool may_steal,
                                       const struct nlattr *actions,
-                                      size_t actions_len);
+                                      size_t actions_len,
+                                      uint32_t rx_ol_flags);
 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
-                            struct dp_packet **, int cnt, odp_port_t port_no);
+                                      struct dp_packet **, int cnt,
+                                      struct dp_netdev_port *port);
 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                   struct dp_packet **, int cnt);
 
@@ -2369,7 +2371,7 @@  dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
 
     pp = execute->packet;
     dp_netdev_execute_actions(pmd, &pp, 1, false, execute->actions,
-                              execute->actions_len);
+                              execute->actions_len, 0);
     if (pmd->core_id == NON_PMD_CORE_ID) {
         dp_netdev_pmd_unref(pmd);
         ovs_mutex_unlock(&dp->port_mutex);
@@ -2572,7 +2574,7 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         *recirc_depth_get() = 0;
 
         cycles_count_start(pmd);
-        dp_netdev_input(pmd, packets, cnt, port->port_no);
+        dp_netdev_input(pmd, packets, cnt, port);
         cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
     } else if (error != EAGAIN && error != EOPNOTSUPP) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
@@ -3365,7 +3367,7 @@  packet_batch_init(struct packet_batch *batch, struct dp_netdev_flow *flow)
 static inline void
 packet_batch_execute(struct packet_batch *batch,
                      struct dp_netdev_pmd_thread *pmd,
-                     long long now)
+                     long long now, uint32_t rx_ol_flags)
 {
     struct dp_netdev_actions *actions;
     struct dp_netdev_flow *flow = batch->flow;
@@ -3376,7 +3378,7 @@  packet_batch_execute(struct packet_batch *batch,
     actions = dp_netdev_flow_get_actions(flow);
 
     dp_netdev_execute_actions(pmd, batch->packets, batch->packet_count, true,
-                              actions->actions, actions->size);
+                              actions->actions, actions->size, rx_ol_flags);
 }
 
 static inline void
@@ -3394,6 +3396,19 @@  dp_netdev_queue_batches(struct dp_packet *pkt,
     packet_batch_update(batch, pkt, mf);
 }
 
+static inline bool
+is_checksum_valid(struct dp_packet *packet) {
+#ifdef DPDK_NETDEV
+    if (packet->mbuf.ol_flags & (PKT_RX_IP_CKSUM_BAD |
+                                 PKT_RX_L4_CKSUM_BAD)) {
+        return 0;
+    }
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 /* Try to process all ('cnt') the 'packets' using only the exact match cache
  * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
  * miniflow is copied into 'keys' and the packet pointer is moved at the
@@ -3409,7 +3424,8 @@  static inline size_t
 emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
                size_t cnt, struct netdev_flow_key *keys,
                struct packet_batch batches[], size_t *n_batches,
-               bool md_is_valid, odp_port_t port_no)
+               bool md_is_valid, struct dp_netdev_port *port,
+               uint32_t rx_checksum_ofld_capa)
 {
     struct emc_cache *flow_cache = &pmd->flow_cache;
     struct netdev_flow_key *key = &keys[0];
@@ -3425,6 +3441,13 @@  emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
             continue;
         }
 
+        if (OVS_UNLIKELY(rx_checksum_ofld_capa &&
+                         !is_checksum_valid(packet))) {
+            dp_packet_delete(packet);
+            n_dropped++;
+            continue;
+        }
+
         if (i != cnt - 1) {
             /* Prefetch next packet data and metadata. */
             OVS_PREFETCH(dp_packet_data(packets[i+1]));
@@ -3432,7 +3455,7 @@  emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
         }
 
         if (!md_is_valid) {
-            pkt_metadata_init(&packet->md, port_no);
+            pkt_metadata_init(&packet->md, port->port_no);
         }
         miniflow_extract(packet, &key->mf);
         key->len = 0; /* Not computed yet. */
@@ -3462,7 +3485,8 @@  static inline void
 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet **packets, size_t cnt,
                      struct netdev_flow_key *keys,
-                     struct packet_batch batches[], size_t *n_batches)
+                     struct packet_batch batches[], size_t *n_batches,
+                     uint32_t rx_ol_flags)
 {
 #if !defined(__CHECKER__) && !defined(_WIN32)
     const size_t PKT_ARRAY_SIZE = cnt;
@@ -3541,7 +3565,7 @@  fast_path_processing(struct dp_netdev_pmd_thread *pmd,
              * the actions.  Otherwise, if there are any slow path actions,
              * we'll send the packet up twice. */
             dp_netdev_execute_actions(pmd, &packets[i], 1, true,
-                                      actions.data, actions.size);
+                                      actions.data, actions.size, rx_ol_flags);
 
             add_actions = put_actions.size ? &put_actions : &actions;
             if (OVS_LIKELY(error != ENOSPC)) {
@@ -3606,7 +3630,7 @@  fast_path_processing(struct dp_netdev_pmd_thread *pmd,
 static void
 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                   struct dp_packet **packets, int cnt,
-                  bool md_is_valid, odp_port_t port_no)
+                  bool md_is_valid, struct dp_netdev_port *port)
 {
 #if !defined(__CHECKER__) && !defined(_WIN32)
     const size_t PKT_ARRAY_SIZE = cnt;
@@ -3618,12 +3642,18 @@  dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
     struct packet_batch batches[PKT_ARRAY_SIZE];
     long long now = time_msec();
     size_t newcnt, n_batches, i;
+    uint32_t rx_ol_flags;
 
     n_batches = 0;
+    /* Collect the Rx offloading features only if its DPDK port */
+    rx_ol_flags = port && (packets[0]->source == DPBUF_DPDK) ?
+                            netdev_get_rx_offload_capa(port->netdev) :
+                            0;
     newcnt = emc_processing(pmd, packets, cnt, keys, batches, &n_batches,
-                            md_is_valid, port_no);
+                            md_is_valid, port, rx_ol_flags);
     if (OVS_UNLIKELY(newcnt)) {
-        fast_path_processing(pmd, packets, newcnt, keys, batches, &n_batches);
+        fast_path_processing(pmd, packets, newcnt, keys, batches,
+                                           &n_batches, rx_ol_flags);
     }
 
     for (i = 0; i < n_batches; i++) {
@@ -3631,16 +3661,16 @@  dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
     }
 
     for (i = 0; i < n_batches; i++) {
-        packet_batch_execute(&batches[i], pmd, now);
+        packet_batch_execute(&batches[i], pmd, now, rx_ol_flags);
     }
 }
 
 static void
 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
                 struct dp_packet **packets, int cnt,
-                odp_port_t port_no)
+                struct dp_netdev_port *port)
 {
-     dp_netdev_input__(pmd, packets, cnt, false, port_no);
+     dp_netdev_input__(pmd, packets, cnt, false, port);
 }
 
 static void
@@ -3716,7 +3746,7 @@  dp_netdev_clone_pkt_batch(struct dp_packet **dst_pkts,
 
 static void
 dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt,
-              const struct nlattr *a, bool may_steal)
+              const struct nlattr *a, bool may_steal, uint32_t rx_ol_flags)
     OVS_NO_THREAD_SAFETY_ANALYSIS
 {
     struct dp_netdev_execute_aux *aux = aux_;
@@ -3776,7 +3806,7 @@  dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt,
                    packets = tnl_pkt;
                 }
 
-                err = netdev_pop_header(p->netdev, packets, cnt);
+                err = netdev_pop_header(p->netdev, packets, cnt, rx_ol_flags);
                 if (!err) {
 
                     for (i = 0; i < cnt; i++) {
@@ -3816,7 +3846,7 @@  dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt,
                                          NULL);
                 if (!error || error == ENOSPC) {
                     dp_netdev_execute_actions(pmd, &packets[i], 1, may_steal,
-                                              actions.data, actions.size);
+                                              actions.data, actions.size, 0);
                 } else if (may_steal) {
                     dp_packet_delete(packets[i]);
                 }
@@ -3878,12 +3908,13 @@  static void
 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                           struct dp_packet **packets, int cnt,
                           bool may_steal,
-                          const struct nlattr *actions, size_t actions_len)
+                          const struct nlattr *actions, size_t actions_len,
+                          uint32_t rx_ol_flags)
 {
     struct dp_netdev_execute_aux aux = { pmd };
 
     odp_execute_actions(&aux, packets, cnt, may_steal, actions,
-                        actions_len, dp_execute_cb);
+                        actions_len, dp_execute_cb, rx_ol_flags);
 }
 
 const struct dpif_class dpif_netdev_class = {
diff --git a/lib/dpif.c b/lib/dpif.c
index a784de7..760719b 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1088,7 +1088,8 @@  struct dpif_execute_helper_aux {
  * meaningful. */
 static void
 dpif_execute_helper_cb(void *aux_, struct dp_packet **packets, int cnt,
-                       const struct nlattr *action, bool may_steal OVS_UNUSED)
+                       const struct nlattr *action, bool may_steal OVS_UNUSED,
+                       uint32_t rx_ol_flags)
 {
     struct dpif_execute_helper_aux *aux = aux_;
     int type = nl_attr_type(action);
@@ -1167,7 +1168,7 @@  dpif_execute_with_help(struct dpif *dpif, struct dpif_execute *execute)
 
     pp = execute->packet;
     odp_execute_actions(&aux, &pp, 1, false, execute->actions,
-                        execute->actions_len, dpif_execute_helper_cb);
+                        execute->actions_len, dpif_execute_helper_cb, 0);
     return aux.error;
 }
 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index c7217ea..5e951c7 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -321,6 +321,10 @@  struct netdev_dpdk {
     struct rte_eth_link link;
     int link_reset_cnt;
 
+    /* RX offload capability bitmask, RX checksum offloading is only
+     * in use now */
+    uint32_t rx_ol_flags;
+
     /* The user might request more txqs than the NIC has.  We remap those
      * ('up.n_txq') on these ('real_n_txq').
      * If the numbers match, 'txq_needs_locking' is false, otherwise it is
@@ -527,7 +531,8 @@  dpdk_watchdog(void *dummy OVS_UNUSED)
 }
 
 static int
-dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
+dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq,
+                         const struct rte_eth_conf *new_port_conf)
 {
     int diag = 0;
     int i;
@@ -542,7 +547,8 @@  dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
             VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
         }
 
-        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);
+        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq,
+                                     new_port_conf);
         if (diag) {
             break;
         }
@@ -596,6 +602,7 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
     struct rte_pktmbuf_pool_private *mbp_priv;
     struct rte_eth_dev_info info;
     struct ether_addr eth_addr;
+    struct rte_eth_conf new_port_conf;
     int diag;
     int n_rxq, n_txq;
 
@@ -608,7 +615,17 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
     n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
     n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
 
-    diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
+    new_port_conf = port_conf;
+    /* Enable rx checksum offload if it is supported by the NIC */
+    if (info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) {
+        dev->rx_ol_flags = info.rx_offload_capa &
+                           (DEV_RX_OFFLOAD_UDP_CKSUM |
+                            DEV_RX_OFFLOAD_TCP_CKSUM |
+                            DEV_RX_OFFLOAD_IPV4_CKSUM);
+        new_port_conf.rxmode.hw_ip_checksum = 1;
+    }
+
+    diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq, &new_port_conf);
     if (diag) {
         VLOG_ERR("Interface %s(rxq:%d txq:%d) configure error: %s",
                  dev->up.name, n_rxq, n_txq, rte_strerror(-diag));
@@ -725,6 +742,7 @@  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     netdev->n_rxq = NR_QUEUE;
     netdev->requested_n_rxq = NR_QUEUE;
     dev->real_n_txq = NR_QUEUE;
+    dev->rx_ol_flags = 0;
 
     if (type == DPDK_DEV_ETH) {
         netdev_dpdk_alloc_txq(dev, NR_QUEUE);
@@ -2927,3 +2945,10 @@  dpdk_thread_is_pmd(void)
 {
     return rte_lcore_id() != NON_PMD_CORE_ID;
 }
+
+uint32_t
+netdev_get_rx_offload_capa(struct netdev *netdev_) {
+    struct netdev_dpdk *netdev;
+    netdev = netdev_dpdk_cast(netdev_);
+    return netdev->rx_ol_flags;
+}
diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
index 646d3e2..341d560 100644
--- a/lib/netdev-dpdk.h
+++ b/lib/netdev-dpdk.h
@@ -19,6 +19,7 @@  struct dp_packet;
 #include <rte_spinlock.h>
 #include <rte_launch.h>
 #include <rte_malloc.h>
+#include "netdev-provider.h"
 
 #define NON_PMD_CORE_ID LCORE_ID_ANY
 
@@ -26,6 +27,12 @@  int dpdk_init(int argc, char **argv);
 void netdev_dpdk_register(void);
 void free_dpdk_buf(struct dp_packet *);
 int pmd_thread_setaffinity_cpu(unsigned cpu);
+uint32_t netdev_get_rx_offload_capa(struct netdev *netdev_);
+
+static inline uint32_t
+get_checksum_ofld_flags(uint32_t rx_ol_flags) {
+    return rx_ol_flags & DEV_RX_OFFLOAD_IPV4_CKSUM;
+}
 
 #else
 
@@ -60,5 +67,15 @@  pmd_thread_setaffinity_cpu(unsigned cpu OVS_UNUSED)
     return 0;
 }
 
+static uint32_t
+netdev_get_rx_offload_capa(struct netdev *netdev_) {
+    return 0;
+}
+
+static inline uint32_t
+get_checksum_ofld_flags(uint32_t rx_ol_flags) {
+    return 0;
+}
+
 #endif /* DPDK_NETDEV */
 #endif
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index cda25eb..8f829e6 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -276,7 +276,7 @@  struct netdev_class {
 
     /* Pop tunnel header from packet, build tunnel metadata and resize packet
      * for further processing. */
-    int (*pop_header)(struct dp_packet *packet);
+    int (*pop_header)(struct dp_packet *packet, uint32_t rx_ol_flags);
 
     /* Returns the id of the numa node the 'netdev' is on.  If there is no
      * such info, returns NETDEV_NUMA_UNSPEC. */
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index e398562..feae11b 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -873,6 +873,74 @@  ipv6_hdr(void *eth)
 }
 
 static void *
+ip_extract_tnl_md_with_valid_checksum(struct dp_packet *packet,
+                            struct flow_tnl *tnl, unsigned int *hlen)
+{
+    void *nh;
+    struct ip_header *ip;
+    struct ovs_16aligned_ip6_hdr *ip6;
+    void *l4;
+    int l3_size;
+
+    nh = dp_packet_l3(packet);
+    ip = nh;
+    ip6 = nh;
+    l4 = dp_packet_l4(packet);
+
+    if (!nh || !l4) {
+        return NULL;
+    }
+
+    *hlen = sizeof(struct eth_header);
+
+    l3_size = dp_packet_size(packet) -
+                ((char *)nh - (char *)dp_packet_data(packet));
+
+    if (IP_VER(ip->ip_ihl_ver) == 4) {
+
+        ovs_be32 ip_src, ip_dst;
+
+        if (ntohs(ip->ip_tot_len) > l3_size) {
+            VLOG_WARN_RL(&err_rl,
+                         "ip packet is truncated (IP length %d, actual %d)",
+                         ntohs(ip->ip_tot_len), l3_size);
+            return NULL;
+        }
+        if (IP_IHL(ip->ip_ihl_ver) * 4 > sizeof(struct ip_header)) {
+            VLOG_WARN_RL(&err_rl, "ip options not supported on tunnel packets "
+                                  "(%d bytes)", IP_IHL(ip->ip_ihl_ver) * 4);
+            return NULL;
+        }
+
+        ip_src = get_16aligned_be32(&ip->ip_src);
+        ip_dst = get_16aligned_be32(&ip->ip_dst);
+
+        tnl->ip_src = ip_src;
+        tnl->ip_dst = ip_dst;
+        tnl->ip_tos = ip->ip_tos;
+        tnl->ip_ttl = ip->ip_ttl;
+
+        *hlen += IP_HEADER_LEN;
+
+    } else if (IP_VER(ip->ip_ihl_ver) == 6) {
+
+        memcpy(tnl->ipv6_src.s6_addr, ip6->ip6_src.be16, sizeof ip6->ip6_src);
+        memcpy(tnl->ipv6_dst.s6_addr, ip6->ip6_dst.be16, sizeof ip6->ip6_dst);
+        tnl->ip_tos = 0;
+        tnl->ip_ttl = ip6->ip6_hlim;
+
+        *hlen += IPV6_HEADER_LEN;
+
+    } else {
+        VLOG_WARN_RL(&err_rl, "ipv4 packet has invalid version (%d)",
+                                              IP_VER(ip->ip_ihl_ver));
+        return NULL;
+    }
+
+    return l4;
+}
+
+static void *
 ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
                   unsigned int *hlen)
 {
@@ -989,6 +1057,23 @@  push_ip_header(struct dp_packet *packet,
 }
 
 static void *
+udp_extract_tnl_md_with_valid_checksum(struct dp_packet *packet,
+        struct flow_tnl *tnl, unsigned int *hlen)
+{
+    struct udp_header *udp;
+
+    udp = ip_extract_tnl_md_with_valid_checksum(packet, tnl, hlen);
+    if (!udp) {
+        return NULL;
+    }
+    tnl->flags |= FLOW_TNL_F_CSUM;
+    tnl->tp_src = udp->udp_src;
+    tnl->tp_dst = udp->udp_dst;
+
+    return udp + 1;
+}
+
+static void *
 udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
                    unsigned int *hlen)
 {
@@ -1183,7 +1268,7 @@  pkt_metadata_init_tnl(struct pkt_metadata *md)
 }
 
 static int
-netdev_gre_pop_header(struct dp_packet *packet)
+netdev_gre_pop_header(struct dp_packet *packet, uint32_t rx_ol_flags)
 {
     struct pkt_metadata *md = &packet->md;
     struct flow_tnl *tnl = &md->tunnel;
@@ -1280,19 +1365,21 @@  netdev_gre_build_header(const struct netdev *netdev,
 }
 
 static int
-netdev_vxlan_pop_header(struct dp_packet *packet)
+netdev_vxlan_pop_header(struct dp_packet *packet, uint32_t rx_ol_flags)
 {
     struct pkt_metadata *md = &packet->md;
     struct flow_tnl *tnl = &md->tunnel;
     struct vxlanhdr *vxh;
     unsigned int hlen;
 
+    rx_ol_flags = get_checksum_ofld_flags(rx_ol_flags);
     pkt_metadata_init_tnl(md);
     if (VXLAN_HLEN > dp_packet_l4_size(packet)) {
         return EINVAL;
     }
 
-    vxh = udp_extract_tnl_md(packet, tnl, &hlen);
+    vxh = rx_ol_flags ? udp_extract_tnl_md_with_valid_checksum(packet, tnl,
+                        &hlen) : udp_extract_tnl_md(packet, tnl, &hlen);
     if (!vxh) {
         return EINVAL;
     }
@@ -1338,7 +1425,7 @@  netdev_vxlan_build_header(const struct netdev *netdev,
 }
 
 static int
-netdev_geneve_pop_header(struct dp_packet *packet)
+netdev_geneve_pop_header(struct dp_packet *packet, uint32_t rx_ol_flags)
 {
     struct pkt_metadata *md = &packet->md;
     struct flow_tnl *tnl = &md->tunnel;
diff --git a/lib/netdev.c b/lib/netdev.c
index 3e50694..d1494fd 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -767,7 +767,8 @@  netdev_send(struct netdev *netdev, int qid, struct dp_packet **buffers,
 }
 
 int
-netdev_pop_header(struct netdev *netdev, struct dp_packet **buffers, int cnt)
+netdev_pop_header(struct netdev *netdev, struct dp_packet **buffers, int cnt,
+                  uint32_t rx_ol_flags)
 {
     int i;
 
@@ -778,7 +779,7 @@  netdev_pop_header(struct netdev *netdev, struct dp_packet **buffers, int cnt)
     for (i = 0; i < cnt; i++) {
         int err;
 
-        err = netdev->netdev_class->pop_header(buffers[i]);
+        err = netdev->netdev_class->pop_header(buffers[i], rx_ol_flags);
         if (err) {
             dp_packet_clear(buffers[i]);
         }
diff --git a/lib/netdev.h b/lib/netdev.h
index 05968b2..6de0381 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -193,7 +193,7 @@  int netdev_push_header(const struct netdev *netdev,
                        struct dp_packet **buffers, int cnt,
                        const struct ovs_action_push_tnl *data);
 int netdev_pop_header(struct netdev *netdev, struct dp_packet **buffers,
-                      int cnt);
+                      int cnt, uint32_t rx_ol_flags);
 
 /* Hardware address. */
 int netdev_set_etheraddr(struct netdev *, const struct eth_addr mac);
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index b5204b2..06a0c30 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -477,7 +477,7 @@  odp_execute_sample(void *dp, struct dp_packet *packet, bool steal,
     }
 
     odp_execute_actions(dp, &packet, 1, steal, nl_attr_get(subactions),
-                        nl_attr_get_size(subactions), dp_execute_action);
+                        nl_attr_get_size(subactions), dp_execute_action, 0);
 }
 
 static bool
@@ -516,7 +516,7 @@  requires_datapath_assistance(const struct nlattr *a)
 void
 odp_execute_actions(void *dp, struct dp_packet **packets, int cnt, bool steal,
                     const struct nlattr *actions, size_t actions_len,
-                    odp_execute_cb dp_execute_action)
+                    odp_execute_cb dp_execute_action, uint32_t rx_ol_flags)
 {
     const struct nlattr *a;
     unsigned int left;
@@ -532,7 +532,7 @@  odp_execute_actions(void *dp, struct dp_packet **packets, int cnt, bool steal,
                  * not need it any more. */
                 bool may_steal = steal && last_action;
 
-                dp_execute_action(dp, packets, cnt, a, may_steal);
+                dp_execute_action(dp, packets, cnt, a, may_steal, rx_ol_flags);
 
                 if (last_action) {
                     /* We do not need to free the packets. dp_execute_actions()
diff --git a/lib/odp-execute.h b/lib/odp-execute.h
index c602bb4..49e587b 100644
--- a/lib/odp-execute.h
+++ b/lib/odp-execute.h
@@ -28,7 +28,8 @@  struct dp_packet;
 struct pkt_metadata;
 
 typedef void (*odp_execute_cb)(void *dp, struct dp_packet **packets, int cnt,
-                               const struct nlattr *action, bool may_steal);
+                               const struct nlattr *action, bool may_steal,
+                               uint32_t rx_ol_flags);
 
 /* Actions that need to be executed in the context of a datapath are handed
  * to 'dp_execute_action', if non-NULL.  Currently this is called only for
@@ -37,5 +38,6 @@  typedef void (*odp_execute_cb)(void *dp, struct dp_packet **packets, int cnt,
 void odp_execute_actions(void *dp, struct dp_packet **packets, int cnt,
                          bool steal,
                          const struct nlattr *actions, size_t actions_len,
-                         odp_execute_cb dp_execute_action);
+                         odp_execute_cb dp_execute_action,
+                         uint32_t rx_ol_flags);
 #endif
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index a02dc24..848a94a 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -3593,7 +3593,8 @@  execute_controller_action(struct xlate_ctx *ctx, int len,
     packet = dp_packet_clone(ctx->xin->packet);
 
     odp_execute_actions(NULL, &packet, 1, false,
-                        ctx->odp_actions->data, ctx->odp_actions->size, NULL);
+                        ctx->odp_actions->data, ctx->odp_actions->size, NULL,
+                        0);
 
     /* A packet sent by an action in a table-miss rule is considered an
      * explicit table miss.  OpenFlow before 1.3 doesn't have that concept so