
[ovs-dev,v2] Userspace: Software fallback for UDP encapsulated TCP segmentation.

Message ID 20240613025401.4089319-1-mkp@redhat.com
State Changes Requested, archived
Series [ovs-dev,v2] Userspace: Software fallback for UDP encapsulated TCP segmentation.

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Mike Pattrick June 13, 2024, 2:54 a.m. UTC
When sending packets that are flagged as requiring segmentation to an
interface that doesn't support this feature, send the packet to the TSO
software fallback instead of dropping it.

Signed-off-by: Mike Pattrick <mkp@redhat.com>

---
v2:
 - Fixed udp tunnel length
 - Added test that UDP headers are correct
 - Split inner and outer ip_id into different counters
 - Set tunnel flags in reset_tcp_seg

Signed-off-by: Mike Pattrick <mkp@redhat.com>
---
 lib/dp-packet-gso.c     | 89 +++++++++++++++++++++++++++++++++--------
 lib/dp-packet.h         | 34 ++++++++++++++++
 lib/netdev-native-tnl.c |  8 ++++
 lib/netdev.c            | 37 +++++++----------
 tests/system-traffic.at | 87 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 216 insertions(+), 39 deletions(-)

Comments

David Marchand June 21, 2024, 3:07 p.m. UTC | #1
Hi Mike,

On Thu, Jun 13, 2024 at 4:54 AM Mike Pattrick <mkp@redhat.com> wrote:
>
> When sending packets that are flagged as requiring segmentation to an
> interface that doens't support this feature, send the packet to the TSO
> software fallback instead of dropping it.
>
> Signed-off-by: Mike Pattrick <mkp@redhat.com>

I started testing this patch (still not finished); I have some
questions and comments.

>
> ---
> v2:
>  - Fixed udp tunnel length
>  - Added test that UDP headers are correct
>  - Split inner and outer ip_id into different counters
>  - Set tunnel flags in reset_tcp_seg
>
> Signed-off-by: Mike Pattrick <mkp@redhat.com>
> ---
>  lib/dp-packet-gso.c     | 89 +++++++++++++++++++++++++++++++++--------
>  lib/dp-packet.h         | 34 ++++++++++++++++
>  lib/netdev-native-tnl.c |  8 ++++
>  lib/netdev.c            | 37 +++++++----------
>  tests/system-traffic.at | 87 ++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 216 insertions(+), 39 deletions(-)
>
> diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> index 847685ad9..dc43ad662 100644
> --- a/lib/dp-packet-gso.c
> +++ b/lib/dp-packet-gso.c
> @@ -47,6 +47,8 @@ dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
>      seg->l2_5_ofs = p->l2_5_ofs;
>      seg->l3_ofs = p->l3_ofs;
>      seg->l4_ofs = p->l4_ofs;
> +    seg->inner_l3_ofs = p->inner_l3_ofs;
> +    seg->inner_l4_ofs = p->inner_l4_ofs;
>
>      /* The protocol headers remain the same, so preserve hash and mark. */
>      *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p);
> @@ -71,7 +73,12 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
>      const char *data_tail;
>      const char *data_pos;
>
> -    data_pos = dp_packet_get_tcp_payload(p);
> +    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
> +        dp_packet_hwol_is_tunnel_geneve(p)) {
> +        data_pos = dp_packet_get_inner_tcp_payload(p);
> +    } else {
> +        data_pos = dp_packet_get_tcp_payload(p);
> +    }
>      data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
>
>      return DIV_ROUND_UP(data_tail - data_pos, segsz);
> @@ -89,14 +96,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>      static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>      struct dp_packet_batch *curr_batch = *batches;
>      struct tcp_header *tcp_hdr;
> +    struct udp_header *tnl_hdr;
>      struct ip_header *ip_hdr;
> +    uint16_t inner_ip_id = 0;
> +    uint16_t outer_ip_id = 0;
>      struct dp_packet *seg;
> +    const char *data_pos;
>      uint16_t tcp_offset;
>      uint16_t tso_segsz;
>      uint32_t tcp_seq;
> -    uint16_t ip_id;
> +    bool outer_ipv4;
>      int hdr_len;
>      int seg_len;
> +    bool tnl;
>
>      tso_segsz = dp_packet_get_tso_segsz(p);
>      if (!tso_segsz) {
> @@ -105,20 +117,38 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>          return false;
>      }
>
> -    tcp_hdr = dp_packet_l4(p);
> -    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> -    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> -    hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
> -              + tcp_offset * 4;
> -    ip_id = 0;
> -    if (dp_packet_hwol_is_ipv4(p)) {
> +    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
> +            dp_packet_hwol_is_tunnel_geneve(p)) {
> +        data_pos =  dp_packet_get_inner_tcp_payload(p);
> +        outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p);
> +        tcp_hdr = dp_packet_inner_l4(p);
> +        ip_hdr = dp_packet_inner_l3(p);
> +        tnl = true;
> +
> +        if (outer_ipv4) {
> +            outer_ip_id = ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id);
> +        }
> +        if (dp_packet_hwol_is_ipv4(p)) {
> +            inner_ip_id = ntohs(ip_hdr->ip_id);
> +        }
> +    } else {
> +        data_pos = dp_packet_get_tcp_payload(p);
> +        outer_ipv4 = dp_packet_hwol_is_ipv4(p);
> +        tcp_hdr = dp_packet_l4(p);
>          ip_hdr = dp_packet_l3(p);
> -        ip_id = ntohs(ip_hdr->ip_id);
> +        tnl = false;
> +
> +        if (outer_ipv4) {
> +            outer_ip_id = ntohs(ip_hdr->ip_id);
> +        }
>      }
>
> +    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> +    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> +    hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p))
> +              + tcp_offset * 4;
>      const char *data_tail = (char *) dp_packet_tail(p)
>                              - dp_packet_l2_pad_size(p);
> -    const char *data_pos = dp_packet_get_tcp_payload(p);
>      int n_segs = dp_packet_gso_nr_segs(p);
>
>      for (int i = 0; i < n_segs; i++) {
> @@ -130,14 +160,35 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>          seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
>          data_pos += seg_len;
>
> +        if (tnl) {
> +            /* Update tunnel inner L3 header. */
> +            if (dp_packet_hwol_is_ipv4(seg)) {
> +                ip_hdr = dp_packet_inner_l3(seg);
> +                ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> +                                           dp_packet_inner_l4_size(seg));
> +                ip_hdr->ip_id = htons(inner_ip_id);
> +                ip_hdr->ip_csum = 0;
> +                inner_ip_id++;
> +            } else {
> +                struct ovs_16aligned_ip6_hdr *ip6_hdr;
> +
> +                ip6_hdr = dp_packet_inner_l3(seg);
> +                ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
> +                    = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
> +            }
> +        }
> +
>          /* Update L3 header. */
> -        if (dp_packet_hwol_is_ipv4(seg)) {
> +        if (outer_ipv4) {
> +            uint16_t ip_tot_len;
>              ip_hdr = dp_packet_l3(seg);
> -            ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> -                                       dp_packet_l4_size(seg));
> -            ip_hdr->ip_id = htons(ip_id);
> +            tnl_hdr = dp_packet_l4(seg);
> +            ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg);
> +            ip_hdr->ip_tot_len = htons(ip_tot_len);
> +            ip_hdr->ip_id = htons(outer_ip_id);
>              ip_hdr->ip_csum = 0;
> -            ip_id++;
> +            tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN);

IIUC, this part is common for non-tunneled and tunneled packets but it
does not make sense to update tnl_hdr->udp_len for non-tunneled
packets.
Plus, udp_len is not adjusted in the ipv6 case, so the announced length
is that of the original TSO frame.

The udp_len update should probably move into the previous if (tnl) { block.
Something like:
@@ -176,18 +177,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
                 ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
                     = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
             }
+
+            tnl_hdr = dp_packet_l4(seg);
+            tnl_hdr->udp_len = htons(dp_packet_l4_size(seg));
         }

         /* Update L3 header. */

> +            outer_ip_id++;
>          } else {
>              struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
>
> @@ -146,7 +197,11 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>          }
>
>          /* Update L4 header. */
> -        tcp_hdr = dp_packet_l4(seg);
> +        if (tnl) {
> +            tcp_hdr = dp_packet_inner_l4(seg);
> +        } else {
> +            tcp_hdr = dp_packet_l4(seg);
> +        }
>          put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
>          tcp_seq += seg_len;
>          if (OVS_LIKELY(i < (n_segs - 1))) {
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index a75b1c5cd..af1ff088f 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -529,6 +529,16 @@ dp_packet_inner_l3(const struct dp_packet *b)
>             : NULL;
>  }
>
> +static inline size_t
> +dp_packet_inner_l3_size(const struct dp_packet *b)
> +{
> +    return OVS_LIKELY(b->inner_l3_ofs != UINT16_MAX)
> +           ? (const char *) dp_packet_tail(b)
> +           - (const char *) dp_packet_inner_l3(b)
> +           - dp_packet_l2_pad_size(b)
> +           : 0;
> +}
> +
>  static inline void *
>  dp_packet_inner_l4(const struct dp_packet *b)
>  {
> @@ -563,6 +573,22 @@ dp_packet_get_tcp_payload(const struct dp_packet *b)
>      return NULL;
>  }
>
> +static inline const void *
> +dp_packet_get_inner_tcp_payload(const struct dp_packet *b)
> +{
> +    size_t l4_size = dp_packet_inner_l4_size(b);
> +
> +    if (OVS_LIKELY(l4_size >= TCP_HEADER_LEN)) {
> +        struct tcp_header *tcp = dp_packet_inner_l4(b);
> +        int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
> +
> +        if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= l4_size)) {
> +            return (const char *) tcp + tcp_len;
> +        }
> +    }
> +    return NULL;
> +}
> +
>  static inline uint32_t
>  dp_packet_get_tcp_payload_length(const struct dp_packet *pkt)
>  {
> @@ -1320,6 +1346,14 @@ dp_packet_hwol_reset_tcp_seg(struct dp_packet *p)
>          ol_flags |= DP_PACKET_OL_TX_IP_CKSUM;
>      }
>
> +    if (ol_flags & (DP_PACKET_OL_TX_TUNNEL_VXLAN |
> +                    DP_PACKET_OL_TX_TUNNEL_GENEVE)) {
> +        if (ol_flags & DP_PACKET_OL_TX_OUTER_IPV4) {
> +            ol_flags |= DP_PACKET_OL_TX_OUTER_IP_CKSUM;
> +        }
> +        ol_flags |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM;
> +    }
> +
>      *dp_packet_ol_flags_ptr(p) = ol_flags;
>  }
>
> diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
> index 0f9f07f44..22cc858e7 100644
> --- a/lib/netdev-native-tnl.c
> +++ b/lib/netdev-native-tnl.c
> @@ -308,6 +308,14 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED,
>      if (l4_ofs != UINT16_MAX) {
>          packet->inner_l4_ofs = l4_ofs + data->header_len;
>      }
> +
> +    if (dp_packet_hwol_is_tso(packet)) {
> +        uint16_t tso_segsz = dp_packet_get_tso_segsz(packet);
> +        if (tso_segsz > data->header_len) {
> +            tso_segsz -= data->header_len;
> +            dp_packet_set_tso_segsz(packet, tso_segsz);
> +        }
> +    }
>  }
>
>  static void *

Is this hunk a fix valid even for hw-assisted tso?
If so, I would make it a separate patch.


> diff --git a/lib/netdev.c b/lib/netdev.c
> index f2d921ed6..1d59bbe5d 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -69,8 +69,6 @@ COVERAGE_DEFINE(netdev_received);
>  COVERAGE_DEFINE(netdev_sent);
>  COVERAGE_DEFINE(netdev_add_router);
>  COVERAGE_DEFINE(netdev_get_stats);
> -COVERAGE_DEFINE(netdev_vxlan_tso_drops);
> -COVERAGE_DEFINE(netdev_geneve_tso_drops);
>  COVERAGE_DEFINE(netdev_push_header_drops);
>  COVERAGE_DEFINE(netdev_soft_seg_good);
>  COVERAGE_DEFINE(netdev_soft_seg_drops);
> @@ -910,28 +908,23 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
>      struct dp_packet *packet;
>      int error;
>
> -    if (userspace_tso_enabled() &&
> -        !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> -        DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> -            if (dp_packet_hwol_is_tso(packet)) {
> -                if (dp_packet_hwol_is_tunnel_vxlan(packet)
> -                    && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) {
> -                        VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support",
> -                                     netdev_get_name(netdev));
> -                        COVERAGE_INC(netdev_vxlan_tso_drops);
> -                        dp_packet_delete_batch(batch, true);
> -                        return false;
> +    if (userspace_tso_enabled()) {
> +        if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> +            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +                if (dp_packet_hwol_is_tso(packet)) {
> +                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
>                  }
> -
> -                if (dp_packet_hwol_is_tunnel_geneve(packet)
> -                    && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) {
> -                        VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support",
> -                                     netdev_get_name(netdev));
> -                        COVERAGE_INC(netdev_geneve_tso_drops);
> -                        dp_packet_delete_batch(batch, true);
> -                        return false;
> +            }
> +        } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
> +                                     NETDEV_TX_GENEVE_TNL_TSO))) {

Only checking those flags does not seem to work for me.
But this seems due to a driver bug.

Example on my system, with CX-5:

# ovs-vsctl get interface dpdk0 status | tr ',' '\n'
{bus_info="bus_name=pci
 vendor_id=15b3
 device_id=1019"
 driver_name=mlx5_pci
 if_descr="DPDK 23.11.0 mlx5_pci"
 if_type="6"
 link_speed="100Gbps"
 max_hash_mac_addrs="0"
 max_mac_addrs="128"
 max_rx_pktlen="1518"
 max_rx_queues="1024"
 max_tx_queues="1024"
 max_vfs="0"
 max_vmdq_pools="0"
 min_rx_bufsize="32"
 n_rxq="1"
 n_txq="5"
 numa_id="1"
 port_no="2"
 rx-steering=rss
 rx_csum_offload="true"
 tx_geneve_tso_offload="true"
 ^^^

 tx_ip_csum_offload="true"
 tx_out_ip_csum_offload="true"
 tx_out_udp_csum_offload="false"
 ^^^

 tx_sctp_csum_offload="false"
 tx_tcp_csum_offload="true"
 tx_tcp_seg_offload="true"
 tx_udp_csum_offload="true"
 tx_vxlan_tso_offload="true"}
 ^^^

net/mlx5 is claiming (geneve|vxlan)_tso_offload while not supporting
outer udp checksum.

I added the hunk below for now:

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 3cdcf8f8d..077768832 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1413,14 +1413,16 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
                       netdev_get_name(&dev->up));
         }

-        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
+        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM &&
+            info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
             dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
         } else {
             VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
                       netdev_get_name(&dev->up));
         }

-        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
+        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM &&
+            info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
             dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
         } else {
             VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",


> +            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +                if (!dp_packet_hwol_is_tso(packet)) {
> +                    continue;
> +                }
> +                if (dp_packet_hwol_is_tunnel_vxlan(packet) ||
> +                    dp_packet_hwol_is_tunnel_geneve(packet)) {
> +                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
>                  }
> -                return netdev_send_tso(netdev, qid, batch, concurrent_txq);
>              }
>          }
>      }
David Marchand June 24, 2024, 11:49 a.m. UTC | #2
Some additional comments.

On Thu, Jun 13, 2024 at 4:54 AM Mike Pattrick <mkp@redhat.com> wrote:
> @@ -89,14 +96,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>      static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>      struct dp_packet_batch *curr_batch = *batches;
>      struct tcp_header *tcp_hdr;
> +    struct udp_header *tnl_hdr;
>      struct ip_header *ip_hdr;
> +    uint16_t inner_ip_id = 0;
> +    uint16_t outer_ip_id = 0;
>      struct dp_packet *seg;
> +    const char *data_pos;
>      uint16_t tcp_offset;
>      uint16_t tso_segsz;
>      uint32_t tcp_seq;
> -    uint16_t ip_id;
> +    bool outer_ipv4;
>      int hdr_len;
>      int seg_len;
> +    bool tnl;
>
>      tso_segsz = dp_packet_get_tso_segsz(p);
>      if (!tso_segsz) {
> @@ -105,20 +117,38 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>          return false;
>      }
>
> -    tcp_hdr = dp_packet_l4(p);
> -    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> -    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> -    hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
> -              + tcp_offset * 4;
> -    ip_id = 0;
> -    if (dp_packet_hwol_is_ipv4(p)) {
> +    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
> +            dp_packet_hwol_is_tunnel_geneve(p)) {
> +        data_pos =  dp_packet_get_inner_tcp_payload(p);
> +        outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p);
> +        tcp_hdr = dp_packet_inner_l4(p);
> +        ip_hdr = dp_packet_inner_l3(p);
> +        tnl = true;
> +
> +        if (outer_ipv4) {
> +            outer_ip_id = ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id);
> +        }
> +        if (dp_packet_hwol_is_ipv4(p)) {
> +            inner_ip_id = ntohs(ip_hdr->ip_id);
> +        }
> +    } else {
> +        data_pos = dp_packet_get_tcp_payload(p);
> +        outer_ipv4 = dp_packet_hwol_is_ipv4(p);
> +        tcp_hdr = dp_packet_l4(p);
>          ip_hdr = dp_packet_l3(p);
> -        ip_id = ntohs(ip_hdr->ip_id);
> +        tnl = false;
> +
> +        if (outer_ipv4) {
> +            outer_ip_id = ntohs(ip_hdr->ip_id);
> +        }
>      }
>
> +    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> +    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> +    hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p))
> +              + tcp_offset * 4;
>      const char *data_tail = (char *) dp_packet_tail(p)
>                              - dp_packet_l2_pad_size(p);
> -    const char *data_pos = dp_packet_get_tcp_payload(p);

Not a strong opinion, but the data_pos init could be kept here:
const char *data_pos = (char *) tcp_hdr + tcp_offset * 4;


>      int n_segs = dp_packet_gso_nr_segs(p);
>
>      for (int i = 0; i < n_segs; i++) {
> @@ -130,14 +160,35 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>          seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
>          data_pos += seg_len;
>
> +        if (tnl) {
> +            /* Update tunnel inner L3 header. */
> +            if (dp_packet_hwol_is_ipv4(seg)) {
> +                ip_hdr = dp_packet_inner_l3(seg);
> +                ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> +                                           dp_packet_inner_l4_size(seg));

ip_hdr->ip_tot_len = htons(dp_packet_inner_l3_size(seg)); is easier to
read (and more consistent with the ipv6 block that follows).


> +                ip_hdr->ip_id = htons(inner_ip_id);
> +                ip_hdr->ip_csum = 0;
> +                inner_ip_id++;
> +            } else {
> +                struct ovs_16aligned_ip6_hdr *ip6_hdr;
> +
> +                ip6_hdr = dp_packet_inner_l3(seg);
> +                ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
> +                    = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
> +            }
> +        }
> +
>          /* Update L3 header. */
> -        if (dp_packet_hwol_is_ipv4(seg)) {
> +        if (outer_ipv4) {
> +            uint16_t ip_tot_len;
>              ip_hdr = dp_packet_l3(seg);
> -            ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> -                                       dp_packet_l4_size(seg));
> -            ip_hdr->ip_id = htons(ip_id);
> +            tnl_hdr = dp_packet_l4(seg);
> +            ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg);

ip_hdr->ip_tot_len = htons(dp_packet_l3_size(seg));


> +            ip_hdr->ip_tot_len = htons(ip_tot_len);
> +            ip_hdr->ip_id = htons(outer_ip_id);
>              ip_hdr->ip_csum = 0;
> -            ip_id++;
> +            tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN);
> +            outer_ip_id++;
>          } else {
>              struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
>
David Marchand June 27, 2024, 2:04 p.m. UTC | #3
On Fri, Jun 21, 2024 at 5:07 PM David Marchand
<david.marchand@redhat.com> wrote:
> net/mlx5 is claiming (geneve|vxlan)_tso_offload while not supporting
> outer udp checksum.

This may be a set of capabilities we can use if, for example, no udp
checksum is requested.

How about adding:
# git diff
diff --git a/lib/netdev.c b/lib/netdev.c
index 1d59bbe5d..429c86a22 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -926,6 +926,17 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
                     return netdev_send_tso(netdev, qid, batch, concurrent_txq);
                 }
             }
+        } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
+            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+                if (!dp_packet_hwol_is_tso(packet) &&
+                    !dp_packet_hwol_is_tunnel_vxlan(packet) &&
+                    !dp_packet_hwol_is_tunnel_geneve(packet)) {
+                    continue;
+                }
+                if (dp_packet_hwol_is_outer_udp_cksum(packet)) {
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+                }
+            }
         }
     }

Here are some numbers on mlx5, with the udp length fix I mentioned
previously + this hunk above:

# Sw GSO when csum enabled
- ovs-vsctl del-port tunnel0 -- add-port br-int tunnel0 -- set
interface tunnel0 type=vxlan options:remote_ip=172.31.3.1
options:key=0 options:csum=true

Connecting to host 172.31.2.2, port 5201
Reverse mode, remote host 172.31.2.2 is sending
[  5] local 172.31.2.1 port 58748 connected to 172.31.2.2 port 5201
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec   580 MBytes  4.87 Gbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-1.04   sec   582 MBytes  4.70 Gbits/sec    0             sender
[  5]   0.00-1.00   sec   580 MBytes  4.87 Gbits/sec                  receiver

iperf Done.

# Hw GSO when csum disabled
- ovs-vsctl del-port tunnel0 -- add-port br-int tunnel0 -- set
interface tunnel0 type=vxlan options:remote_ip=172.31.3.1
options:key=0 options:csum=false

Connecting to host 172.31.2.2, port 5201
Reverse mode, remote host 172.31.2.2 is sending
[  5] local 172.31.2.1 port 39080 connected to 172.31.2.2 port 5201
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  1.50 GBytes  12.9 Gbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-1.04   sec  1.50 GBytes  12.4 Gbits/sec  213             sender
[  5]   0.00-1.00   sec  1.50 GBytes  12.9 Gbits/sec                  receiver

iperf Done.
David Marchand June 28, 2024, 8:50 a.m. UTC | #4
On Thu, Jun 27, 2024 at 4:04 PM David Marchand
<david.marchand@redhat.com> wrote:
> @@ -926,6 +926,17 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
>                      return netdev_send_tso(netdev, qid, batch, concurrent_txq);
>                  }
>              }
> +        } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
> +            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +                if (!dp_packet_hwol_is_tso(packet) &&
> +                    !dp_packet_hwol_is_tunnel_vxlan(packet) &&
> +                    !dp_packet_hwol_is_tunnel_geneve(packet)) {
> +                    continue;

Erm, less buggy:

+                if (!dp_packet_hwol_is_tso(packet) ||
+                    (!dp_packet_hwol_is_tunnel_vxlan(packet) &&
+                     !dp_packet_hwol_is_tunnel_geneve(packet))) {
+                    continue;
+                }

> +                }
> +                if (dp_packet_hwol_is_outer_udp_cksum(packet)) {
> +                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> +                }
> +            }
>          }
>      }
Mike Pattrick June 28, 2024, 7:51 p.m. UTC | #5
On Fri, Jun 28, 2024 at 4:50 AM David Marchand
<david.marchand@redhat.com> wrote:
>
> On Thu, Jun 27, 2024 at 4:04 PM David Marchand
> <david.marchand@redhat.com> wrote:
> > @@ -926,6 +926,17 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
> >                      return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> >                  }
> >              }
> > +        } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
> > +            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +                if (!dp_packet_hwol_is_tso(packet) &&
> > +                    !dp_packet_hwol_is_tunnel_vxlan(packet) &&
> > +                    !dp_packet_hwol_is_tunnel_geneve(packet)) {
> > +                    continue;
>
> Erm, less buggy:

Thanks for the review, David! These are some good ideas. I'll send in a
v3 with them included.


Cheers,
M

>
> +                if (!dp_packet_hwol_is_tso(packet) ||
> +                    (!dp_packet_hwol_is_tunnel_vxlan(packet) &&
> +                     !dp_packet_hwol_is_tunnel_geneve(packet))) {
> +                    continue;
> +                }
>
> > +                }
> > +                if (dp_packet_hwol_is_outer_udp_cksum(packet)) {
> > +                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> > +                }
> > +            }
> >          }
> >      }
>
>
> --
> David Marchand
>
Mike Pattrick July 4, 2024, 4:48 p.m. UTC | #6
On Fri, Jun 21, 2024 at 11:08 AM David Marchand
<david.marchand@redhat.com> wrote:
>
> Hi Mike,
>
> On Thu, Jun 13, 2024 at 4:54 AM Mike Pattrick <mkp@redhat.com> wrote:
> >
> > When sending packets that are flagged as requiring segmentation to an
> > interface that doens't support this feature, send the packet to the TSO
> > software fallback instead of dropping it.
> >
> > Signed-off-by: Mike Pattrick <mkp@redhat.com>
>
> I started testing this patch (still not finished), I have some
> questions and comments.
>
> >
> > ---
> > v2:
> >  - Fixed udp tunnel length
> >  - Added test that UDP headers are correct
> >  - Split inner and outer ip_id into different counters
> >  - Set tunnel flags in reset_tcp_seg
> >
> > Signed-off-by: Mike Pattrick <mkp@redhat.com>
> > ---
> >  lib/dp-packet-gso.c     | 89 +++++++++++++++++++++++++++++++++--------
> >  lib/dp-packet.h         | 34 ++++++++++++++++
> >  lib/netdev-native-tnl.c |  8 ++++
> >  lib/netdev.c            | 37 +++++++----------
> >  tests/system-traffic.at | 87 ++++++++++++++++++++++++++++++++++++++++
> >  5 files changed, 216 insertions(+), 39 deletions(-)
> >
> > diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> > index 847685ad9..dc43ad662 100644
> > --- a/lib/dp-packet-gso.c
> > +++ b/lib/dp-packet-gso.c
> > @@ -47,6 +47,8 @@ dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
> >      seg->l2_5_ofs = p->l2_5_ofs;
> >      seg->l3_ofs = p->l3_ofs;
> >      seg->l4_ofs = p->l4_ofs;
> > +    seg->inner_l3_ofs = p->inner_l3_ofs;
> > +    seg->inner_l4_ofs = p->inner_l4_ofs;
> >
> >      /* The protocol headers remain the same, so preserve hash and mark. */
> >      *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p);
> > @@ -71,7 +73,12 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
> >      const char *data_tail;
> >      const char *data_pos;
> >
> > -    data_pos = dp_packet_get_tcp_payload(p);
> > +    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
> > +        dp_packet_hwol_is_tunnel_geneve(p)) {
> > +        data_pos = dp_packet_get_inner_tcp_payload(p);
> > +    } else {
> > +        data_pos = dp_packet_get_tcp_payload(p);
> > +    }
> >      data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
> >
> >      return DIV_ROUND_UP(data_tail - data_pos, segsz);
> > @@ -89,14 +96,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> >      static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> >      struct dp_packet_batch *curr_batch = *batches;
> >      struct tcp_header *tcp_hdr;
> > +    struct udp_header *tnl_hdr;
> >      struct ip_header *ip_hdr;
> > +    uint16_t inner_ip_id = 0;
> > +    uint16_t outer_ip_id = 0;
> >      struct dp_packet *seg;
> > +    const char *data_pos;
> >      uint16_t tcp_offset;
> >      uint16_t tso_segsz;
> >      uint32_t tcp_seq;
> > -    uint16_t ip_id;
> > +    bool outer_ipv4;
> >      int hdr_len;
> >      int seg_len;
> > +    bool tnl;
> >
> >      tso_segsz = dp_packet_get_tso_segsz(p);
> >      if (!tso_segsz) {
> > @@ -105,20 +117,38 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> >          return false;
> >      }
> >
> > -    tcp_hdr = dp_packet_l4(p);
> > -    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> > -    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> > -    hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
> > -              + tcp_offset * 4;
> > -    ip_id = 0;
> > -    if (dp_packet_hwol_is_ipv4(p)) {
> > +    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
> > +            dp_packet_hwol_is_tunnel_geneve(p)) {
> > +        data_pos =  dp_packet_get_inner_tcp_payload(p);
> > +        outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p);
> > +        tcp_hdr = dp_packet_inner_l4(p);
> > +        ip_hdr = dp_packet_inner_l3(p);
> > +        tnl = true;
> > +
> > +        if (outer_ipv4) {
> > +            outer_ip_id = ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id);
> > +        }
> > +        if (dp_packet_hwol_is_ipv4(p)) {
> > +            inner_ip_id = ntohs(ip_hdr->ip_id);
> > +        }
> > +    } else {
> > +        data_pos = dp_packet_get_tcp_payload(p);
> > +        outer_ipv4 = dp_packet_hwol_is_ipv4(p);
> > +        tcp_hdr = dp_packet_l4(p);
> >          ip_hdr = dp_packet_l3(p);
> > -        ip_id = ntohs(ip_hdr->ip_id);
> > +        tnl = false;
> > +
> > +        if (outer_ipv4) {
> > +            outer_ip_id = ntohs(ip_hdr->ip_id);
> > +        }
> >      }
> >
> > +    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> > +    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> > +    hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p))
> > +              + tcp_offset * 4;
> >      const char *data_tail = (char *) dp_packet_tail(p)
> >                              - dp_packet_l2_pad_size(p);
> > -    const char *data_pos = dp_packet_get_tcp_payload(p);
> >      int n_segs = dp_packet_gso_nr_segs(p);
> >
> >      for (int i = 0; i < n_segs; i++) {
> > @@ -130,14 +160,35 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> >          seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
> >          data_pos += seg_len;
> >
> > +        if (tnl) {
> > +            /* Update tunnel inner L3 header. */
> > +            if (dp_packet_hwol_is_ipv4(seg)) {
> > +                ip_hdr = dp_packet_inner_l3(seg);
> > +                ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> > +                                           dp_packet_inner_l4_size(seg));
> > +                ip_hdr->ip_id = htons(inner_ip_id);
> > +                ip_hdr->ip_csum = 0;
> > +                inner_ip_id++;
> > +            } else {
> > +                struct ovs_16aligned_ip6_hdr *ip6_hdr;
> > +
> > +                ip6_hdr = dp_packet_inner_l3(seg);
> > +                ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
> > +                    = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
> > +            }
> > +        }
> > +
> >          /* Update L3 header. */
> > -        if (dp_packet_hwol_is_ipv4(seg)) {
> > +        if (outer_ipv4) {
> > +            uint16_t ip_tot_len;
> >              ip_hdr = dp_packet_l3(seg);
> > -            ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> > -                                       dp_packet_l4_size(seg));
> > -            ip_hdr->ip_id = htons(ip_id);
> > +            tnl_hdr = dp_packet_l4(seg);
> > +            ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg);
> > +            ip_hdr->ip_tot_len = htons(ip_tot_len);
> > +            ip_hdr->ip_id = htons(outer_ip_id);
> >              ip_hdr->ip_csum = 0;
> > -            ip_id++;
> > +            tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN);
>
> IIUC, this part is common for non-tunneled and tunneled packets but it
> does not make sense to update tnl_hdr->udp_len for non-tunneled
> packets.
> Plus, udp_len is not adjusted in the ipv6 case so the announced length
> is that of the original TSO frame.
>
> udp_len update should probably move in the if (tnl) { previous block.
> Something like:
> @@ -176,18 +177,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>                  ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
>                      = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
>              }
> +
> +            tnl_hdr = dp_packet_l4(seg);
> +            tnl_hdr->udp_len = htons(dp_packet_l4_size(seg));
>          }
>
>          /* Update L3 header. */
>
> > +            outer_ip_id++;
> >          } else {
> >              struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
> >
> > @@ -146,7 +197,11 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> >          }
> >
> >          /* Update L4 header. */
> > -        tcp_hdr = dp_packet_l4(seg);
> > +        if (tnl) {
> > +            tcp_hdr = dp_packet_inner_l4(seg);
> > +        } else {
> > +            tcp_hdr = dp_packet_l4(seg);
> > +        }
> >          put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
> >          tcp_seq += seg_len;
> >          if (OVS_LIKELY(i < (n_segs - 1))) {
> > diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> > index a75b1c5cd..af1ff088f 100644
> > --- a/lib/dp-packet.h
> > +++ b/lib/dp-packet.h
> > @@ -529,6 +529,16 @@ dp_packet_inner_l3(const struct dp_packet *b)
> >             : NULL;
> >  }
> >
> > +static inline size_t
> > +dp_packet_inner_l3_size(const struct dp_packet *b)
> > +{
> > +    return OVS_LIKELY(b->inner_l3_ofs != UINT16_MAX)
> > +           ? (const char *) dp_packet_tail(b)
> > +           - (const char *) dp_packet_inner_l3(b)
> > +           - dp_packet_l2_pad_size(b)
> > +           : 0;
> > +}
> > +
> >  static inline void *
> >  dp_packet_inner_l4(const struct dp_packet *b)
> >  {
> > @@ -563,6 +573,22 @@ dp_packet_get_tcp_payload(const struct dp_packet *b)
> >      return NULL;
> >  }
> >
> > +static inline const void *
> > +dp_packet_get_inner_tcp_payload(const struct dp_packet *b)
> > +{
> > +    size_t l4_size = dp_packet_inner_l4_size(b);
> > +
> > +    if (OVS_LIKELY(l4_size >= TCP_HEADER_LEN)) {
> > +        struct tcp_header *tcp = dp_packet_inner_l4(b);
> > +        int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
> > +
> > +        if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= l4_size)) {
> > +            return (const char *) tcp + tcp_len;
> > +        }
> > +    }
> > +    return NULL;
> > +}
> > +
> >  static inline uint32_t
> >  dp_packet_get_tcp_payload_length(const struct dp_packet *pkt)
> >  {
> > @@ -1320,6 +1346,14 @@ dp_packet_hwol_reset_tcp_seg(struct dp_packet *p)
> >          ol_flags |= DP_PACKET_OL_TX_IP_CKSUM;
> >      }
> >
> > +    if (ol_flags & (DP_PACKET_OL_TX_TUNNEL_VXLAN |
> > +                    DP_PACKET_OL_TX_TUNNEL_GENEVE)) {
> > +        if (ol_flags & DP_PACKET_OL_TX_OUTER_IPV4) {
> > +            ol_flags |= DP_PACKET_OL_TX_OUTER_IP_CKSUM;
> > +        }
> > +        ol_flags |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM;
> > +    }
> > +
> >      *dp_packet_ol_flags_ptr(p) = ol_flags;
> >  }
> >
> > diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
> > index 0f9f07f44..22cc858e7 100644
> > --- a/lib/netdev-native-tnl.c
> > +++ b/lib/netdev-native-tnl.c
> > @@ -308,6 +308,14 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED,
> >      if (l4_ofs != UINT16_MAX) {
> >          packet->inner_l4_ofs = l4_ofs + data->header_len;
> >      }
> > +
> > +    if (dp_packet_hwol_is_tso(packet)) {
> > +        uint16_t tso_segsz = dp_packet_get_tso_segsz(packet);
> > +        if (tso_segsz > data->header_len) {
> > +            tso_segsz -= data->header_len;
> > +            dp_packet_set_tso_segsz(packet, tso_segsz);
> > +        }
> > +    }
> >  }
> >
> >  static void *
>
> Is this hunk a fix valid even for hw-assisted tso?
> If so, I would make it a separate patch.

I just realized that I had forgotten to extract this into a separate
patch. This code is actually over a year old, but now that I look at it
with fresh eyes I think it might be more appropriate to adjust this in
push_ip_header, as the same concern with pushed headers potentially
exceeding the intended MTU post segmentation also exists with non-UDP
tunnels.
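
Roughly what I have in mind, as an untested sketch (the helper name and
call site are illustrative; only the dp_packet_* accessors are taken
from the existing code):

static void
netdev_tnl_adjust_tso_segsz(struct dp_packet *packet, uint16_t push_hdr_len)
{
    /* Illustrative sketch: shrink the requested TSO segment size by the
     * length of the tunnel header that was just pushed, so that the
     * segments produced after segmentation still fit the original MTU.
     * The idea is to call this from the common IP push path rather than
     * only from netdev_tnl_push_udp_header(). */
    if (dp_packet_hwol_is_tso(packet)) {
        uint16_t tso_segsz = dp_packet_get_tso_segsz(packet);

        if (tso_segsz > push_hdr_len) {
            dp_packet_set_tso_segsz(packet, tso_segsz - push_hdr_len);
        }
    }
}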

I'll send in a new version of this patch, but it shouldn't affect any
performance tests in progress.

Cheers,
M

>
>
> > diff --git a/lib/netdev.c b/lib/netdev.c
> > index f2d921ed6..1d59bbe5d 100644
> > --- a/lib/netdev.c
> > +++ b/lib/netdev.c
> > @@ -69,8 +69,6 @@ COVERAGE_DEFINE(netdev_received);
> >  COVERAGE_DEFINE(netdev_sent);
> >  COVERAGE_DEFINE(netdev_add_router);
> >  COVERAGE_DEFINE(netdev_get_stats);
> > -COVERAGE_DEFINE(netdev_vxlan_tso_drops);
> > -COVERAGE_DEFINE(netdev_geneve_tso_drops);
> >  COVERAGE_DEFINE(netdev_push_header_drops);
> >  COVERAGE_DEFINE(netdev_soft_seg_good);
> >  COVERAGE_DEFINE(netdev_soft_seg_drops);
> > @@ -910,28 +908,23 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
> >      struct dp_packet *packet;
> >      int error;
> >
> > -    if (userspace_tso_enabled() &&
> > -        !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> > -        DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > -            if (dp_packet_hwol_is_tso(packet)) {
> > -                if (dp_packet_hwol_is_tunnel_vxlan(packet)
> > -                    && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) {
> > -                        VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support",
> > -                                     netdev_get_name(netdev));
> > -                        COVERAGE_INC(netdev_vxlan_tso_drops);
> > -                        dp_packet_delete_batch(batch, true);
> > -                        return false;
> > +    if (userspace_tso_enabled()) {
> > +        if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> > +            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +                if (dp_packet_hwol_is_tso(packet)) {
> > +                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> >                  }
> > -
> > -                if (dp_packet_hwol_is_tunnel_geneve(packet)
> > -                    && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) {
> > -                        VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support",
> > -                                     netdev_get_name(netdev));
> > -                        COVERAGE_INC(netdev_geneve_tso_drops);
> > -                        dp_packet_delete_batch(batch, true);
> > -                        return false;
> > +            }
> > +        } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
> > +                                     NETDEV_TX_GENEVE_TNL_TSO))) {
>
> Only checking those flags does not seem to work for me.
> But this seems due to a driver bug.
>
> Example on my system, with CX-5:
>
> # ovs-vsctl get interface dpdk0 status | tr ',' '\n'
> {bus_info="bus_name=pci
>  vendor_id=15b3
>  device_id=1019"
>  driver_name=mlx5_pci
>  if_descr="DPDK 23.11.0 mlx5_pci"
>  if_type="6"
>  link_speed="100Gbps"
>  max_hash_mac_addrs="0"
>  max_mac_addrs="128"
>  max_rx_pktlen="1518"
>  max_rx_queues="1024"
>  max_tx_queues="1024"
>  max_vfs="0"
>  max_vmdq_pools="0"
>  min_rx_bufsize="32"
>  n_rxq="1"
>  n_txq="5"
>  numa_id="1"
>  port_no="2"
>  rx-steering=rss
>  rx_csum_offload="true"
>  tx_geneve_tso_offload="true"
>  ^^^
>
>  tx_ip_csum_offload="true"
>  tx_out_ip_csum_offload="true"
>  tx_out_udp_csum_offload="false"
>  ^^^
>
>  tx_sctp_csum_offload="false"
>  tx_tcp_csum_offload="true"
>  tx_tcp_seg_offload="true"
>  tx_udp_csum_offload="true"
>  tx_vxlan_tso_offload="true"}
>  ^^^
>
> net/mlx5 is claiming (geneve|vxlan)_tso_offload while not supporting
> outer udp checksum.
>
> I added the hunk below for now:
>
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 3cdcf8f8d..077768832 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -1413,14 +1413,16 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>                        netdev_get_name(&dev->up));
>          }
>
> -        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM &&
> +            info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
>              dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
>          } else {
>              VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
>                        netdev_get_name(&dev->up));
>          }
>
> -        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM &&
> +            info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
>              dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
>          } else {
>              VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
>
>
> > +            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +                if (!dp_packet_hwol_is_tso(packet)) {
> > +                    continue;
> > +                }
> > +                if (dp_packet_hwol_is_tunnel_vxlan(packet) ||
> > +                    dp_packet_hwol_is_tunnel_geneve(packet)) {
> > +                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> >                  }
> > -                return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> >              }
> >          }
> >      }
>
>
> --
> David Marchand
>

Patch

diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
index 847685ad9..dc43ad662 100644
--- a/lib/dp-packet-gso.c
+++ b/lib/dp-packet-gso.c
@@ -47,6 +47,8 @@  dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
     seg->l2_5_ofs = p->l2_5_ofs;
     seg->l3_ofs = p->l3_ofs;
     seg->l4_ofs = p->l4_ofs;
+    seg->inner_l3_ofs = p->inner_l3_ofs;
+    seg->inner_l4_ofs = p->inner_l4_ofs;
 
     /* The protocol headers remain the same, so preserve hash and mark. */
     *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p);
@@ -71,7 +73,12 @@  dp_packet_gso_nr_segs(struct dp_packet *p)
     const char *data_tail;
     const char *data_pos;
 
-    data_pos = dp_packet_get_tcp_payload(p);
+    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
+        dp_packet_hwol_is_tunnel_geneve(p)) {
+        data_pos = dp_packet_get_inner_tcp_payload(p);
+    } else {
+        data_pos = dp_packet_get_tcp_payload(p);
+    }
     data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
 
     return DIV_ROUND_UP(data_tail - data_pos, segsz);
@@ -89,14 +96,19 @@  dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
     struct dp_packet_batch *curr_batch = *batches;
     struct tcp_header *tcp_hdr;
+    struct udp_header *tnl_hdr;
     struct ip_header *ip_hdr;
+    uint16_t inner_ip_id = 0;
+    uint16_t outer_ip_id = 0;
     struct dp_packet *seg;
+    const char *data_pos;
     uint16_t tcp_offset;
     uint16_t tso_segsz;
     uint32_t tcp_seq;
-    uint16_t ip_id;
+    bool outer_ipv4;
     int hdr_len;
     int seg_len;
+    bool tnl;
 
     tso_segsz = dp_packet_get_tso_segsz(p);
     if (!tso_segsz) {
@@ -105,20 +117,38 @@  dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
         return false;
     }
 
-    tcp_hdr = dp_packet_l4(p);
-    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
-    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
-    hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
-              + tcp_offset * 4;
-    ip_id = 0;
-    if (dp_packet_hwol_is_ipv4(p)) {
+    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
+            dp_packet_hwol_is_tunnel_geneve(p)) {
+        data_pos =  dp_packet_get_inner_tcp_payload(p);
+        outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p);
+        tcp_hdr = dp_packet_inner_l4(p);
+        ip_hdr = dp_packet_inner_l3(p);
+        tnl = true;
+
+        if (outer_ipv4) {
+            outer_ip_id = ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id);
+        }
+        if (dp_packet_hwol_is_ipv4(p)) {
+            inner_ip_id = ntohs(ip_hdr->ip_id);
+        }
+    } else {
+        data_pos = dp_packet_get_tcp_payload(p);
+        outer_ipv4 = dp_packet_hwol_is_ipv4(p);
+        tcp_hdr = dp_packet_l4(p);
         ip_hdr = dp_packet_l3(p);
-        ip_id = ntohs(ip_hdr->ip_id);
+        tnl = false;
+
+        if (outer_ipv4) {
+            outer_ip_id = ntohs(ip_hdr->ip_id);
+        }
     }
 
+    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
+    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
+    hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p))
+              + tcp_offset * 4;
     const char *data_tail = (char *) dp_packet_tail(p)
                             - dp_packet_l2_pad_size(p);
-    const char *data_pos = dp_packet_get_tcp_payload(p);
     int n_segs = dp_packet_gso_nr_segs(p);
 
     for (int i = 0; i < n_segs; i++) {
@@ -130,14 +160,35 @@  dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
         seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
         data_pos += seg_len;
 
+        if (tnl) {
+            /* Update tunnel inner L3 header. */
+            if (dp_packet_hwol_is_ipv4(seg)) {
+                ip_hdr = dp_packet_inner_l3(seg);
+                ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
+                                           dp_packet_inner_l4_size(seg));
+                ip_hdr->ip_id = htons(inner_ip_id);
+                ip_hdr->ip_csum = 0;
+                inner_ip_id++;
+            } else {
+                struct ovs_16aligned_ip6_hdr *ip6_hdr;
+
+                ip6_hdr = dp_packet_inner_l3(seg);
+                ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
+                    = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
+            }
+        }
+
         /* Update L3 header. */
-        if (dp_packet_hwol_is_ipv4(seg)) {
+        if (outer_ipv4) {
+            uint16_t ip_tot_len;
             ip_hdr = dp_packet_l3(seg);
-            ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
-                                       dp_packet_l4_size(seg));
-            ip_hdr->ip_id = htons(ip_id);
+            tnl_hdr = dp_packet_l4(seg);
+            ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg);
+            ip_hdr->ip_tot_len = htons(ip_tot_len);
+            ip_hdr->ip_id = htons(outer_ip_id);
             ip_hdr->ip_csum = 0;
-            ip_id++;
+            tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN);
+            outer_ip_id++;
         } else {
             struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
 
@@ -146,7 +197,11 @@  dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
         }
 
         /* Update L4 header. */
-        tcp_hdr = dp_packet_l4(seg);
+        if (tnl) {
+            tcp_hdr = dp_packet_inner_l4(seg);
+        } else {
+            tcp_hdr = dp_packet_l4(seg);
+        }
         put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
         tcp_seq += seg_len;
         if (OVS_LIKELY(i < (n_segs - 1))) {
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index a75b1c5cd..af1ff088f 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -529,6 +529,16 @@  dp_packet_inner_l3(const struct dp_packet *b)
            : NULL;
 }
 
+static inline size_t
+dp_packet_inner_l3_size(const struct dp_packet *b)
+{
+    return OVS_LIKELY(b->inner_l3_ofs != UINT16_MAX)
+           ? (const char *) dp_packet_tail(b)
+           - (const char *) dp_packet_inner_l3(b)
+           - dp_packet_l2_pad_size(b)
+           : 0;
+}
+
 static inline void *
 dp_packet_inner_l4(const struct dp_packet *b)
 {
@@ -563,6 +573,22 @@  dp_packet_get_tcp_payload(const struct dp_packet *b)
     return NULL;
 }
 
+static inline const void *
+dp_packet_get_inner_tcp_payload(const struct dp_packet *b)
+{
+    size_t l4_size = dp_packet_inner_l4_size(b);
+
+    if (OVS_LIKELY(l4_size >= TCP_HEADER_LEN)) {
+        struct tcp_header *tcp = dp_packet_inner_l4(b);
+        int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+
+        if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= l4_size)) {
+            return (const char *) tcp + tcp_len;
+        }
+    }
+    return NULL;
+}
+
 static inline uint32_t
 dp_packet_get_tcp_payload_length(const struct dp_packet *pkt)
 {
@@ -1320,6 +1346,14 @@  dp_packet_hwol_reset_tcp_seg(struct dp_packet *p)
         ol_flags |= DP_PACKET_OL_TX_IP_CKSUM;
     }
 
+    if (ol_flags & (DP_PACKET_OL_TX_TUNNEL_VXLAN |
+                    DP_PACKET_OL_TX_TUNNEL_GENEVE)) {
+        if (ol_flags & DP_PACKET_OL_TX_OUTER_IPV4) {
+            ol_flags |= DP_PACKET_OL_TX_OUTER_IP_CKSUM;
+        }
+        ol_flags |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM;
+    }
+
     *dp_packet_ol_flags_ptr(p) = ol_flags;
 }
 
diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
index 0f9f07f44..22cc858e7 100644
--- a/lib/netdev-native-tnl.c
+++ b/lib/netdev-native-tnl.c
@@ -308,6 +308,14 @@  netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED,
     if (l4_ofs != UINT16_MAX) {
         packet->inner_l4_ofs = l4_ofs + data->header_len;
     }
+
+    if (dp_packet_hwol_is_tso(packet)) {
+        uint16_t tso_segsz = dp_packet_get_tso_segsz(packet);
+        if (tso_segsz > data->header_len) {
+            tso_segsz -= data->header_len;
+            dp_packet_set_tso_segsz(packet, tso_segsz);
+        }
+    }
 }
 
 static void *
diff --git a/lib/netdev.c b/lib/netdev.c
index f2d921ed6..1d59bbe5d 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -69,8 +69,6 @@  COVERAGE_DEFINE(netdev_received);
 COVERAGE_DEFINE(netdev_sent);
 COVERAGE_DEFINE(netdev_add_router);
 COVERAGE_DEFINE(netdev_get_stats);
-COVERAGE_DEFINE(netdev_vxlan_tso_drops);
-COVERAGE_DEFINE(netdev_geneve_tso_drops);
 COVERAGE_DEFINE(netdev_push_header_drops);
 COVERAGE_DEFINE(netdev_soft_seg_good);
 COVERAGE_DEFINE(netdev_soft_seg_drops);
@@ -910,28 +908,23 @@  netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
     struct dp_packet *packet;
     int error;
 
-    if (userspace_tso_enabled() &&
-        !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
-        DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
-            if (dp_packet_hwol_is_tso(packet)) {
-                if (dp_packet_hwol_is_tunnel_vxlan(packet)
-                    && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) {
-                        VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support",
-                                     netdev_get_name(netdev));
-                        COVERAGE_INC(netdev_vxlan_tso_drops);
-                        dp_packet_delete_batch(batch, true);
-                        return false;
+    if (userspace_tso_enabled()) {
+        if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
+            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+                if (dp_packet_hwol_is_tso(packet)) {
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
                 }
-
-                if (dp_packet_hwol_is_tunnel_geneve(packet)
-                    && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) {
-                        VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support",
-                                     netdev_get_name(netdev));
-                        COVERAGE_INC(netdev_geneve_tso_drops);
-                        dp_packet_delete_batch(batch, true);
-                        return false;
+            }
+        } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
+                                     NETDEV_TX_GENEVE_TNL_TSO))) {
+            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+                if (!dp_packet_hwol_is_tso(packet)) {
+                    continue;
+                }
+                if (dp_packet_hwol_is_tunnel_vxlan(packet) ||
+                    dp_packet_hwol_is_tunnel_geneve(packet)) {
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
                 }
-                return netdev_send_tso(netdev, qid, batch, concurrent_txq);
             }
         }
     }
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 3f1a15445..dd30b6bf5 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -351,6 +351,93 @@  OVS_WAIT_UNTIL([diff -q payload.bin udp_data])
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([datapath - tcp over vxlan tunnel with software fallback])
+AT_SKIP_IF([test $HAVE_NC = no])
+AT_SKIP_IF([test $HAVE_TCPDUMP = no])
+OVS_CHECK_VXLAN()
+
+dnl This test is only valid with tso. If the kernel segments the packets, the
+dnl packet lengths in the final test will be different.
+m4_ifndef([CHECK_SYSTEM_TSO], [AT_SKIP_IF(:)])
+
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-underlay])
+
+AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
+AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"])
+
+ADD_NAMESPACES(at_ns0)
+
+dnl Set up underlay link from host into the namespace using veth pair.
+ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24")
+AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"])
+AT_CHECK([ip link set dev br-underlay up])
+
+dnl Test the case where only one side has all checksum and tso offload disabled.
+AT_CHECK([ethtool -K ovs-p0 tso off], [0], [ignore], [ignore])
+AT_CHECK([ethtool -K ovs-p0 sg off], [0], [ignore], [ignore])
+
+dnl Reinitialize.
+AT_CHECK([ovs-vsctl del-port ovs-p0])
+AT_CHECK([ovs-vsctl add-port br-underlay ovs-p0])
+
+dnl Set up tunnel endpoints on OVS outside the namespace and with a native
+dnl linux device inside the namespace.
+ADD_OVS_TUNNEL([vxlan], [br0], [at_vxlan0], [172.31.1.1], [10.1.1.100/24])
+ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24],
+                  [id 0 dstport 4789])
+
+dnl First, check the underlay.
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Check that the tunnel is up.
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Start tcpdump to capture the encapsulated packets.
+OVS_DAEMONIZE([tcpdump -i ovs-p0 -w p0.pcap], [tcpdump.pid])
+sleep 1
+
+dnl Initialize the listener before it is needed.
+NETNS_DAEMONIZE([at_ns0], [nc -l 10.1.1.1 1234 > data2], [nc.pid])
+
+dnl Verify that ncat is ready.
+OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :1234])])
+
+dnl Large TCP transfer aimed towards ovs-p0, which has TSO disabled.
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null])
+AT_CHECK([nc $NC_EOF_OPT 10.1.1.1 1234 < payload.bin])
+
+dnl Wait until transfer completes before checking.
+OVS_WAIT_WHILE([kill -0 $(cat nc.pid)])
+AT_CHECK([diff -q payload.bin data2], [0])
+
+dnl Stop OVS and tcpdump and verify the results.
+OVS_TRAFFIC_VSWITCHD_STOP
+
+AT_CHECK([kill -15 $(cat tcpdump.pid)])
+
+OVS_WAIT_WHILE([kill -0 $(cat tcpdump.pid)])
+
+ovs-pcap p0.pcap
+
+dnl The exact number of packets sent will vary, but we check that the largest segments have the correct
+dnl lengths and certain other fields.
+AT_CHECK([test $(ovs-pcap p0.pcap | grep -Ec dnl
+"^.{24}0800"dnl Ethernet
+"450005aa....4000..11....ac1f0164ac1f0101"dnl IP(len=1450, DF, UDP, 172.31.1.100->172.31.1.1)
+"....12b505960000"dnl UDP(len=1430, dport=4789)
+"0800000000000000"dnl VXLAN(gpid=0, vni=0)
+".{24}0800"dnl Ethernet
+"45000578....4000..06....0a0101640a010101"dnl IP(len=1400, DF, TCP, 10.1.1.100->10.1.1.1)
+"....04d2............................0000"dnl TCP(dport=1234
+) -ge 20])
+
+AT_CLEANUP
+
 AT_SETUP([datapath - ping vlan over vxlan tunnel])
 OVS_CHECK_TUNNEL_TSO()
 OVS_CHECK_VXLAN()