Message ID | 20240613025401.4089319-1-mkp@redhat.com |
---|---|
State | Changes Requested, archived |
Headers | show |
Series | [ovs-dev,v2] Userspace: Software fallback for UDP encapsulated TCP segmentation. | expand |
Context | Check | Description |
---|---|---|
ovsrobot/apply-robot | success | apply and check: success |
ovsrobot/github-robot-_Build_and_Test | success | github build: passed |
ovsrobot/intel-ovs-compilation | success | test: success |
Hi Mike, On Thu, Jun 13, 2024 at 4:54 AM Mike Pattrick <mkp@redhat.com> wrote: > > When sending packets that are flagged as requiring segmentation to an > interface that doens't support this feature, send the packet to the TSO > software fallback instead of dropping it. > > Signed-off-by: Mike Pattrick <mkp@redhat.com> I started testing this patch (still not finished), I have some questions and comments. > > --- > v2: > - Fixed udp tunnel length > - Added test that UDP headers are correct > - Split inner and outer ip_id into different counters > - Set tunnel flags in reset_tcp_seg > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > --- > lib/dp-packet-gso.c | 89 +++++++++++++++++++++++++++++++++-------- > lib/dp-packet.h | 34 ++++++++++++++++ > lib/netdev-native-tnl.c | 8 ++++ > lib/netdev.c | 37 +++++++---------- > tests/system-traffic.at | 87 ++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 216 insertions(+), 39 deletions(-) > > diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c > index 847685ad9..dc43ad662 100644 > --- a/lib/dp-packet-gso.c > +++ b/lib/dp-packet-gso.c > @@ -47,6 +47,8 @@ dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len, > seg->l2_5_ofs = p->l2_5_ofs; > seg->l3_ofs = p->l3_ofs; > seg->l4_ofs = p->l4_ofs; > + seg->inner_l3_ofs = p->inner_l3_ofs; > + seg->inner_l4_ofs = p->inner_l4_ofs; > > /* The protocol headers remain the same, so preserve hash and mark. 
*/ > *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p); > @@ -71,7 +73,12 @@ dp_packet_gso_nr_segs(struct dp_packet *p) > const char *data_tail; > const char *data_pos; > > - data_pos = dp_packet_get_tcp_payload(p); > + if (dp_packet_hwol_is_tunnel_vxlan(p) || > + dp_packet_hwol_is_tunnel_geneve(p)) { > + data_pos = dp_packet_get_inner_tcp_payload(p); > + } else { > + data_pos = dp_packet_get_tcp_payload(p); > + } > data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p); > > return DIV_ROUND_UP(data_tail - data_pos, segsz); > @@ -89,14 +96,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); > struct dp_packet_batch *curr_batch = *batches; > struct tcp_header *tcp_hdr; > + struct udp_header *tnl_hdr; > struct ip_header *ip_hdr; > + uint16_t inner_ip_id = 0; > + uint16_t outer_ip_id = 0; > struct dp_packet *seg; > + const char *data_pos; > uint16_t tcp_offset; > uint16_t tso_segsz; > uint32_t tcp_seq; > - uint16_t ip_id; > + bool outer_ipv4; > int hdr_len; > int seg_len; > + bool tnl; > > tso_segsz = dp_packet_get_tso_segsz(p); > if (!tso_segsz) { > @@ -105,20 +117,38 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > return false; > } > > - tcp_hdr = dp_packet_l4(p); > - tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); > - tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); > - hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p)) > - + tcp_offset * 4; > - ip_id = 0; > - if (dp_packet_hwol_is_ipv4(p)) { > + if (dp_packet_hwol_is_tunnel_vxlan(p) || > + dp_packet_hwol_is_tunnel_geneve(p)) { > + data_pos = dp_packet_get_inner_tcp_payload(p); > + outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p); > + tcp_hdr = dp_packet_inner_l4(p); > + ip_hdr = dp_packet_inner_l3(p); > + tnl = true; > + > + if (outer_ipv4) { > + outer_ip_id = ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id); > + } > + if (dp_packet_hwol_is_ipv4(p)) { > + inner_ip_id = 
ntohs(ip_hdr->ip_id); > + } > + } else { > + data_pos = dp_packet_get_tcp_payload(p); > + outer_ipv4 = dp_packet_hwol_is_ipv4(p); > + tcp_hdr = dp_packet_l4(p); > ip_hdr = dp_packet_l3(p); > - ip_id = ntohs(ip_hdr->ip_id); > + tnl = false; > + > + if (outer_ipv4) { > + outer_ip_id = ntohs(ip_hdr->ip_id); > + } > } > > + tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); > + tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); > + hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p)) > + + tcp_offset * 4; > const char *data_tail = (char *) dp_packet_tail(p) > - dp_packet_l2_pad_size(p); > - const char *data_pos = dp_packet_get_tcp_payload(p); > int n_segs = dp_packet_gso_nr_segs(p); > > for (int i = 0; i < n_segs; i++) { > @@ -130,14 +160,35 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len); > data_pos += seg_len; > > + if (tnl) { > + /* Update tunnel inner L3 header. */ > + if (dp_packet_hwol_is_ipv4(seg)) { > + ip_hdr = dp_packet_inner_l3(seg); > + ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + > + dp_packet_inner_l4_size(seg)); > + ip_hdr->ip_id = htons(inner_ip_id); > + ip_hdr->ip_csum = 0; > + inner_ip_id++; > + } else { > + struct ovs_16aligned_ip6_hdr *ip6_hdr; > + > + ip6_hdr = dp_packet_inner_l3(seg); > + ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen > + = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr); > + } > + } > + > /* Update L3 header. 
*/ > - if (dp_packet_hwol_is_ipv4(seg)) { > + if (outer_ipv4) { > + uint16_t ip_tot_len; > ip_hdr = dp_packet_l3(seg); > - ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + > - dp_packet_l4_size(seg)); > - ip_hdr->ip_id = htons(ip_id); > + tnl_hdr = dp_packet_l4(seg); > + ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg); > + ip_hdr->ip_tot_len = htons(ip_tot_len); > + ip_hdr->ip_id = htons(outer_ip_id); > ip_hdr->ip_csum = 0; > - ip_id++; > + tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN); IIUC, this part is common for non-tunneled and tunneled packets but it does not make sense to update tnl_hdr->udp_len for non-tunneled packets. Plus, udp_len is not adjusted in the ipv6 case so the announced length is that of the original TSO frame. udp_len update should probably move in the if (tnl) { previous block. Something like: @@ -176,18 +177,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr); } + + tnl_hdr = dp_packet_l4(seg); + tnl_hdr->udp_len = htons(dp_packet_l4_size(seg)); } /* Update L3 header. */ > + outer_ip_id++; > } else { > struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg); > > @@ -146,7 +197,11 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > } > > /* Update L4 header. */ > - tcp_hdr = dp_packet_l4(seg); > + if (tnl) { > + tcp_hdr = dp_packet_inner_l4(seg); > + } else { > + tcp_hdr = dp_packet_l4(seg); > + } > put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq)); > tcp_seq += seg_len; > if (OVS_LIKELY(i < (n_segs - 1))) { > diff --git a/lib/dp-packet.h b/lib/dp-packet.h > index a75b1c5cd..af1ff088f 100644 > --- a/lib/dp-packet.h > +++ b/lib/dp-packet.h > @@ -529,6 +529,16 @@ dp_packet_inner_l3(const struct dp_packet *b) > : NULL; > } > > +static inline size_t > +dp_packet_inner_l3_size(const struct dp_packet *b) > +{ > + return OVS_LIKELY(b->inner_l3_ofs != UINT16_MAX) > + ? 
(const char *) dp_packet_tail(b) > + - (const char *) dp_packet_inner_l3(b) > + - dp_packet_l2_pad_size(b) > + : 0; > +} > + > static inline void * > dp_packet_inner_l4(const struct dp_packet *b) > { > @@ -563,6 +573,22 @@ dp_packet_get_tcp_payload(const struct dp_packet *b) > return NULL; > } > > +static inline const void * > +dp_packet_get_inner_tcp_payload(const struct dp_packet *b) > +{ > + size_t l4_size = dp_packet_inner_l4_size(b); > + > + if (OVS_LIKELY(l4_size >= TCP_HEADER_LEN)) { > + struct tcp_header *tcp = dp_packet_inner_l4(b); > + int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; > + > + if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= l4_size)) { > + return (const char *) tcp + tcp_len; > + } > + } > + return NULL; > +} > + > static inline uint32_t > dp_packet_get_tcp_payload_length(const struct dp_packet *pkt) > { > @@ -1320,6 +1346,14 @@ dp_packet_hwol_reset_tcp_seg(struct dp_packet *p) > ol_flags |= DP_PACKET_OL_TX_IP_CKSUM; > } > > + if (ol_flags & (DP_PACKET_OL_TX_TUNNEL_VXLAN | > + DP_PACKET_OL_TX_TUNNEL_GENEVE)) { > + if (ol_flags & DP_PACKET_OL_TX_OUTER_IPV4) { > + ol_flags |= DP_PACKET_OL_TX_OUTER_IP_CKSUM; > + } > + ol_flags |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM; > + } > + > *dp_packet_ol_flags_ptr(p) = ol_flags; > } > > diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c > index 0f9f07f44..22cc858e7 100644 > --- a/lib/netdev-native-tnl.c > +++ b/lib/netdev-native-tnl.c > @@ -308,6 +308,14 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, > if (l4_ofs != UINT16_MAX) { > packet->inner_l4_ofs = l4_ofs + data->header_len; > } > + > + if (dp_packet_hwol_is_tso(packet)) { > + uint16_t tso_segsz = dp_packet_get_tso_segsz(packet); > + if (tso_segsz > data->header_len) { > + tso_segsz -= data->header_len; > + dp_packet_set_tso_segsz(packet, tso_segsz); > + } > + } > } > > static void * Is this hunk a fix valid even for hw-assisted tso? If so, I would make it a separate patch. 
> diff --git a/lib/netdev.c b/lib/netdev.c > index f2d921ed6..1d59bbe5d 100644 > --- a/lib/netdev.c > +++ b/lib/netdev.c > @@ -69,8 +69,6 @@ COVERAGE_DEFINE(netdev_received); > COVERAGE_DEFINE(netdev_sent); > COVERAGE_DEFINE(netdev_add_router); > COVERAGE_DEFINE(netdev_get_stats); > -COVERAGE_DEFINE(netdev_vxlan_tso_drops); > -COVERAGE_DEFINE(netdev_geneve_tso_drops); > COVERAGE_DEFINE(netdev_push_header_drops); > COVERAGE_DEFINE(netdev_soft_seg_good); > COVERAGE_DEFINE(netdev_soft_seg_drops); > @@ -910,28 +908,23 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, > struct dp_packet *packet; > int error; > > - if (userspace_tso_enabled() && > - !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { > - DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > - if (dp_packet_hwol_is_tso(packet)) { > - if (dp_packet_hwol_is_tunnel_vxlan(packet) > - && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) { > - VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support", > - netdev_get_name(netdev)); > - COVERAGE_INC(netdev_vxlan_tso_drops); > - dp_packet_delete_batch(batch, true); > - return false; > + if (userspace_tso_enabled()) { > + if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + if (dp_packet_hwol_is_tso(packet)) { > + return netdev_send_tso(netdev, qid, batch, concurrent_txq); > } > - > - if (dp_packet_hwol_is_tunnel_geneve(packet) > - && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) { > - VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support", > - netdev_get_name(netdev)); > - COVERAGE_INC(netdev_geneve_tso_drops); > - dp_packet_delete_batch(batch, true); > - return false; > + } > + } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO | > + NETDEV_TX_GENEVE_TNL_TSO))) { Only checking those flags does not seem to work for me. But this seems due to a driver bug. 
Example on my system, with CX-5: # ovs-vsctl get interface dpdk0 status | tr ',' '\n' {bus_info="bus_name=pci vendor_id=15b3 device_id=1019" driver_name=mlx5_pci if_descr="DPDK 23.11.0 mlx5_pci" if_type="6" link_speed="100Gbps" max_hash_mac_addrs="0" max_mac_addrs="128" max_rx_pktlen="1518" max_rx_queues="1024" max_tx_queues="1024" max_vfs="0" max_vmdq_pools="0" min_rx_bufsize="32" n_rxq="1" n_txq="5" numa_id="1" port_no="2" rx-steering=rss rx_csum_offload="true" tx_geneve_tso_offload="true" ^^^ tx_ip_csum_offload="true" tx_out_ip_csum_offload="true" tx_out_udp_csum_offload="false" ^^^ tx_sctp_csum_offload="false" tx_tcp_csum_offload="true" tx_tcp_seg_offload="true" tx_udp_csum_offload="true" tx_vxlan_tso_offload="true"} ^^^ net/mlx5 is claiming (geneve|vxlan)_tso_offload while not supporting outer udp checksum. I added the hunk below for now: diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 3cdcf8f8d..077768832 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1413,14 +1413,16 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) netdev_get_name(&dev->up)); } - if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) { + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM && + info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) { dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD; } else { VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.", netdev_get_name(&dev->up)); } - if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) { + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM && + info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) { dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD; } else { VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.", > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + if (!dp_packet_hwol_is_tso(packet)) { > + continue; > + } > + if (dp_packet_hwol_is_tunnel_vxlan(packet) || > + dp_packet_hwol_is_tunnel_geneve(packet)) { > + return 
netdev_send_tso(netdev, qid, batch, concurrent_txq); > } > - return netdev_send_tso(netdev, qid, batch, concurrent_txq); > } > } > }
Some additional comments. On Thu, Jun 13, 2024 at 4:54 AM Mike Pattrick <mkp@redhat.com> wrote: > @@ -89,14 +96,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); > struct dp_packet_batch *curr_batch = *batches; > struct tcp_header *tcp_hdr; > + struct udp_header *tnl_hdr; > struct ip_header *ip_hdr; > + uint16_t inner_ip_id = 0; > + uint16_t outer_ip_id = 0; > struct dp_packet *seg; > + const char *data_pos; > uint16_t tcp_offset; > uint16_t tso_segsz; > uint32_t tcp_seq; > - uint16_t ip_id; > + bool outer_ipv4; > int hdr_len; > int seg_len; > + bool tnl; > > tso_segsz = dp_packet_get_tso_segsz(p); > if (!tso_segsz) { > @@ -105,20 +117,38 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > return false; > } > > - tcp_hdr = dp_packet_l4(p); > - tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); > - tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); > - hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p)) > - + tcp_offset * 4; > - ip_id = 0; > - if (dp_packet_hwol_is_ipv4(p)) { > + if (dp_packet_hwol_is_tunnel_vxlan(p) || > + dp_packet_hwol_is_tunnel_geneve(p)) { > + data_pos = dp_packet_get_inner_tcp_payload(p); > + outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p); > + tcp_hdr = dp_packet_inner_l4(p); > + ip_hdr = dp_packet_inner_l3(p); > + tnl = true; > + > + if (outer_ipv4) { > + outer_ip_id = ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id); > + } > + if (dp_packet_hwol_is_ipv4(p)) { > + inner_ip_id = ntohs(ip_hdr->ip_id); > + } > + } else { > + data_pos = dp_packet_get_tcp_payload(p); > + outer_ipv4 = dp_packet_hwol_is_ipv4(p); > + tcp_hdr = dp_packet_l4(p); > ip_hdr = dp_packet_l3(p); > - ip_id = ntohs(ip_hdr->ip_id); > + tnl = false; > + > + if (outer_ipv4) { > + outer_ip_id = ntohs(ip_hdr->ip_id); > + } > } > > + tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); > + tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); > + hdr_len = ((char *) 
tcp_hdr - (char *) dp_packet_eth(p)) > + + tcp_offset * 4; > const char *data_tail = (char *) dp_packet_tail(p) > - dp_packet_l2_pad_size(p); > - const char *data_pos = dp_packet_get_tcp_payload(p); Not a strong opinion but data_pos init could be kept here. const char *data_pos = (char *) tcp_hdr + tcp_offset * 4; > int n_segs = dp_packet_gso_nr_segs(p); > > for (int i = 0; i < n_segs; i++) { > @@ -130,14 +160,35 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len); > data_pos += seg_len; > > + if (tnl) { > + /* Update tunnel inner L3 header. */ > + if (dp_packet_hwol_is_ipv4(seg)) { > + ip_hdr = dp_packet_inner_l3(seg); > + ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + > + dp_packet_inner_l4_size(seg)); ip_hdr->ip_tot_len = htons(dp_packet_inner_l3_size(seg)); is easier to read (and more consistent with the ipv6 block that follows). > + ip_hdr->ip_id = htons(inner_ip_id); > + ip_hdr->ip_csum = 0; > + inner_ip_id++; > + } else { > + struct ovs_16aligned_ip6_hdr *ip6_hdr; > + > + ip6_hdr = dp_packet_inner_l3(seg); > + ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen > + = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr); > + } > + } > + > /* Update L3 header. */ > - if (dp_packet_hwol_is_ipv4(seg)) { > + if (outer_ipv4) { > + uint16_t ip_tot_len; > ip_hdr = dp_packet_l3(seg); > - ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + > - dp_packet_l4_size(seg)); > - ip_hdr->ip_id = htons(ip_id); > + tnl_hdr = dp_packet_l4(seg); > + ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg); ip_hdr->ip_tot_len = htons(dp_packet_l3_size(seg)); > + ip_hdr->ip_tot_len = htons(ip_tot_len); > + ip_hdr->ip_id = htons(outer_ip_id); > ip_hdr->ip_csum = 0; > - ip_id++; > + tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN); > + outer_ip_id++; > } else { > struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg); >
On Fri, Jun 21, 2024 at 5:07 PM David Marchand <david.marchand@redhat.com> wrote: > net/mlx5 is claiming (geneve|vxlan)_tso_offload while not supporting > outer udp checksum. This may be a set of capabilities we can use if, for example, no udp checksum is requested. How about adding: # git diff diff --git a/lib/netdev.c b/lib/netdev.c index 1d59bbe5d..429c86a22 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -926,6 +926,17 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, return netdev_send_tso(netdev, qid, batch, concurrent_txq); } } + } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) { + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (!dp_packet_hwol_is_tso(packet) && + !dp_packet_hwol_is_tunnel_vxlan(packet) && + !dp_packet_hwol_is_tunnel_geneve(packet)) { + continue; + } + if (dp_packet_hwol_is_outer_udp_cksum(packet)) { + return netdev_send_tso(netdev, qid, batch, concurrent_txq); + } + } } } Here are some numbers on mlx5, with the udp length fix I mentioned previously + this hunk above: # Sw GSO when csum enabled - ovs-vsctl del-port tunnel0 -- add-port br-int tunnel0 -- set interface tunnel0 type=vxlan options:remote_ip=172.31.3.1 options:key=0 options:csum=true Connecting to host 172.31.2.2, port 5201 Reverse mode, remote host 172.31.2.2 is sending [ 5] local 172.31.2.1 port 58748 connected to 172.31.2.2 port 5201 [ ID] Interval Transfer Bitrate [ 5] 0.00-1.00 sec 580 MBytes 4.87 Gbits/sec - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-1.04 sec 582 MBytes 4.70 Gbits/sec 0 sender [ 5] 0.00-1.00 sec 580 MBytes 4.87 Gbits/sec receiver iperf Done. 
# Hw GSO when csum disabled - ovs-vsctl del-port tunnel0 -- add-port br-int tunnel0 -- set interface tunnel0 type=vxlan options:remote_ip=172.31.3.1 options:key=0 options:csum=false Connecting to host 172.31.2.2, port 5201 Reverse mode, remote host 172.31.2.2 is sending [ 5] local 172.31.2.1 port 39080 connected to 172.31.2.2 port 5201 [ ID] Interval Transfer Bitrate [ 5] 0.00-1.00 sec 1.50 GBytes 12.9 Gbits/sec - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-1.04 sec 1.50 GBytes 12.4 Gbits/sec 213 sender [ 5] 0.00-1.00 sec 1.50 GBytes 12.9 Gbits/sec receiver iperf Done.
On Thu, Jun 27, 2024 at 4:04 PM David Marchand <david.marchand@redhat.com> wrote: > @@ -926,6 +926,17 @@ netdev_send(struct netdev *netdev, int qid, > struct dp_packet_batch *batch, > return netdev_send_tso(netdev, qid, batch, concurrent_txq); > } > } > + } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) { > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + if (!dp_packet_hwol_is_tso(packet) && > + !dp_packet_hwol_is_tunnel_vxlan(packet) && > + !dp_packet_hwol_is_tunnel_geneve(packet)) { > + continue; Erm, less buggy: + if (!dp_packet_hwol_is_tso(packet) || + (!dp_packet_hwol_is_tunnel_vxlan(packet) && + !dp_packet_hwol_is_tunnel_geneve(packet))) { + continue; + } > + } > + if (dp_packet_hwol_is_outer_udp_cksum(packet)) { > + return netdev_send_tso(netdev, qid, batch, concurrent_txq); > + } > + } > } > }
On Fri, Jun 28, 2024 at 4:50 AM David Marchand <david.marchand@redhat.com> wrote: > > On Thu, Jun 27, 2024 at 4:04 PM David Marchand > <david.marchand@redhat.com> wrote: > > @@ -926,6 +926,17 @@ netdev_send(struct netdev *netdev, int qid, > > struct dp_packet_batch *batch, > > return netdev_send_tso(netdev, qid, batch, concurrent_txq); > > } > > } > > + } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) { > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + if (!dp_packet_hwol_is_tso(packet) && > > + !dp_packet_hwol_is_tunnel_vxlan(packet) && > > + !dp_packet_hwol_is_tunnel_geneve(packet)) { > > + continue; > > Erm, less buggy: Thanks for the review David! These are some good ideas. I'll send in a v3 with them included. Cheers, M > > + if (!dp_packet_hwol_is_tso(packet) || > + (!dp_packet_hwol_is_tunnel_vxlan(packet) && > + !dp_packet_hwol_is_tunnel_geneve(packet))) { > + continue; > + } > > > + } > > + if (dp_packet_hwol_is_outer_udp_cksum(packet)) { > > + return netdev_send_tso(netdev, qid, batch, concurrent_txq); > > + } > > + } > > } > > } > > > -- > David Marchand >
On Fri, Jun 21, 2024 at 11:08 AM David Marchand <david.marchand@redhat.com> wrote: > > Hi Mike, > > On Thu, Jun 13, 2024 at 4:54 AM Mike Pattrick <mkp@redhat.com> wrote: > > > > When sending packets that are flagged as requiring segmentation to an > > interface that doens't support this feature, send the packet to the TSO > > software fallback instead of dropping it. > > > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > > I started testing this patch (still not finished), I have some > questions and comments. > > > > > --- > > v2: > > - Fixed udp tunnel length > > - Added test that UDP headers are correct > > - Split inner and outer ip_id into different counters > > - Set tunnel flags in reset_tcp_seg > > > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > > --- > > lib/dp-packet-gso.c | 89 +++++++++++++++++++++++++++++++++-------- > > lib/dp-packet.h | 34 ++++++++++++++++ > > lib/netdev-native-tnl.c | 8 ++++ > > lib/netdev.c | 37 +++++++---------- > > tests/system-traffic.at | 87 ++++++++++++++++++++++++++++++++++++++++ > > 5 files changed, 216 insertions(+), 39 deletions(-) > > > > diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c > > index 847685ad9..dc43ad662 100644 > > --- a/lib/dp-packet-gso.c > > +++ b/lib/dp-packet-gso.c > > @@ -47,6 +47,8 @@ dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len, > > seg->l2_5_ofs = p->l2_5_ofs; > > seg->l3_ofs = p->l3_ofs; > > seg->l4_ofs = p->l4_ofs; > > + seg->inner_l3_ofs = p->inner_l3_ofs; > > + seg->inner_l4_ofs = p->inner_l4_ofs; > > > > /* The protocol headers remain the same, so preserve hash and mark. 
*/ > > *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p); > > @@ -71,7 +73,12 @@ dp_packet_gso_nr_segs(struct dp_packet *p) > > const char *data_tail; > > const char *data_pos; > > > > - data_pos = dp_packet_get_tcp_payload(p); > > + if (dp_packet_hwol_is_tunnel_vxlan(p) || > > + dp_packet_hwol_is_tunnel_geneve(p)) { > > + data_pos = dp_packet_get_inner_tcp_payload(p); > > + } else { > > + data_pos = dp_packet_get_tcp_payload(p); > > + } > > data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p); > > > > return DIV_ROUND_UP(data_tail - data_pos, segsz); > > @@ -89,14 +96,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > > static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); > > struct dp_packet_batch *curr_batch = *batches; > > struct tcp_header *tcp_hdr; > > + struct udp_header *tnl_hdr; > > struct ip_header *ip_hdr; > > + uint16_t inner_ip_id = 0; > > + uint16_t outer_ip_id = 0; > > struct dp_packet *seg; > > + const char *data_pos; > > uint16_t tcp_offset; > > uint16_t tso_segsz; > > uint32_t tcp_seq; > > - uint16_t ip_id; > > + bool outer_ipv4; > > int hdr_len; > > int seg_len; > > + bool tnl; > > > > tso_segsz = dp_packet_get_tso_segsz(p); > > if (!tso_segsz) { > > @@ -105,20 +117,38 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > > return false; > > } > > > > - tcp_hdr = dp_packet_l4(p); > > - tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); > > - tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); > > - hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p)) > > - + tcp_offset * 4; > > - ip_id = 0; > > - if (dp_packet_hwol_is_ipv4(p)) { > > + if (dp_packet_hwol_is_tunnel_vxlan(p) || > > + dp_packet_hwol_is_tunnel_geneve(p)) { > > + data_pos = dp_packet_get_inner_tcp_payload(p); > > + outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p); > > + tcp_hdr = dp_packet_inner_l4(p); > > + ip_hdr = dp_packet_inner_l3(p); > > + tnl = true; > > + > > + if (outer_ipv4) { > > + outer_ip_id = 
ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id); > > + } > > + if (dp_packet_hwol_is_ipv4(p)) { > > + inner_ip_id = ntohs(ip_hdr->ip_id); > > + } > > + } else { > > + data_pos = dp_packet_get_tcp_payload(p); > > + outer_ipv4 = dp_packet_hwol_is_ipv4(p); > > + tcp_hdr = dp_packet_l4(p); > > ip_hdr = dp_packet_l3(p); > > - ip_id = ntohs(ip_hdr->ip_id); > > + tnl = false; > > + > > + if (outer_ipv4) { > > + outer_ip_id = ntohs(ip_hdr->ip_id); > > + } > > } > > > > + tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); > > + tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); > > + hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p)) > > + + tcp_offset * 4; > > const char *data_tail = (char *) dp_packet_tail(p) > > - dp_packet_l2_pad_size(p); > > - const char *data_pos = dp_packet_get_tcp_payload(p); > > int n_segs = dp_packet_gso_nr_segs(p); > > > > for (int i = 0; i < n_segs; i++) { > > @@ -130,14 +160,35 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > > seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len); > > data_pos += seg_len; > > > > + if (tnl) { > > + /* Update tunnel inner L3 header. */ > > + if (dp_packet_hwol_is_ipv4(seg)) { > > + ip_hdr = dp_packet_inner_l3(seg); > > + ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + > > + dp_packet_inner_l4_size(seg)); > > + ip_hdr->ip_id = htons(inner_ip_id); > > + ip_hdr->ip_csum = 0; > > + inner_ip_id++; > > + } else { > > + struct ovs_16aligned_ip6_hdr *ip6_hdr; > > + > > + ip6_hdr = dp_packet_inner_l3(seg); > > + ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen > > + = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr); > > + } > > + } > > + > > /* Update L3 header. 
*/ > > - if (dp_packet_hwol_is_ipv4(seg)) { > > + if (outer_ipv4) { > > + uint16_t ip_tot_len; > > ip_hdr = dp_packet_l3(seg); > > - ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + > > - dp_packet_l4_size(seg)); > > - ip_hdr->ip_id = htons(ip_id); > > + tnl_hdr = dp_packet_l4(seg); > > + ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg); > > + ip_hdr->ip_tot_len = htons(ip_tot_len); > > + ip_hdr->ip_id = htons(outer_ip_id); > > ip_hdr->ip_csum = 0; > > - ip_id++; > > + tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN); > > IIUC, this part is common for non-tunneled and tunneled packets but it > does not make sense to update tnl_hdr->udp_len for non-tunneled > packets. > Plus, udp_len is not adjusted in the ipv6 case so the announced length > is that of the original TSO frame. > > udp_len update should probably move in the if (tnl) { previous block. > Something like: > @@ -176,18 +177,19 @@ dp_packet_gso(struct dp_packet *p, struct > dp_packet_batch **batches) > ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen > = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr); > } > + > + tnl_hdr = dp_packet_l4(seg); > + tnl_hdr->udp_len = htons(dp_packet_l4_size(seg)); > } > > /* Update L3 header. */ > > > + outer_ip_id++; > > } else { > > struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg); > > > > @@ -146,7 +197,11 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) > > } > > > > /* Update L4 header. 
*/ > > - tcp_hdr = dp_packet_l4(seg); > > + if (tnl) { > > + tcp_hdr = dp_packet_inner_l4(seg); > > + } else { > > + tcp_hdr = dp_packet_l4(seg); > > + } > > put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq)); > > tcp_seq += seg_len; > > if (OVS_LIKELY(i < (n_segs - 1))) { > > diff --git a/lib/dp-packet.h b/lib/dp-packet.h > > index a75b1c5cd..af1ff088f 100644 > > --- a/lib/dp-packet.h > > +++ b/lib/dp-packet.h > > @@ -529,6 +529,16 @@ dp_packet_inner_l3(const struct dp_packet *b) > > : NULL; > > } > > > > +static inline size_t > > +dp_packet_inner_l3_size(const struct dp_packet *b) > > +{ > > + return OVS_LIKELY(b->inner_l3_ofs != UINT16_MAX) > > + ? (const char *) dp_packet_tail(b) > > + - (const char *) dp_packet_inner_l3(b) > > + - dp_packet_l2_pad_size(b) > > + : 0; > > +} > > + > > static inline void * > > dp_packet_inner_l4(const struct dp_packet *b) > > { > > @@ -563,6 +573,22 @@ dp_packet_get_tcp_payload(const struct dp_packet *b) > > return NULL; > > } > > > > +static inline const void * > > +dp_packet_get_inner_tcp_payload(const struct dp_packet *b) > > +{ > > + size_t l4_size = dp_packet_inner_l4_size(b); > > + > > + if (OVS_LIKELY(l4_size >= TCP_HEADER_LEN)) { > > + struct tcp_header *tcp = dp_packet_inner_l4(b); > > + int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; > > + > > + if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= l4_size)) { > > + return (const char *) tcp + tcp_len; > > + } > > + } > > + return NULL; > > +} > > + > > static inline uint32_t > > dp_packet_get_tcp_payload_length(const struct dp_packet *pkt) > > { > > @@ -1320,6 +1346,14 @@ dp_packet_hwol_reset_tcp_seg(struct dp_packet *p) > > ol_flags |= DP_PACKET_OL_TX_IP_CKSUM; > > } > > > > + if (ol_flags & (DP_PACKET_OL_TX_TUNNEL_VXLAN | > > + DP_PACKET_OL_TX_TUNNEL_GENEVE)) { > > + if (ol_flags & DP_PACKET_OL_TX_OUTER_IPV4) { > > + ol_flags |= DP_PACKET_OL_TX_OUTER_IP_CKSUM; > > + } > > + ol_flags |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM; > > + } > > + > > *dp_packet_ol_flags_ptr(p) = 
ol_flags; > > } > > > > diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c > > index 0f9f07f44..22cc858e7 100644 > > --- a/lib/netdev-native-tnl.c > > +++ b/lib/netdev-native-tnl.c > > @@ -308,6 +308,14 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, > > if (l4_ofs != UINT16_MAX) { > > packet->inner_l4_ofs = l4_ofs + data->header_len; > > } > > + > > + if (dp_packet_hwol_is_tso(packet)) { > > + uint16_t tso_segsz = dp_packet_get_tso_segsz(packet); > > + if (tso_segsz > data->header_len) { > > + tso_segsz -= data->header_len; > > + dp_packet_set_tso_segsz(packet, tso_segsz); > > + } > > + } > > } > > > > static void * > > Is this hunk a fix valid even for hw-assisted tso? > If so, I would make it a separate patch. I just realized that I had forgotten to extract this into a separate patch. This code is actually over a year old but now that I look at it with fresh eyes I think it might be more appropriate to adjust this in push_ip_header, as the same concern with pushed headers potentially exceeding the intended MTU post segmentation also exists with non-udp tunnels. I'll send in a new version of this patch, but it shouldn't affect any performance tests in progress. 
Cheers, M > > > > diff --git a/lib/netdev.c b/lib/netdev.c > > index f2d921ed6..1d59bbe5d 100644 > > --- a/lib/netdev.c > > +++ b/lib/netdev.c > > @@ -69,8 +69,6 @@ COVERAGE_DEFINE(netdev_received); > > COVERAGE_DEFINE(netdev_sent); > > COVERAGE_DEFINE(netdev_add_router); > > COVERAGE_DEFINE(netdev_get_stats); > > -COVERAGE_DEFINE(netdev_vxlan_tso_drops); > > -COVERAGE_DEFINE(netdev_geneve_tso_drops); > > COVERAGE_DEFINE(netdev_push_header_drops); > > COVERAGE_DEFINE(netdev_soft_seg_good); > > COVERAGE_DEFINE(netdev_soft_seg_drops); > > @@ -910,28 +908,23 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, > > struct dp_packet *packet; > > int error; > > > > - if (userspace_tso_enabled() && > > - !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { > > - DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > - if (dp_packet_hwol_is_tso(packet)) { > > - if (dp_packet_hwol_is_tunnel_vxlan(packet) > > - && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) { > > - VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support", > > - netdev_get_name(netdev)); > > - COVERAGE_INC(netdev_vxlan_tso_drops); > > - dp_packet_delete_batch(batch, true); > > - return false; > > + if (userspace_tso_enabled()) { > > + if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + if (dp_packet_hwol_is_tso(packet)) { > > + return netdev_send_tso(netdev, qid, batch, concurrent_txq); > > } > > - > > - if (dp_packet_hwol_is_tunnel_geneve(packet) > > - && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) { > > - VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support", > > - netdev_get_name(netdev)); > > - COVERAGE_INC(netdev_geneve_tso_drops); > > - dp_packet_delete_batch(batch, true); > > - return false; > > + } > > + } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO | > > + NETDEV_TX_GENEVE_TNL_TSO))) { > > Only checking those flags does not seem to work for me. > But this seems due to a driver bug. 
> > Example on my system, with CX-5: > > # ovs-vsctl get interface dpdk0 status | tr ',' '\n' > {bus_info="bus_name=pci > vendor_id=15b3 > device_id=1019" > driver_name=mlx5_pci > if_descr="DPDK 23.11.0 mlx5_pci" > if_type="6" > link_speed="100Gbps" > max_hash_mac_addrs="0" > max_mac_addrs="128" > max_rx_pktlen="1518" > max_rx_queues="1024" > max_tx_queues="1024" > max_vfs="0" > max_vmdq_pools="0" > min_rx_bufsize="32" > n_rxq="1" > n_txq="5" > numa_id="1" > port_no="2" > rx-steering=rss > rx_csum_offload="true" > tx_geneve_tso_offload="true" > ^^^ > > tx_ip_csum_offload="true" > tx_out_ip_csum_offload="true" > tx_out_udp_csum_offload="false" > ^^^ > > tx_sctp_csum_offload="false" > tx_tcp_csum_offload="true" > tx_tcp_seg_offload="true" > tx_udp_csum_offload="true" > tx_vxlan_tso_offload="true"} > ^^^ > > net/mlx5 is claiming (geneve|vxlan)_tso_offload while not supporting > outer udp checksum. > > I added the hunk below for now: > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c > index 3cdcf8f8d..077768832 100644 > --- a/lib/netdev-dpdk.c > +++ b/lib/netdev-dpdk.c > @@ -1413,14 +1413,16 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) > netdev_get_name(&dev->up)); > } > > - if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) { > + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM && > + info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) { > dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD; > } else { > VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.", > netdev_get_name(&dev->up)); > } > > - if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) { > + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM && > + info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) { > dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD; > } else { > VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.", > > > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + if (!dp_packet_hwol_is_tso(packet)) { > 
> + continue; > > + } > > + if (dp_packet_hwol_is_tunnel_vxlan(packet) || > > + dp_packet_hwol_is_tunnel_geneve(packet)) { > > + return netdev_send_tso(netdev, qid, batch, concurrent_txq); > > } > > - return netdev_send_tso(netdev, qid, batch, concurrent_txq); > > } > > } > > } > > > -- > David Marchand >
diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c index 847685ad9..dc43ad662 100644 --- a/lib/dp-packet-gso.c +++ b/lib/dp-packet-gso.c @@ -47,6 +47,8 @@ dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len, seg->l2_5_ofs = p->l2_5_ofs; seg->l3_ofs = p->l3_ofs; seg->l4_ofs = p->l4_ofs; + seg->inner_l3_ofs = p->inner_l3_ofs; + seg->inner_l4_ofs = p->inner_l4_ofs; /* The protocol headers remain the same, so preserve hash and mark. */ *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p); @@ -71,7 +73,12 @@ dp_packet_gso_nr_segs(struct dp_packet *p) const char *data_tail; const char *data_pos; - data_pos = dp_packet_get_tcp_payload(p); + if (dp_packet_hwol_is_tunnel_vxlan(p) || + dp_packet_hwol_is_tunnel_geneve(p)) { + data_pos = dp_packet_get_inner_tcp_payload(p); + } else { + data_pos = dp_packet_get_tcp_payload(p); + } data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p); return DIV_ROUND_UP(data_tail - data_pos, segsz); @@ -89,14 +96,19 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); struct dp_packet_batch *curr_batch = *batches; struct tcp_header *tcp_hdr; + struct udp_header *tnl_hdr; struct ip_header *ip_hdr; + uint16_t inner_ip_id = 0; + uint16_t outer_ip_id = 0; struct dp_packet *seg; + const char *data_pos; uint16_t tcp_offset; uint16_t tso_segsz; uint32_t tcp_seq; - uint16_t ip_id; + bool outer_ipv4; int hdr_len; int seg_len; + bool tnl; tso_segsz = dp_packet_get_tso_segsz(p); if (!tso_segsz) { @@ -105,20 +117,38 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) return false; } - tcp_hdr = dp_packet_l4(p); - tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); - tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); - hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p)) - + tcp_offset * 4; - ip_id = 0; - if (dp_packet_hwol_is_ipv4(p)) { + if (dp_packet_hwol_is_tunnel_vxlan(p) || + dp_packet_hwol_is_tunnel_geneve(p)) { + 
data_pos = dp_packet_get_inner_tcp_payload(p); + outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p); + tcp_hdr = dp_packet_inner_l4(p); + ip_hdr = dp_packet_inner_l3(p); + tnl = true; + + if (outer_ipv4) { + outer_ip_id = ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id); + } + if (dp_packet_hwol_is_ipv4(p)) { + inner_ip_id = ntohs(ip_hdr->ip_id); + } + } else { + data_pos = dp_packet_get_tcp_payload(p); + outer_ipv4 = dp_packet_hwol_is_ipv4(p); + tcp_hdr = dp_packet_l4(p); ip_hdr = dp_packet_l3(p); - ip_id = ntohs(ip_hdr->ip_id); + tnl = false; + + if (outer_ipv4) { + outer_ip_id = ntohs(ip_hdr->ip_id); + } } + tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); + tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); + hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p)) + + tcp_offset * 4; const char *data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p); - const char *data_pos = dp_packet_get_tcp_payload(p); int n_segs = dp_packet_gso_nr_segs(p); for (int i = 0; i < n_segs; i++) { @@ -130,14 +160,35 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len); data_pos += seg_len; + if (tnl) { + /* Update tunnel inner L3 header. */ + if (dp_packet_hwol_is_ipv4(seg)) { + ip_hdr = dp_packet_inner_l3(seg); + ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + + dp_packet_inner_l4_size(seg)); + ip_hdr->ip_id = htons(inner_ip_id); + ip_hdr->ip_csum = 0; + inner_ip_id++; + } else { + struct ovs_16aligned_ip6_hdr *ip6_hdr; + + ip6_hdr = dp_packet_inner_l3(seg); + ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen + = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr); + } + } + /* Update L3 header. 
*/ - if (dp_packet_hwol_is_ipv4(seg)) { + if (outer_ipv4) { + uint16_t ip_tot_len; ip_hdr = dp_packet_l3(seg); - ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + - dp_packet_l4_size(seg)); - ip_hdr->ip_id = htons(ip_id); + tnl_hdr = dp_packet_l4(seg); + ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg); + ip_hdr->ip_tot_len = htons(ip_tot_len); + ip_hdr->ip_id = htons(outer_ip_id); ip_hdr->ip_csum = 0; - ip_id++; + tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN); + outer_ip_id++; } else { struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg); @@ -146,7 +197,11 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) } /* Update L4 header. */ - tcp_hdr = dp_packet_l4(seg); + if (tnl) { + tcp_hdr = dp_packet_inner_l4(seg); + } else { + tcp_hdr = dp_packet_l4(seg); + } put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq)); tcp_seq += seg_len; if (OVS_LIKELY(i < (n_segs - 1))) { diff --git a/lib/dp-packet.h b/lib/dp-packet.h index a75b1c5cd..af1ff088f 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -529,6 +529,16 @@ dp_packet_inner_l3(const struct dp_packet *b) : NULL; } +static inline size_t +dp_packet_inner_l3_size(const struct dp_packet *b) +{ + return OVS_LIKELY(b->inner_l3_ofs != UINT16_MAX) + ? 
(const char *) dp_packet_tail(b) + - (const char *) dp_packet_inner_l3(b) + - dp_packet_l2_pad_size(b) + : 0; +} + static inline void * dp_packet_inner_l4(const struct dp_packet *b) { @@ -563,6 +573,22 @@ dp_packet_get_tcp_payload(const struct dp_packet *b) return NULL; } +static inline const void * +dp_packet_get_inner_tcp_payload(const struct dp_packet *b) +{ + size_t l4_size = dp_packet_inner_l4_size(b); + + if (OVS_LIKELY(l4_size >= TCP_HEADER_LEN)) { + struct tcp_header *tcp = dp_packet_inner_l4(b); + int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; + + if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= l4_size)) { + return (const char *) tcp + tcp_len; + } + } + return NULL; +} + static inline uint32_t dp_packet_get_tcp_payload_length(const struct dp_packet *pkt) { @@ -1320,6 +1346,14 @@ dp_packet_hwol_reset_tcp_seg(struct dp_packet *p) ol_flags |= DP_PACKET_OL_TX_IP_CKSUM; } + if (ol_flags & (DP_PACKET_OL_TX_TUNNEL_VXLAN | + DP_PACKET_OL_TX_TUNNEL_GENEVE)) { + if (ol_flags & DP_PACKET_OL_TX_OUTER_IPV4) { + ol_flags |= DP_PACKET_OL_TX_OUTER_IP_CKSUM; + } + ol_flags |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM; + } + *dp_packet_ol_flags_ptr(p) = ol_flags; } diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 0f9f07f44..22cc858e7 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -308,6 +308,14 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, if (l4_ofs != UINT16_MAX) { packet->inner_l4_ofs = l4_ofs + data->header_len; } + + if (dp_packet_hwol_is_tso(packet)) { + uint16_t tso_segsz = dp_packet_get_tso_segsz(packet); + if (tso_segsz > data->header_len) { + tso_segsz -= data->header_len; + dp_packet_set_tso_segsz(packet, tso_segsz); + } + } } static void * diff --git a/lib/netdev.c b/lib/netdev.c index f2d921ed6..1d59bbe5d 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -69,8 +69,6 @@ COVERAGE_DEFINE(netdev_received); COVERAGE_DEFINE(netdev_sent); COVERAGE_DEFINE(netdev_add_router); COVERAGE_DEFINE(netdev_get_stats); 
-COVERAGE_DEFINE(netdev_vxlan_tso_drops); -COVERAGE_DEFINE(netdev_geneve_tso_drops); COVERAGE_DEFINE(netdev_push_header_drops); COVERAGE_DEFINE(netdev_soft_seg_good); COVERAGE_DEFINE(netdev_soft_seg_drops); @@ -910,28 +908,23 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, struct dp_packet *packet; int error; - if (userspace_tso_enabled() && - !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { - DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { - if (dp_packet_hwol_is_tso(packet)) { - if (dp_packet_hwol_is_tunnel_vxlan(packet) - && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) { - VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support", - netdev_get_name(netdev)); - COVERAGE_INC(netdev_vxlan_tso_drops); - dp_packet_delete_batch(batch, true); - return false; + if (userspace_tso_enabled()) { + if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + return netdev_send_tso(netdev, qid, batch, concurrent_txq); } - - if (dp_packet_hwol_is_tunnel_geneve(packet) - && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) { - VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support", - netdev_get_name(netdev)); - COVERAGE_INC(netdev_geneve_tso_drops); - dp_packet_delete_batch(batch, true); - return false; + } + } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO | + NETDEV_TX_GENEVE_TNL_TSO))) { + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (!dp_packet_hwol_is_tso(packet)) { + continue; + } + if (dp_packet_hwol_is_tunnel_vxlan(packet) || + dp_packet_hwol_is_tunnel_geneve(packet)) { + return netdev_send_tso(netdev, qid, batch, concurrent_txq); } - return netdev_send_tso(netdev, qid, batch, concurrent_txq); } } } diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 3f1a15445..dd30b6bf5 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -351,6 +351,93 @@ OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - tcp over 
vxlan tunnel with software fallback]) +AT_SKIP_IF([test $HAVE_NC = no]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +OVS_CHECK_VXLAN() + +dnl This test is only valid with tso. If the kernel segments the packets, the +dnl packet lengths in the final test will be different. +m4_ifndef([CHECK_SYSTEM_TSO], [AT_SKIP_IF(:)]) + +OVS_TRAFFIC_VSWITCHD_START() +ADD_BR([br-underlay]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"]) + +ADD_NAMESPACES(at_ns0) + +dnl Set up underlay link from host into the namespace using veth pair. +ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24") +AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Test the case where only one side has all checksum and tso offload disabled. +AT_CHECK([ethtool -K ovs-p0 tso off], [0], [ignore], [ignore]) +AT_CHECK([ethtool -K ovs-p0 sg off], [0], [ignore], [ignore]) + +dnl Reinitialize. +AT_CHECK([ovs-vsctl del-port ovs-p0]) +AT_CHECK([ovs-vsctl add-port br-underlay ovs-p0]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. +ADD_OVS_TUNNEL([vxlan], [br0], [at_vxlan0], [172.31.1.1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [id 0 dstport 4789]) + +dnl First, check the underlay. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Check that the tunnel is up. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Start tcpdump to capture the encapsulated packets. +OVS_DAEMONIZE([tcpdump -i ovs-p0 -w p0.pcap], [tcpdump.pid]) +sleep 1 + +dnl Initialize the listener before it is needed. 
+NETNS_DAEMONIZE([at_ns0], [nc -l 10.1.1.1 1234 > data2], [nc.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :1234])]) + +dnl Large TCP transfer aimed towards ovs-p0, which has TSO disabled. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT 10.1.1.1 1234 < payload.bin]) + +dnl Wait until transfer completes before checking. +OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) +AT_CHECK([diff -q payload.bin data2], [0]) + +dnl Stop OVS and tcpdump and verify the results. +OVS_TRAFFIC_VSWITCHD_STOP + +AT_CHECK([kill -15 $(cat tcpdump.pid)]) + +OVS_WAIT_WHILE([kill -0 $(cat tcpdump.pid)]) + +ovs-pcap p0.pcap + +dnl The exact number of packets sent will vary, but we check that the largest segments have the correct +dnl lengths and certain other fields. +AT_CHECK([test $(ovs-pcap p0.pcap | grep -Ec dnl +"^.{24}0800"dnl Ethernet +"450005aa....4000..11....ac1f0164ac1f0101"dnl IP(len=1450, DF, UDP, 172.31.1.100->172.31.1.1) +"....12b505960000"dnl UDP(len=1430, dport=4789) +"0800000000000000"dnl VXLAN(gpid=0, vni=0) +".{24}0800"dnl Ethernet +"45000578....4000..06....0a0101640a010101"dnl IP(len=1400, DF, TCP, 10.1.1.100->10.1.1.1) +"....04d2............................0000"dnl TCP(dport=1234 +) -ge 20]) + +AT_CLEANUP + AT_SETUP([datapath - ping vlan over vxlan tunnel]) OVS_CHECK_TUNNEL_TSO() OVS_CHECK_VXLAN()