@@ -104,6 +104,8 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpctl.h \
lib/dp-packet.h \
lib/dp-packet.c \
+ lib/dp-packet-gso.h \
+ lib/dp-packet-gso.c \
lib/dpdk.h \
lib/dpif-netdev-lookup.h \
lib/dpif-netdev-lookup.c \
new file mode 100644
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2021 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <config.h>
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "coverage.h"
+#include "csum.h"
+#include "dp-packet.h"
+#include "dp-packet-gso.h"
+#include "dpif-netdev.h"
+#include "openvswitch/compiler.h"
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/vlog.h"
+#include "packets.h"
+#include "util.h"
+
+VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
+/* Fixes up the IPv4 and TCP headers of the 'nb_segs' segments in 'pkts',
+ * each of which carries a copy of the headers of the original packet 'src'.
+ *
+ * Segment i gets:
+ *   - an IPv4 total length computed from its own L3 size,
+ *   - IPv4 ID = ID of 'src' + i (wrapping at 16 bits),
+ *   - TCP sequence number = sequence number of 'src' plus the TCP payload
+ *     bytes carried by segments 0..i-1,
+ *   - FIN and PSH cleared unless it is the last segment, so that the receiver
+ *     is not told the stream ended (or to flush) before the remaining data,
+ *   - freshly computed IPv4 header and TCP checksums.
+ *
+ * 'src' and every packet in 'pkts' must have their L3 and L4 offsets set;
+ * the headers are dereferenced without further checks. */
+static void
+update_ipv4_tcp_headers(const struct dp_packet *src, struct dp_packet **pkts,
+                        uint16_t nb_segs)
+{
+    const struct ip_header *src_ip = dp_packet_l3(src);
+    const struct tcp_header *src_tcp = dp_packet_l4(src);
+    uint32_t tcp_seq = ntohl(get_16aligned_be32(&src_tcp->tcp_seq));
+    uint16_t ipid = ntohs(src_ip->ip_id);
+    int i;
+
+    for (i = 0; i < nb_segs; i++) {
+        struct dp_packet *p = pkts[i];
+        struct ip_header *ip = dp_packet_l3(p);
+        struct tcp_header *tcp = dp_packet_l4(p);
+
+        ip->ip_tot_len = htons(dp_packet_l3_size(p));
+        ip->ip_id = htons(ipid);
+        ip->ip_csum = 0;
+        /* The checksum covers the whole IPv4 header, options included, so
+         * use the header length field rather than the fixed struct size. */
+        ip->ip_csum = csum(ip, IP_IHL(ip->ip_ihl_ver) * 4);
+
+        put_16aligned_be32(&tcp->tcp_seq, htonl(tcp_seq));
+        if (i < nb_segs - 1) {
+            /* Only the final segment may carry FIN and PSH. */
+            tcp->tcp_ctl &= ~htons(TCP_FIN | TCP_PSH);
+        }
+        /* Must come last: the TCP checksum covers the fields set above. */
+        packet_csum_tcpudp(p);
+
+        ipid += 1;
+        /* Advance by this segment's TCP payload length. */
+        tcp_seq += (const char *) dp_packet_tail(p)
+                   - (const char *) dp_packet_l4(p)
+                   - TCP_OFFSET(tcp->tcp_ctl) * 4;
+    }
+}
+
+/* Initializes the metadata of the freshly allocated segment 'dst' from the
+ * packet 'src' that is being segmented.
+ *
+ * Everything in struct dp_packet from member 'l2_pad_size' up to the end of
+ * the structure (l2_pad_size, l2_5_ofs, l3_ofs, l4_ofs, cutlen, packet_type
+ * and md) is copied verbatim.  The offload flags of 'dst' are then cleared,
+ * so the segment requests no offloads of its own. */
+static void
+hdr_segment_init(struct dp_packet *dst, const struct dp_packet *src)
+{
+    const size_t tail_ofs = offsetof(struct dp_packet, l2_pad_size);
+    const size_t tail_len = sizeof(struct dp_packet) - tail_ofs;
+
+    memcpy((char *) dst + tail_ofs, (const char *) src + tail_ofs, tail_len);
+
+    *dp_packet_ol_flags_ptr(dst) = 0;
+}
+
+/* Splits the data of 'p' that follows its first 'hdr_offset' bytes into
+ * chunks of at most 'pyld_unit_size' bytes.  Each chunk becomes a newly
+ * allocated dp_packet made of a copy of the first 'hdr_offset' bytes of 'p'
+ * followed by the chunk, with metadata set up by hdr_segment_init().  The new
+ * packets are stored in 'pout', which has room for 'nb_pout' entries.  'p'
+ * itself is not modified.
+ *
+ * Returns the number of segments stored in 'pout' (0 if 'p' holds no data
+ * past 'hdr_offset').  Returns -EINVAL if 'pyld_unit_size' is 0, or -ENOMEM
+ * if 'pout' is too small to hold every segment; on failure all segments
+ * created so far are freed and the contents of 'pout' are unspecified. */
+static int
+gso_do_segment(const struct dp_packet *p, uint16_t hdr_offset,
+               uint16_t pyld_unit_size, struct dp_packet **pout,
+               uint16_t nb_pout)
+{
+    const char *data = dp_packet_data(p);
+    size_t pkt_size = dp_packet_size(p);
+    /* size_t rather than uint16_t so that the offset cannot wrap. */
+    size_t pos = hdr_offset;
+    int nb_segs = 0;
+
+    if (!pyld_unit_size) {
+        /* A zero-sized unit could never consume the payload. */
+        return -EINVAL;
+    }
+
+    while (pos < pkt_size) {
+        size_t seg_size = pkt_size - pos;
+        struct dp_packet *pkt;
+
+        if (seg_size > pyld_unit_size) {
+            seg_size = pyld_unit_size;
+        }
+
+        /* Check for room before storing, not after, so that 'pout' is never
+         * written past its end. */
+        if (nb_segs >= nb_pout) {
+            VLOG_WARN_RL(&rl, "Not enough memory to process GSO.");
+            while (nb_segs > 0) {
+                dp_packet_delete(pout[--nb_segs]);
+            }
+            return -ENOMEM;
+        }
+
+        /* Create a new dp_packet, put payload, push header. */
+        pkt = dp_packet_new_with_headroom(seg_size, hdr_offset);
+        hdr_segment_init(pkt, p);
+        dp_packet_put(pkt, data + pos, seg_size);
+        dp_packet_push(pkt, data, hdr_offset);
+
+        pout[nb_segs++] = pkt;
+        pos += seg_size;
+    }
+
+    return nb_segs;
+}
+
+/* Segments the IPv4 TCP packet 'p' in software.
+ *
+ * 'gso_size' is the maximum size, in bytes, of each resulting frame with all
+ * headers included: the payload carried per segment is 'gso_size' minus the
+ * distance from the Ethernet header to the TCP payload of 'p'.  The segments
+ * are newly allocated and stored in 'pout', which has room for 'nb_pout'
+ * entries.  'p' is neither modified nor freed.
+ *
+ * Returns the number of segments stored in 'pout' on success.  Returns
+ * -EINVAL if 'p' is smaller than ETH_PAYLOAD_MAX, if its headers cannot be
+ * located, or if 'gso_size' leaves no room for payload; otherwise returns
+ * whatever gso_do_segment() reports. */
+int
+gso_tcp4_segment(struct dp_packet *p, uint16_t gso_size,
+                 struct dp_packet **pout, uint16_t nb_pout)
+{
+    const char *payload;
+    const char *eth;
+    size_t hdr_len;
+    int nb_segs;
+
+    if (OVS_UNLIKELY(dp_packet_size(p) < ETH_PAYLOAD_MAX)) {
+        VLOG_WARN_RL(&rl, "Packet size %u bytes too small for GSO.",
+                     dp_packet_size(p));
+        return -EINVAL;
+    }
+
+    /* NOTE(review): both accessors are assumed to return NULL when the
+     * corresponding header is missing -- confirm against dp-packet.h. */
+    eth = dp_packet_eth(p);
+    payload = dp_packet_get_tcp_payload(p);
+    if (OVS_UNLIKELY(!eth || !payload || payload <= eth)) {
+        VLOG_WARN_RL(&rl, "Cannot locate TCP payload for GSO.");
+        return -EINVAL;
+    }
+
+    hdr_len = payload - eth;
+    if (OVS_UNLIKELY(hdr_len >= gso_size)) {
+        /* 'gso_size - hdr_len' would wrap around or be zero. */
+        VLOG_WARN_RL(&rl, "GSO size %u too small for %u bytes of headers.",
+                     (unsigned int) gso_size, (unsigned int) hdr_len);
+        return -EINVAL;
+    }
+
+    /* hdr_len < gso_size <= UINT16_MAX, so the narrowing below is safe. */
+    nb_segs = gso_do_segment(p, hdr_len, gso_size - hdr_len, pout, nb_pout);
+    if (nb_segs > 0) {
+        /* Give every segment its own lengths, IDs, sequence numbers and
+         * checksums. */
+        update_ipv4_tcp_headers(p, pout, nb_segs);
+    }
+
+    return nb_segs;
+}
new file mode 100644
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DP_PACKET_GSO_H
+#define DP_PACKET_GSO_H 1
+
+#include <stdint.h>
+#include <stdbool.h>
+
+/* Forward declaration, so that this header can be included before
+ * "dp-packet.h" without the struct being scoped to a parameter list. */
+struct dp_packet;
+
+/* Segments IPv4 TCP packet 'p' into frames of at most 'gso_size' bytes,
+ * storing up to 'nb_pouts' newly allocated packets in 'pouts'.  Returns the
+ * number of segments produced, or a negative errno value on failure. */
+int gso_tcp4_segment(struct dp_packet *p, uint16_t gso_size,
+                     struct dp_packet **pouts, uint16_t nb_pouts);
+
+/* NOTE(review): declared for future UDP GSO support; no definition is
+ * visible alongside this declaration, so calling it will fail to link. */
+int gso_udp4_segment(struct dp_packet *p, uint16_t gso_size,
+                     struct dp_packet **pouts, uint16_t nb_pouts);
+
+#endif /* dp-packet-gso.h */
@@ -47,6 +47,7 @@
#include "ovs-numa.h"
#include "packets.h"
#include "socket-util.h"
+#include "userspace-tso.h"
#include "util.h"
#ifndef SOL_XDP
@@ -867,6 +868,7 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
FRAME_SIZE - FRAME_HEADROOM,
OVS_XDP_HEADROOM);
dp_packet_set_size(packet, len);
+ *dp_packet_ol_flags_ptr(packet) = 0;
/* Add packet into batch, increase batch->count. */
dp_packet_batch_add(batch, packet);
@@ -1187,6 +1189,10 @@ netdev_afxdp_construct(struct netdev *netdev)
dev->xsks = NULL;
dev->tx_locks = NULL;
+ if (userspace_tso_enabled()) {
+ netdev->ol_flags = 0;
+ }
+
netdev_request_reconfigure(netdev);
return 0;
}
@@ -34,6 +34,7 @@
#include "cmap.h"
#include "coverage.h"
#include "dpif.h"
+#include "dp-packet-gso.h"
#include "dp-packet.h"
#include "openvswitch/dynamic-string.h"
#include "fatal-signal.h"
@@ -797,7 +798,6 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
if (dp_packet_hwol_is_tso(packet)
&& !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
/* Fall back to GSO in software. */
- VLOG_ERR_BUF(errormsg, "No TSO support");
return false;
}
@@ -806,8 +806,8 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
if (dp_packet_hwol_l4_is_tcp(packet)) {
if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
/* Fall back to TCP csum in software. */
- VLOG_ERR_BUF(errormsg, "No TCP checksum support");
- return false;
+ packet_csum_tcpudp(packet);
+ return true;
}
} else if (dp_packet_hwol_l4_is_udp(packet)) {
if (!(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
@@ -835,7 +835,8 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
* otherwise either fall back to software implementation or drop it. */
static void
netdev_send_prepare_batch(const struct netdev *netdev,
- struct dp_packet_batch *batch)
+ struct dp_packet_batch *batch,
+ struct dp_packet_batch *gso_batch)
{
struct dp_packet *packet;
size_t i, size = dp_packet_batch_size(batch);
@@ -846,11 +847,16 @@ netdev_send_prepare_batch(const struct netdev *netdev,
if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) {
dp_packet_batch_refill(batch, packet, i);
} else {
- dp_packet_delete(packet);
- COVERAGE_INC(netdev_send_prepare_drops);
- VLOG_WARN_RL(&rl, "%s: Packet dropped: %s",
- netdev_get_name(netdev), errormsg);
- free(errormsg);
+ if (dp_packet_hwol_is_tso(packet) &&
+ !(netdev->ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
+ dp_packet_batch_add(gso_batch, packet);
+ } else {
+ dp_packet_delete(packet);
+ COVERAGE_INC(netdev_send_prepare_drops);
+ VLOG_WARN_RL(&rl, "%s: Packet dropped: %s",
+ netdev_get_name(netdev), errormsg);
+ free(errormsg);
+ }
}
}
}
@@ -884,17 +890,67 @@ int
netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
bool concurrent_txq)
{
-    int error;
+    struct dp_packet_batch *gso_batch_ptr;
+    struct dp_packet_batch gso_batch;
+    struct dp_packet **gso_pkts;
+    struct dp_packet *packet;
+    uint16_t gso_pkts_len;
+    int error = 0;
-    netdev_send_prepare_batch(netdev, batch);
-    if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) {
-        return 0;
+    /* Packets that need TSO on a device without TSO support are moved from
+     * 'batch' to 'gso_batch'; everything left in 'batch' can go out as is. */
+    dp_packet_batch_init(&gso_batch);
+    netdev_send_prepare_batch(netdev, batch, &gso_batch);
+
+    if (!dp_packet_batch_is_empty(batch)) {
+        error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
+        if (!error) {
+            COVERAGE_INC(netdev_sent);
+        }
     }
-    error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
-    if (!error) {
-        COVERAGE_INC(netdev_sent);
+    if (dp_packet_batch_is_empty(&gso_batch)) {
+        return error;
     }
+
+    /* One scratch array, reused for every packet and freed exactly once. */
+    gso_pkts_len = 2 * NETDEV_MAX_BURST;
+    gso_pkts = xmalloc(gso_pkts_len * sizeof *gso_pkts);
+
+    gso_batch_ptr = &gso_batch;
+    DP_PACKET_BATCH_FOR_EACH (i, packet, gso_batch_ptr) {
+        struct dp_packet_batch seg_batch;
+        uint16_t gso_size = 1000; /* How to decide gso_size? */
+        int nb_segs;
+        int j;
+
+        /* Signed, so that negative error returns are actually detected. */
+        nb_segs = gso_tcp4_segment(packet, gso_size, gso_pkts, gso_pkts_len);
+        if (nb_segs <= 0) {
+            VLOG_WARN("GSO tcp4 segment failed");
+            error = EINVAL;
+            break;
+        }
+
+        dp_packet_batch_init(&seg_batch);
+
+        /* 'j', not 'i': 'i' is the index of the enclosing batch iteration. */
+        for (j = 0; j < nb_segs; j++) {
+            dp_packet_batch_add(&seg_batch, gso_pkts[j]);
+
+            /* Flush whenever the batch fills up, and once more for the
+             * final, possibly partial, batch.
+             * NOTE(review): assumes send() consumes the packets handed to
+             * it, as the unconditional re-init below implies -- confirm. */
+            if (dp_packet_batch_is_full(&seg_batch) || j == nb_segs - 1) {
+                int seg_error;
+
+                seg_error = netdev->netdev_class->send(netdev, qid,
+                                                       &seg_batch,
+                                                       concurrent_txq);
+                if (!seg_error) {
+                    COVERAGE_INC(netdev_sent);
+                } else if (!error) {
+                    /* Report the first failure instead of masking it. */
+                    error = seg_error;
+                }
+                dp_packet_batch_init(&seg_batch);
+            }
+        }
+    }
+
+    /* The segments are copies, so the original oversized packets (including
+     * any left unprocessed after a failure) still have to be freed here. */
+    dp_packet_delete_batch(gso_batch_ptr, true);
+    free(gso_pkts);
+
return error;
}
@@ -1887,3 +1887,38 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6)
}
}
}
+
+/* Computes in software the TCP or UDP checksum of IPv4 packet 'p' and stores
+ * it in the L4 header.
+ *
+ * Does nothing if the Ethernet type of 'p' is not IPv4, if its L3 or L4
+ * offset is not set, if the L4 data is too short to hold the header, or if
+ * the IP protocol is neither TCP nor UDP.  The checksum covers the IPv4
+ * pseudo-header plus everything from the L4 header to the end of the L4
+ * data. */
+void
+packet_csum_tcpudp(struct dp_packet *p)
+{
+    struct eth_header *eth = dp_packet_eth(p);
+    struct ip_header *ip = dp_packet_l3(p);
+    void *l4 = dp_packet_l4(p);
+    uint32_t pseudo_hdr_csum;
+    size_t l4_size;
+
+    if (!eth || !ip || !l4 || eth->eth_type != htons(ETH_TYPE_IP)) {
+        return;
+    }
+
+    l4_size = dp_packet_l4_size(p);
+
+    if (ip->ip_proto == IPPROTO_TCP) {
+        struct tcp_header *tcp = l4;
+
+        if (l4_size < sizeof *tcp) {
+            return;
+        }
+        pseudo_hdr_csum = packet_csum_pseudoheader(ip);
+        /* The checksum field must be zero while the sum is computed. */
+        tcp->tcp_csum = 0;
+        tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
+                                                  tcp, l4_size));
+    } else if (ip->ip_proto == IPPROTO_UDP) {
+        struct udp_header *udp = l4;
+
+        if (l4_size < sizeof *udp) {
+            return;
+        }
+        pseudo_hdr_csum = packet_csum_pseudoheader(ip);
+        udp->udp_csum = 0;
+        udp->udp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
+                                                  udp, l4_size));
+        if (!udp->udp_csum) {
+            /* In UDP a zero field means "no checksum"; a computed zero is
+             * transmitted as all ones instead. */
+            udp->udp_csum = htons(0xffff);
+        }
+    }
+}
@@ -1635,6 +1635,7 @@ void packet_put_ra_prefix_opt(struct dp_packet *,
const ovs_be128 router_prefix);
uint32_t packet_csum_pseudoheader(const struct ip_header *);
void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
+void packet_csum_tcpudp(struct dp_packet *p);
#define DNS_HEADER_LEN 12
struct dns_header {
@@ -45,3 +45,35 @@ NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0],
OVS_TRAFFIC_VSWITCHD_STOP
AT_CLEANUP
+
+dnl p0 in at_ns0 sends TSO packets to ovs-p0, a system port on the OVS bridge.
+dnl ovs-p1 is attached to OVS as type=afxdp, so OVS must segment in software.
+AT_SETUP([AF_XDP - enable userspace TSO])
+AT_KEYWORDS([afxdp tso])
+OVS_TRAFFIC_VSWITCHD_START()
+
+AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:userspace-tso-enable=true])
+AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+dnl Create and add ovs-p0 as system port
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
+AT_CHECK([ovs-vsctl del-port ovs-p0])
+AT_CHECK([ovs-vsctl add-port br0 ovs-p0])
+dnl Enable tx offload at p0, so ovs-p0 sees TSO packets
+NS_CHECK_EXEC([at_ns0], [ethtool -K p0 tx on > /dev/null 2>&1])
+
+dnl Create and add ovs-p1 as afxdp port
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Send TCP traffic (which arrives at OVS as TSO packets) from at_ns0 to at_ns1.
+NETNS_DAEMONIZE([at_ns1], [iperf -s], [iperf.pid])
+NS_CHECK_EXEC([at_ns0], [iperf -c 10.1.1.2 -t1 1> /dev/null], [0])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
This patch adds GSO support for IPv4 TCP when userspace-tso is enabled. Tested using a veth sending a TSO packet to OVS, which segments it into smaller TCP segments and forwards them to a netdev-afxdp port in another namespace. Future work includes: 1. GSO for UDP, and IPv6 TCP/UDP GSO. 2. Tunnel GSO: VXLAN GSO, Geneve GSO, GRE GSO... Tested using $ make check-afxdp TESTSUITEFLAGS='3' or the script below: ovs-vsctl set Open_vSwitch . other_config:userspace-tso-enable=true ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev ip netns add at_ns0 ip link add p0 type veth peer name afxdp-p0 ip link set p0 netns at_ns0 ip link set dev afxdp-p0 up ovs-vsctl add-port br0 afxdp-p0 ip netns exec at_ns0 sh << NS_EXEC_HEREDOC ip addr add "10.1.1.1/24" dev p0 ip link set dev p0 up NS_EXEC_HEREDOC ip netns add at_ns1 ip link add p1 type veth peer name afxdp-p1 ip link set p1 netns at_ns1 ip link set dev afxdp-p1 up ovs-vsctl add-port br0 afxdp-p1 -- set int afxdp-p1 type=afxdp ip netns exec at_ns1 sh << NS_EXEC_HEREDOC ip addr add "10.1.1.2/24" dev p1 ip link set dev p1 up NS_EXEC_HEREDOC ip netns exec at_ns0 ping -c 3 -i .2 10.1.1.2 ip netns exec at_ns1 ethtool -K p1 tx off ip netns exec at_ns1 iperf -s ip netns exec at_ns0 iperf -c 10.1.1.2 -t1 Tested-at: https://github.com/williamtu/ovs-travis/actions/runs/553156643 Signed-off-by: William Tu <u9012063@gmail.com> --- lib/automake.mk | 2 + lib/dp-packet-gso.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/dp-packet-gso.h | 27 +++++++++ lib/netdev-afxdp.c | 6 ++ lib/netdev.c | 88 +++++++++++++++++++++++------ lib/packets.c | 35 ++++++++++++ lib/packets.h | 1 + tests/system-afxdp.at | 32 +++++++++++ 8 files changed, 324 insertions(+), 16 deletions(-) create mode 100644 lib/dp-packet-gso.c create mode 100644 lib/dp-packet-gso.h