@@ -1300,6 +1300,14 @@ dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b)
*dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN;
}
+/* Clears the VXLAN and GENEVE Tx tunnel offloading marks of packet 'b'. */
+static inline void
+dp_packet_hwol_reset_tunnel(struct dp_packet *b)
+{
+ *dp_packet_ol_flags_ptr(b) &= ~(DP_PACKET_OL_TX_TUNNEL_VXLAN |
+ DP_PACKET_OL_TX_TUNNEL_GENEVE);
+}
+
/* Mark packet 'b' as a tunnel packet with outer IPv4 header. */
static inline void
dp_packet_hwol_set_tx_outer_ipv4(struct dp_packet *b)
@@ -115,6 +115,7 @@ COVERAGE_DEFINE(datapath_drop_lock_error);
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
+COVERAGE_DEFINE(datapath_drop_tunnel_tso_recirc);
COVERAGE_DEFINE(datapath_drop_recirc_error);
COVERAGE_DEFINE(datapath_drop_invalid_port);
COVERAGE_DEFINE(datapath_drop_invalid_bond);
@@ -8920,6 +8921,34 @@ static void
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets)
{
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ size_t i, size = dp_packet_batch_size(packets);
+ struct dp_packet *packet;
+
+ DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, packets) {
+ if (dp_packet_hwol_is_tunnel_geneve(packet) ||
+ dp_packet_hwol_is_tunnel_vxlan(packet)) {
+
+ if (dp_packet_hwol_is_tso(packet)) {
+ /* Can't perform GSO in the middle of a pipeline: drop it. */
+ COVERAGE_INC(datapath_drop_tunnel_tso_recirc);
+ dp_packet_delete(packet);
+ VLOG_WARN_RL(&rl, "Recirculating tunnel packets with "
+ "TSO is not supported");
+ continue;
+ }
+ /* Have to fix all the checksums before re-parsing, because the
+ * packet will be treated as having a single set of headers. */
+ dp_packet_ol_send_prepare(packet, 0);
+ /* This packet must not be marked with anything tunnel-related. */
+ dp_packet_hwol_reset_tunnel(packet);
+ /* Clear inner offsets. The other offsets are reset as collateral,
+ * but they will be re-initialized on re-parsing. */
+ dp_packet_reset_offsets(packet);
+ }
+ dp_packet_batch_refill(packets, packet, i);
+ }
+
dp_netdev_input__(pmd, packets, true, 0);
}
@@ -726,3 +726,93 @@ udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),1
OVS_VSWITCHD_STOP
AT_CLEANUP
+
+dnl This is a regression test for outer header checksum offloading
+dnl with recirculation.
+AT_SETUP([tunnel_push_pop_ipv6 - recirculation after encapsulation])
+
+OVS_VSWITCHD_START(
+ [add-port br0 p0 \
+ -- set Interface p0 type=dummy ofport_request=1 \
+ other-config:hwaddr=aa:55:aa:55:00:00])
+AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg])
+AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy])
+AT_CHECK([ovs-vsctl add-port int-br t2 \
+ -- set Interface t2 type=geneve \
+ options:remote_ip=2001:cafe::92 \
+ options:key=123 ofport_request=2])
+
+dnl Setup an IP address.
+AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/64], [0], [OK
+])
+dnl Checking that a local route for added IP was successfully installed.
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl
+Cached: 2001:cafe::/64 dev br0 SRC 2001:cafe::88 local
+])
+
+dnl Add a dp-hash selection group.
+AT_CHECK([ovs-ofctl add-group br0 \
+ 'group_id=1234,type=select,selection_method=dp_hash,bucket=weight=1,output:p0'])
+AT_CHECK([ovs-ofctl add-flow br0 in_port=br0,action=group:1234])
+AT_CHECK([ovs-ofctl add-flow br0 in_port=p0,action=normal])
+
+AT_CHECK([ovs-ofctl add-flow int-br action=normal])
+
+dnl This Neighbor Advertisement from p0 has two effects:
+dnl 1. The neighbor cache will learn that 2001:cafe::92 is at f8:bc:12:44:34:b6.
+dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0.
+AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl
+ 'recirc_id(0),in_port(1),dnl
+ eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),dnl
+ ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=58,tclass=0,hlimit=255,frag=no),dnl
+ icmpv6(type=136,code=0),dnl
+ nd(target=2001:cafe::92,sll=00:00:00:00:00:00,tll=f8:bc:12:44:34:b6)'
+])
+
+dnl Check that selection group is used in the trace.
+AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \
+ | grep -E 'tunnel|actions'], [0], [dnl
+ -> output to native tunnel
+ -> tunneling to 2001:cafe::92 via br0
+ -> tunneling from aa:55:aa:55:00:00 2001:cafe::88 to f8:bc:12:44:34:b6 2001:cafe::92
+Datapath actions: tnl_push(tnl_port(6081),header(size=70,type=5,dnl
+eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl
+ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl
+udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),dnl
+hash(l4(0)),recirc(0x1)
+])
+
+dnl Now check that the packet is actually encapsulated and delivered.
+AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap])
+
+packet=50540000000a5054000000091234
+eth=f8bc124434b6aa55aa55000086dd
+ip6=60000000001e11402001cafe0000000000000000000000882001cafe000000000000000000000092
+dnl Source port is based on a packet hash, so it may differ depending on the
+dnl compiler flags and CPU type. Same for UDP checksum. Masked with '....'.
+udp=....17c1001e....
+geneve=0000655800007b00
+encap=${eth}${ip6}${udp}${geneve}
+dnl Output to tunnel from an int-br internal port.
+dnl Checking that the packet arrived and it was correctly encapsulated.
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"])
+OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1])
+dnl Sending again to exercise the non-miss upcall path.
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"])
+OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2])
+
+dnl Finally, checking that the datapath flow is also correct.
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \
+ | strip_ufid | strip_used], [0], [dnl
+recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl
+eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl
+packets:1, bytes:14, used:0.0s, dnl
+actions:tnl_push(tnl_port(6081),header(size=70,type=5,dnl
+eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl
+ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl
+udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),dnl
+hash(l4(0)),recirc(0x2)
+])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
@@ -1163,3 +1163,92 @@ gre((flags=0x0,proto=0x6558))),out_port(2)),1
OVS_VSWITCHD_STOP
AT_CLEANUP
+
+dnl This is a regression test for outer header checksum offloading
+dnl with recirculation.
+AT_SETUP([tunnel_push_pop - recirculation after encapsulation])
+
+OVS_VSWITCHD_START(
+ [add-port br0 p0 \
+ -- set Interface p0 type=dummy ofport_request=1 \
+ other-config:hwaddr=aa:55:aa:55:00:00])
+AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg])
+AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy])
+AT_CHECK([ovs-vsctl add-port int-br t2 \
+ -- set Interface t2 type=geneve \
+ options:remote_ip=1.1.2.92 \
+ options:key=123 ofport_request=2])
+
+dnl Setup an IP address.
+AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK
+])
+dnl Checking that a local route for added IP was successfully installed.
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl
+Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local
+])
+
+dnl Add a dp-hash selection group.
+AT_CHECK([ovs-ofctl add-group br0 \
+ 'group_id=1234,type=select,selection_method=dp_hash,bucket=weight=1,output:p0'])
+AT_CHECK([ovs-ofctl add-flow br0 in_port=br0,action=group:1234])
+AT_CHECK([ovs-ofctl add-flow br0 in_port=p0,action=normal])
+
+AT_CHECK([ovs-ofctl add-flow int-br action=normal])
+
+dnl This ARP reply from p0 has two effects:
+dnl 1. The ARP cache will learn that 1.1.2.92 is at f8:bc:12:44:34:b6.
+dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0.
+AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl
+ 'recirc_id(0),in_port(1),dnl
+ eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),dnl
+ arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)'
+])
+
+dnl Check that selection group is used in the trace.
+AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \
+ | grep -E 'tunnel|actions'], [0], [dnl
+ -> output to native tunnel
+ -> tunneling to 1.1.2.92 via br0
+ -> tunneling from aa:55:aa:55:00:00 1.1.2.88 to f8:bc:12:44:34:b6 1.1.2.92
+Datapath actions: tnl_push(tnl_port(6081),header(size=50,type=5,dnl
+eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl
+ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl
+udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),dnl
+hash(l4(0)),recirc(0x1)
+])
+
+dnl Now check that the packet is actually encapsulated and delivered.
+AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap])
+
+packet=50540000000a5054000000091234
+eth=f8bc124434b6aa55aa5500000800
+ip4=450000320000400040113406010102580101025c
+dnl Source port is based on a packet hash, so it may differ depending on the
+dnl compiler flags and CPU type. Masked with '....'.
+udp=....17c1001e0000
+geneve=0000655800007b00
+encap=${eth}${ip4}${udp}${geneve}
+dnl Output to tunnel from an int-br internal port.
+dnl Checking that the packet arrived and it was correctly encapsulated.
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"])
+OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1])
+
+dnl Sending again to exercise the non-miss upcall path.
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"])
+OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2])
+
+dnl Finally, checking that the datapath flow is also correct.
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \
+ | strip_ufid | strip_used], [0], [dnl
+recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl
+eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl
+packets:1, bytes:14, used:0.0s, dnl
+actions:tnl_push(tnl_port(6081),header(size=50,type=5,dnl
+eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl
+ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl
+udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),dnl
+hash(l4(0)),recirc(0x2)
+])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
Recirculation involves re-parsing the packet from scratch and that process is not aware of multiple header levels nor the inner/outer offsets. So, it overwrites offsets with new ones from the outermost headers and sets offloading flags that change their meaning when the packet is marked for tunnel offloading. For example: 1. TCP packet enters OVS. 2. TCP packet gets encapsulated into UDP tunnel. 3. Recirculation happens. 4. Packet is re-parsed after recirculation with miniflow_extract() or similar function. 5. Packet is marked for UDP checksumming because we parse the outermost set of headers. But since it is tunneled, it means inner UDP checksumming. And that makes no sense, because the inner packet is TCP. This is causing packet drops due to malformed packets or even assertions and crashes in the code that is trying to fixup checksums for packets using incorrect metadata: SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior lib/packets.c:2061:15: runtime error: member access within null pointer of type 'struct udp_header' 0 0xbe5221 in packet_udp_complete_csum lib/packets.c:2061:15 1 0x7e5662 in dp_packet_ol_send_prepare lib/dp-packet.c:638:9 2 0x96ef89 in netdev_send lib/netdev.c:940:9 3 0x818e94 in dp_netdev_pmd_flush_output_on_port lib/dpif-netdev.c:5577:9 4 0x817606 in dp_netdev_pmd_flush_output_packets lib/dpif-netdev.c:5618:27 5 0x81cfa5 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5677:9 6 0x7eefe4 in dpif_netdev_run lib/dpif-netdev.c:7001:25 7 0x610e87 in type_run ofproto/ofproto-dpif.c:367:9 8 0x5b9e80 in ofproto_type_run ofproto/ofproto.c:1879:31 9 0x55bbb4 in bridge_run__ vswitchd/bridge.c:3281:9 10 0x558b6b in bridge_run vswitchd/bridge.c:3346:5 11 0x591dc5 in main vswitchd/ovs-vswitchd.c:130:9 12 0x172b89 in __libc_start_call_main (/lib64/libc.so.6+0x27b89) 13 0x172c4a in __libc_start_main@GLIBC_2.2.5 (/lib64/libc.so.6+0x27c4a) 14 0x47eff4 in _start (vswitchd/ovs-vswitchd+0x47eff4) Tests added for both IPv4 and IPv6 cases. 
Though the IPv6 test doesn't trigger the issue, it's better to have a symmetric test. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2024-March/053014.html Signed-off-by: Ilya Maximets <i.maximets@ovn.org> --- An alternative approach is also being discussed in the thread where the issue was reported. lib/dp-packet.h | 8 ++++ lib/dpif-netdev.c | 29 +++++++++++ tests/tunnel-push-pop-ipv6.at | 90 +++++++++++++++++++++++++++++++++++ tests/tunnel-push-pop.at | 89 ++++++++++++++++++++++++++++++++++ 4 files changed, 216 insertions(+)