@@ -6053,6 +6053,13 @@ build_ls_stateful_rec_pre_acls(
*
* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
* it to conntrack for tracking and defragmentation. */
+
+ /* Do not send tunnel-received ICMPv4/ICMPv6 "packet too big" to ct. */
+ ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110,
+ "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
+ " (ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
+ " flags.tunnel_rx == 1",
+ "next;", lflow_ref);
ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 100, "ip",
REGBIT_CONNTRACK_DEFRAG" = 1; next;",
lflow_ref);
@@ -6181,6 +6188,12 @@ build_pre_lb(struct ovn_datapath *od, const struct shash *meter_groups,
ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
"nd || nd_rs || nd_ra || mldv1 || mldv2",
"next;", lflow_ref);
+ /* Do not send icmp packet too big to conntrack in ingress */
+ ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
+ "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
+ "(ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
+ " flags.tunnel_rx == 1",
+ "next;", lflow_ref);
/* Do not send service monitor packets to conntrack. */
ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
@@ -933,7 +933,17 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | gre
AT_CLEANUP
-AT_SETUP([ovn multinode pmtu - logical switch - geneve])
+m4_define([PMTUD_SWITCH_TESTS],
+ [
+ AT_SETUP([ovn multinode pmtu - logical switch - $1])
+ encap=$1
+ if test "$encap" = "vxlan"; then
+ encap_sys="vxlan_sys"
+ overhead=50
+ else
+ encap_sys="genev_sys"
+ overhead=58
+ fi
# Check that ovn-fake-multinode setup is up and running
check_fake_multinode_setup
@@ -947,12 +957,12 @@ m_as ovn-chassis-2 ip link del sw0p2-p
# Reset geneve tunnels
for c in ovn-chassis-1 ovn-chassis-2 ovn-gw-1
do
- m_as $c ovs-vsctl set open . external-ids:ovn-encap-type=geneve
+ m_as $c ovs-vsctl set open . external-ids:ovn-encap-type=$encap
done
-OVS_WAIT_UNTIL([m_as ovn-chassis-1 ip link show | grep -q genev_sys])
-OVS_WAIT_UNTIL([m_as ovn-chassis-2 ip link show | grep -q genev_sys])
-OVS_WAIT_UNTIL([m_as ovn-gw-1 ip link show | grep -q genev_sys])
+OVS_WAIT_UNTIL([m_as ovn-chassis-1 ip link show | grep -q $encap_sys])
+OVS_WAIT_UNTIL([m_as ovn-chassis-2 ip link show | grep -q $encap_sys])
+OVS_WAIT_UNTIL([m_as ovn-gw-1 ip link show | grep -q $encap_sys])
# Test East-West switching
check multinode_nbctl ls-add sw0
@@ -1008,7 +1018,8 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.4 | F
# Change ptmu for the geneve tunnel
m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1
-M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 10.0.0.4 2>&1 | grep -q "message too long, mtu=1142"])
+mtu=$((1200 - overhead))
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 10.0.0.4 2>&1 | grep -q "message too long, mtu=$mtu"])
M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
@@ -1021,16 +1032,22 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 20.0.0.3 | F
# Change ptmu for the geneve tunnel
m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1100 dev eth1
-M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 | grep -q "message too long, mtu=1042"])
+mtu=$((1100 - overhead))
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 | grep -q "message too long, mtu=$mtu"])
M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
+mtu=$((1000 - overhead))
for i in $(seq 30); do
M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom | nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
done
-M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 942'])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q "mtu $mtu"])
AT_CLEANUP
+])
+
+PMTUD_SWITCH_TESTS(["geneve"])
+PMTUD_SWITCH_TESTS(["vxlan"])
@@ -4605,6 +4605,7 @@ check_stateful_flows() {
AT_CHECK([grep "ls_in_pre_lb" sw0flows | ovn_strip_lflows], [0], [dnl
table=??(ls_in_pre_lb ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_lb ), priority=100 , match=(ip), action=(reg0[[2]] = 1; next;)
+ table=??(ls_in_pre_lb ), priority=110 , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) ||(ip6 && icmp6.type == 2 && icmp6.code == 0))), action=(next;)
table=??(ls_in_pre_lb ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_lb ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_lb ), priority=110 , match=(ip && inport == "sw0-lr0"), action=(next;)
@@ -4675,6 +4676,7 @@ AT_CAPTURE_FILE([sw0flows])
AT_CHECK([grep "ls_in_pre_lb" sw0flows | ovn_strip_lflows], [0], [dnl
table=??(ls_in_pre_lb ), priority=0 , match=(1), action=(next;)
+ table=??(ls_in_pre_lb ), priority=110 , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) ||(ip6 && icmp6.type == 2 && icmp6.code == 0))), action=(next;)
table=??(ls_in_pre_lb ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_lb ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_lb ), priority=110 , match=(ip && inport == "sw0-lr0"), action=(next;)
@@ -8258,6 +8260,7 @@ AT_CHECK([ovn-sbctl dump-flows | grep -E "ls_.*_acl" | ovn_strip_lflows], [0], [
table=??(ls_in_acl_hint ), priority=7 , match=(ct.new && !ct.est), action=(reg0[[7]] = 1; reg0[[9]] = 1; next;)
table=??(ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_acl ), priority=100 , match=(ip), action=(reg0[[0]] = 1; next;)
+ table=??(ls_in_pre_acl ), priority=110 , match=((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -8442,6 +8445,7 @@ AT_CHECK([ovn-sbctl dump-flows | grep -E "ls_.*_acl" | ovn_strip_lflows], [0], [
table=??(ls_in_acl_hint ), priority=7 , match=(ct.new && !ct.est), action=(reg0[[7]] = 1; reg0[[9]] = 1; next;)
table=??(ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_acl ), priority=100 , match=(ip), action=(reg0[[0]] = 1; next;)
+ table=??(ls_in_pre_acl ), priority=110 , match=((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -8624,6 +8628,7 @@ AT_CHECK([ovn-sbctl dump-flows | grep -E "ls_.*_acl" | ovn_strip_lflows], [0], [
table=??(ls_in_acl_hint ), priority=7 , match=(ct.new && !ct.est), action=(reg0[[7]] = 1; reg0[[9]] = 1; next;)
table=??(ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_acl ), priority=100 , match=(ip), action=(reg0[[0]] = 1; next;)
+ table=??(ls_in_pre_acl ), priority=110 , match=((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -11772,6 +11777,7 @@ check ovn-nbctl acl-add pg_dgw to-lport 1003 "outport == @pg_dgw && ip4" allow-r
AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
table=??(ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_acl ), priority=100 , match=(ip), action=(reg0[[0]] = 1; next;)
+ table=??(ls_in_pre_acl ), priority=110 , match=((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(ip && inport == "S1-R1"), action=(next;)
@@ -11789,6 +11795,7 @@ check ovn-nbctl --wait=sb lsp-set-options S1-R1 router-port=R1-S1 enable_router_
AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
table=??(ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_acl ), priority=100 , match=(ip), action=(reg0[[0]] = 1; next;)
+ table=??(ls_in_pre_acl ), priority=110 , match=((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -11817,6 +11824,7 @@ check ovn-nbctl --wait=sb lsp-set-options S1-R1 router-port=R1-S1 enable_router_
AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
table=??(ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_acl ), priority=100 , match=(ip), action=(reg0[[0]] = 1; next;)
+ table=??(ls_in_pre_acl ), priority=110 , match=((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(ip && inport == "S1-R1"), action=(next;)
@@ -11834,6 +11842,7 @@ check ovn-nbctl --wait=sb lsp-set-options S1-R1 router-port=R1-S1 enable_router_
AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
table=??(ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_acl ), priority=100 , match=(ip), action=(reg0[[0]] = 1; next;)
+ table=??(ls_in_pre_acl ), priority=110 , match=((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -11848,6 +11857,7 @@ check ovn-nbctl --wait=sb lsp-set-options S1-R1 router-port=R1-S1
AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
table=??(ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=??(ls_in_pre_acl ), priority=100 , match=(ip), action=(reg0[[0]] = 1; next;)
+ table=??(ls_in_pre_acl ), priority=110 , match=((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(eth.mcast), action=(next;)
table=??(ls_in_pre_acl ), priority=110 , match=(ip && inport == "S1-R1"), action=(next;)
A) routed traffic and l3gw
B) non routed traffic and vxlan.

Routed traffic and l3gw
=======================
Configuration: p1 - LS1 - Router - LS2 - p2, with p1 on hv1, p2 on hv2 and
Router a l3gw chassis on hv2.
If p1 sends an over-mtu packet towards p2, this will cause an ICMP packet with
icmp_type=3, icmp_code=4 to be received from the tunnel interface. It will be
received back in LS1 from the LS1-router (l3gateway) port.
If some ACLs are present, the icmp packet was sent to ct, resulting in ct_inv
and the packet being dropped.

Before this patch and patch [1], l3gateway ports were ** sometimes **
considered as related lports. There was no code adding such ports to the
related_lports ssets, but l3gateway ports are sometimes created as patch ports
(which are related_lports) and then modified to l3gateway ports.
Until patch [1], such a port type modification did not cause the port to be
removed from the sset. Hence, remote l3gateway ports were sometimes considered
as related ports, and proper flows were installed to skip ct.

This means that before this patch and before patch [1]:
- After a recompute, "remote" l3gateway ports were not considered as related
  ports and the packet was dropped in the above use case.
- In I+P, if the l3gateway port was directly created (i.e. in one sb
  transaction, w/o going through a patch port), it was also not in the
  related_lports set, resulting in a similar packet drop in the above use case.

Patch [1] always considers "remote" l3gateway ports as non-related ports, in
both I+P and recompute cases. In that sense, it made the behaviour more
consistent: icmp type=3, code=4 was always dropped in this scenario.

With patch [1] and this patch, remote l3gateway ports are still non-related
ports, but ct is skipped for icmp type=3, code=4.

Non routed traffic and vxlan.
=============================
Configuration: p1 - S1 - p2 with p1 on hv1 and p2 on hv2, and vxlan tunnel.
If p1 sends an over-mtu packet towards p2, an icmp 'packet too big' (type=3,
code=4), generated by the kernel, is received on the tunnel interface.
It is handled by table CT_ZONE_LOOKUP with outport=p2 and inport=0. Hence
table CT_ZONE_LOOKUP will not load any ct_zone. Later on this packet is sent
to conntrack, resulting in the ct_inv flag being set and the packet being
dropped.

There is no need to send such an icmp 'packet too big' to ct in the ingress
datapath. This patch adds two flows to skip ct for such a packet.

[1] a680c96465cd

Fixes: a680c96465cd ("controller: Nonvif related lports handling.")
Fixes: 3faadc76ad71 ("northd: Fix pmtud for non routed traffic.")
Reported-at: https://issues.redhat.com/browse/FDP-685
Signed-off-by: Xavier Simonart <xsimonar@redhat.com>
---
 northd/northd.c     | 13 +++++++++++++
 tests/multinode.at  | 33 +++++++++++++++++++++++++--------
 tests/ovn-northd.at | 10 ++++++++++
 3 files changed, 48 insertions(+), 8 deletions(-)