diff mbox series

[ovs-dev,v2,7/7] northd: Fix pmtud related issues.

Message ID 20240723155444.2530187-8-xsimonar@redhat.com
State Accepted
Delegated to: Numan Siddique
Headers show
Series pmtud and related ports. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes fail github build: failed

Commit Message

Xavier Simonart July 23, 2024, 3:54 p.m. UTC
A) routed traffic and l3gw
B) non routed traffic and vxlan.

Routed traffic and l3gw
=======================
Configuration: p1 - LS1 - Router - LS2 - p2, with p1 on hv1, p2 on hv2 and Router
a l3gw chassis on hv2.
If p1 sends an over-mtu packet towards p2, this will cause an icmp packet (type=3, code=4)
to be received from the tunnel interface. It will be received back in LS1 from the
LS1-router (l3gateway) port.
If some ACLs are present, the icmp packet was sent to ct, resulting in ct_inv being set and the packet being dropped.

Before this patch and patch [1], l3gateway ports were ** sometimes ** considered as
related lports. There was no code adding such ports to related_lports ssets, but
L3gateway ports are sometimes created as patch ports (which are related_lports)
and then modified to l3gateway ports.
Until patch [1], such a port type modification did not cause the port to be removed
from the sset. Hence, remote l3gateway ports were sometimes considered as related ports,
and proper flows were installed to skip ct.

This means that before this patch and before patch [1]
- After a recompute, "remote" l3gateway ports were not considered as related
  ports, and the packet was dropped in the above use case.
- in I+P, if the l3gateway port was directly created (i.e. in one sb
  transaction, w/o going through a patch port), it was also not in the
  related_lports set, resulting in a similar packet drop in the above use case.

Patch [1] always considers "remote" l3gateway ports as non related ports, in both
I+P and recompute cases. In that sense, it made the behaviour more consistent: icmp
type=3, code=4 was always dropped in this scenario.

With patch [1] and this patch, remote l3gateway ports are still non related ports, but ct
is skipped for icmp type=3, code=4.

Non routed traffic and vxlan.
=============================
Configuration:  p1 - S1 - p2 with p1 on hv1 and p2 on hv2, and vxlan tunnel.

If p1 sends an over-mtu packet towards p2, an icmp 'packet too big' (type=3, code=4),
generated by the kernel, is received on the tunnel interface. It's being handled
by table CT_ZONE_LOOKUP with outport=p2 and inport=0. Hence table CT_ZONE_LOOKUP
will not load any ct_zone.
Later on this packet is sent to conntrack, resulting in ct_inv flag set and
packet being dropped.
There is no need to send such an icmp 'packet too big' to ct in the ingress datapath.
This patch adds two flows to skip ct for such a packet.

[1] a680c96465cd

Fixes: a680c96465cd ("controller: Nonvif related lports handling.")
Fixes: 3faadc76ad71 ("northd: Fix pmtud for non routed traffic.")
Reported-at: https://issues.redhat.com/browse/FDP-685

Signed-off-by: Xavier Simonart <xsimonar@redhat.com>

---
v2: updated failing test case.
---
 northd/northd.c     | 13 +++++++++++++
 tests/multinode.at  | 33 +++++++++++++++++++++++++--------
 tests/ovn-northd.at | 10 ++++++++++
 3 files changed, 48 insertions(+), 8 deletions(-)
diff mbox series

Patch

diff --git a/northd/northd.c b/northd/northd.c
index fe7753f82..f516effd1 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -6053,6 +6053,13 @@  build_ls_stateful_rec_pre_acls(
          *
          * 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
          * it to conntrack for tracking and defragmentation. */
+
+        /* We do not want icmp type=3 code=4 (packet too big) to go to ct */
+        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110,
+                      "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
+                      " (ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
+                      " flags.tunnel_rx == 1",
+                      "next;", lflow_ref);
         ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 100, "ip",
                       REGBIT_CONNTRACK_DEFRAG" = 1; next;",
                       lflow_ref);
@@ -6181,6 +6188,12 @@  build_pre_lb(struct ovn_datapath *od, const struct shash *meter_groups,
     ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
                   "nd || nd_rs || nd_ra || mldv1 || mldv2",
                   "next;", lflow_ref);
+    /* Do not send icmp packet too big to conntrack in ingress */
+    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
+                  "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
+                  "(ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
+                  " flags.tunnel_rx == 1",
+                  "next;", lflow_ref);
 
     /* Do not send service monitor packets to conntrack. */
     ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
diff --git a/tests/multinode.at b/tests/multinode.at
index 1e6eeb661..f616032ec 100644
--- a/tests/multinode.at
+++ b/tests/multinode.at
@@ -933,7 +933,17 @@  M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | gre
 
 AT_CLEANUP
 
-AT_SETUP([ovn multinode pmtu - logical switch - geneve])
+m4_define([PMTUD_SWITCH_TESTS],
+  [
+    AT_SETUP([ovn multinode pmtu - logical switch - $1])
+    encap=$1
+    if test "$encap" = "vxlan"; then
+      encap_sys="vxlan_sys"
+      overhead=50
+    else
+      encap_sys="genev_sys"
+      overhead=58
+    fi
 
 # Check that ovn-fake-multinode setup is up and running
 check_fake_multinode_setup
@@ -947,12 +957,12 @@  m_as ovn-chassis-2 ip link del sw0p2-p
 # Reset geneve tunnels
 for c in ovn-chassis-1 ovn-chassis-2 ovn-gw-1
 do
-    m_as $c ovs-vsctl set open . external-ids:ovn-encap-type=geneve
+    m_as $c ovs-vsctl set open . external-ids:ovn-encap-type=$encap
 done
 
-OVS_WAIT_UNTIL([m_as ovn-chassis-1 ip link show | grep -q genev_sys])
-OVS_WAIT_UNTIL([m_as ovn-chassis-2 ip link show | grep -q genev_sys])
-OVS_WAIT_UNTIL([m_as ovn-gw-1 ip link show | grep -q genev_sys])
+OVS_WAIT_UNTIL([m_as ovn-chassis-1 ip link show | grep -q $encap_sys])
+OVS_WAIT_UNTIL([m_as ovn-chassis-2 ip link show | grep -q $encap_sys])
+OVS_WAIT_UNTIL([m_as ovn-gw-1 ip link show | grep -q $encap_sys])
 
 # Test East-West switching
 check multinode_nbctl ls-add sw0
@@ -1008,7 +1018,8 @@  M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.4 | F
 
 # Change ptmu for the geneve tunnel
 m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1
-M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 10.0.0.4 2>&1 | grep -q "message too long, mtu=1142"])
+mtu=$((1200 - overhead))
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 10.0.0.4 2>&1 | grep -q "message too long, mtu=$mtu"])
 
 M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
 M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
@@ -1021,16 +1032,22 @@  M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 20.0.0.3 | F
 
 # Change ptmu for the geneve tunnel
 m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1100 dev eth1
-M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 | grep -q "message too long, mtu=1042"])
+mtu=$((1100 - overhead))
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 | grep -q "message too long, mtu=$mtu"])
 
 M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
 M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
 M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
 
 m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
+mtu=$((1000 - overhead))
 for i in $(seq 30); do
 M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom | nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
 done
-M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 942'])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q "mtu $mtu"])
 
 AT_CLEANUP
+])
+
+PMTUD_SWITCH_TESTS(["geneve"])
+PMTUD_SWITCH_TESTS(["vxlan"])
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index a389d1988..eb44b8ef2 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -4605,6 +4605,7 @@  check_stateful_flows() {
     AT_CHECK([grep "ls_in_pre_lb" sw0flows | ovn_strip_lflows], [0], [dnl
   table=??(ls_in_pre_lb       ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_lb       ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
+  table=??(ls_in_pre_lb       ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) ||(ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_lb       ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_lb       ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_lb       ), priority=110  , match=(ip && inport == "sw0-lr0"), action=(next;)
@@ -4675,6 +4676,7 @@  AT_CAPTURE_FILE([sw0flows])
 
 AT_CHECK([grep "ls_in_pre_lb" sw0flows | ovn_strip_lflows], [0], [dnl
   table=??(ls_in_pre_lb       ), priority=0    , match=(1), action=(next;)
+  table=??(ls_in_pre_lb       ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) ||(ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_lb       ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_lb       ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_lb       ), priority=110  , match=(ip && inport == "sw0-lr0"), action=(next;)
@@ -8258,6 +8260,7 @@  AT_CHECK([ovn-sbctl dump-flows | grep -E "ls_.*_acl" | ovn_strip_lflows], [0], [
   table=??(ls_in_acl_hint     ), priority=7    , match=(ct.new && !ct.est), action=(reg0[[7]] = 1; reg0[[9]] = 1; next;)
   table=??(ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=100  , match=(ip), action=(reg0[[0]] = 1; next;)
+  table=??(ls_in_pre_acl      ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -8442,6 +8445,7 @@  AT_CHECK([ovn-sbctl dump-flows | grep -E "ls_.*_acl" | ovn_strip_lflows], [0], [
   table=??(ls_in_acl_hint     ), priority=7    , match=(ct.new && !ct.est), action=(reg0[[7]] = 1; reg0[[9]] = 1; next;)
   table=??(ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=100  , match=(ip), action=(reg0[[0]] = 1; next;)
+  table=??(ls_in_pre_acl      ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -8624,6 +8628,7 @@  AT_CHECK([ovn-sbctl dump-flows | grep -E "ls_.*_acl" | ovn_strip_lflows], [0], [
   table=??(ls_in_acl_hint     ), priority=7    , match=(ct.new && !ct.est), action=(reg0[[7]] = 1; reg0[[9]] = 1; next;)
   table=??(ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=100  , match=(ip), action=(reg0[[0]] = 1; next;)
+  table=??(ls_in_pre_acl      ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -11772,6 +11777,7 @@  check ovn-nbctl acl-add pg_dgw to-lport 1003 "outport == @pg_dgw && ip4" allow-r
 AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
   table=??(ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=100  , match=(ip), action=(reg0[[0]] = 1; next;)
+  table=??(ls_in_pre_acl      ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(ip && inport == "S1-R1"), action=(next;)
@@ -11789,6 +11795,7 @@  check ovn-nbctl --wait=sb lsp-set-options S1-R1 router-port=R1-S1 enable_router_
 AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
   table=??(ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=100  , match=(ip), action=(reg0[[0]] = 1; next;)
+  table=??(ls_in_pre_acl      ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -11817,6 +11824,7 @@  check ovn-nbctl --wait=sb lsp-set-options S1-R1 router-port=R1-S1 enable_router_
 AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
   table=??(ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=100  , match=(ip), action=(reg0[[0]] = 1; next;)
+  table=??(ls_in_pre_acl      ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(ip && inport == "S1-R1"), action=(next;)
@@ -11834,6 +11842,7 @@  check ovn-nbctl --wait=sb lsp-set-options S1-R1 router-port=R1-S1 enable_router_
 AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
   table=??(ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=100  , match=(ip), action=(reg0[[0]] = 1; next;)
+  table=??(ls_in_pre_acl      ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(nd || nd_rs || nd_ra || mldv1 || mldv2 || (udp && udp.src == 546 && udp.dst == 547)), action=(next;)
@@ -11848,6 +11857,7 @@  check ovn-nbctl --wait=sb lsp-set-options S1-R1 router-port=R1-S1
 AT_CHECK([ovn-sbctl dump-flows S1 | grep pre_acl | ovn_strip_lflows], [0], [dnl
   table=??(ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=100  , match=(ip), action=(reg0[[0]] = 1; next;)
+  table=??(ls_in_pre_acl      ), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(eth.mcast), action=(next;)
   table=??(ls_in_pre_acl      ), priority=110  , match=(ip && inport == "S1-R1"), action=(next;)