diff mbox series

[ovs-dev] ovn: add geneve PMTUD support

Message ID 9d44c99689fe17899ef9228c7149379929af3e80.1701167801.git.lorenzo.bianconi@redhat.com
State Superseded
Headers show
Series [ovs-dev] ovn: add geneve PMTUD support | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Lorenzo Bianconi Nov. 28, 2023, 11:50 a.m. UTC
Introduce specif flows for E/W ICMPv{4,6} packets if tunnelled packets
do not fit path MTU. This patch enable PMTUD for East/West Geneve traffic.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2241711
Tested-by: Jaime Ruiz <jcaamano@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
---
 NEWS                    |  1 +
 controller/physical.c   | 44 ++++++++++++++++++++++++++
 northd/northd.c         | 24 ++++++++++++++
 northd/ovn-northd.8.xml |  8 +++++
 tests/multinode.at      | 69 +++++++++++++++++++++++++++++++++++++++++
 tests/ovn-northd.at     |  6 ++++
 6 files changed, 152 insertions(+)
diff mbox series

Patch

diff --git a/NEWS b/NEWS
index e10fb79dd..acb3b854f 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,7 @@  Post v23.09.0
     connection method and doesn't require additional probing.
     external_ids:ovn-openflow-probe-interval configuration option for
     ovn-controller no longer matters and is ignored.
+  - Enable PMTU discovery on geneve tunnels for E/W traffic.
 
 OVN v23.09.0 - 15 Sep 2023
 --------------------------
diff --git a/controller/physical.c b/controller/physical.c
index ba88e1d8b..dc6130745 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -2446,6 +2446,50 @@  physical_run(struct physical_ctx *p_ctx,
                         &ofpacts, hc_uuid);
     }
 
+    /* Add specif flows for E/W ICMPv{4,6} packets if tunnelled packets do not
+     * fit path MTU.
+     */
+    HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
+        if (tun->type == GENEVE) {
+            /* IPv4 */
+            struct match match = MATCH_CATCHALL_INITIALIZER;
+            match_set_in_port(&match, tun->ofport);
+            match_set_dl_type(&match, htons(ETH_TYPE_IP));
+            match_set_nw_proto(&match, IPPROTO_ICMP);
+            match_set_icmp_type(&match, 3);
+            match_set_icmp_code(&match, 4);
+
+            ofpbuf_clear(&ofpacts);
+            put_move(MFF_TUN_ID, 0,  MFF_LOG_DATAPATH, 0, 24, &ofpacts);
+            put_move(p_ctx->mff_ovn_geneve, 16, MFF_LOG_OUTPORT, 0, 16,
+                     &ofpacts);
+            put_move(p_ctx->mff_ovn_geneve, 0, MFF_LOG_INPORT, 0, 15,
+                     &ofpacts);
+            put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
+
+            ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match,
+                            &ofpacts, hc_uuid);
+            /* IPv6 */
+            match_init_catchall(&match);
+            match_set_in_port(&match, tun->ofport);
+            match_set_dl_type(&match, htons(ETH_TYPE_IPV6));
+            match_set_nw_proto(&match, IPPROTO_ICMPV6);
+            match_set_icmp_type(&match, 2);
+            match_set_icmp_code(&match, 0);
+
+            ofpbuf_clear(&ofpacts);
+            put_move(MFF_TUN_ID, 0,  MFF_LOG_DATAPATH, 0, 24, &ofpacts);
+            put_move(p_ctx->mff_ovn_geneve, 16, MFF_LOG_OUTPORT, 0, 16,
+                     &ofpacts);
+            put_move(p_ctx->mff_ovn_geneve, 0, MFF_LOG_INPORT, 0, 15,
+                     &ofpacts);
+            put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
+
+            ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match,
+                            &ofpacts, hc_uuid);
+        }
+    }
+
     /* Add VXLAN specific rules to transform port keys
      * from 12 bits to 16 bits used elsewhere. */
     HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
diff --git a/northd/northd.c b/northd/northd.c
index 29245d633..07dffb15a 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -12765,6 +12765,28 @@  build_lrouter_force_snat_flows(struct hmap *lflows, struct ovn_datapath *od,
     ds_destroy(&actions);
 }
 
+/* Following flows are used to manage traffic redirected by the kernel
+ * (e.g. ICMP errors packets) that enter the cluster from the geneve ports
+ */
+static void
+build_lrouter_redirected_traffic_flows(
+        struct ovn_port *op, struct hmap *lflows,
+        struct ds *match, struct ds *actions)
+{
+    ovs_assert(op->nbrp);
+    if (!is_l3dgw_port(op)) {
+        return;
+    }
+
+    ds_clear(match);
+    ds_put_format(match, "inport == %s && !is_chassis_resident(%s)",
+                  op->cr_port->json_key, op->cr_port->json_key);
+    ds_clear(actions);
+    ds_put_format(actions, "inport = %s; next;", op->json_key);
+    ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 120,
+                  ds_cstr(match), ds_cstr(actions));
+}
+
 static void
 build_lrouter_force_snat_flows_op(struct ovn_port *op,
                                   struct hmap *lflows,
@@ -16168,6 +16190,8 @@  build_lswitch_and_lrouter_iterate_by_lrp(struct ovn_port *op,
                                 &lsi->match, &lsi->actions, lsi->meter_groups);
     build_lrouter_force_snat_flows_op(op, lsi->lflows, &lsi->match,
                                       &lsi->actions);
+    build_lrouter_redirected_traffic_flows(op, lsi->lflows, &lsi->match,
+                                           &lsi->actions);
 }
 
 static void *
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 97718821f..972854684 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -2463,6 +2463,14 @@  output;
           (LBs, NAT).
         </p>
 
+        <p>
+          For each gateway port <var>GW</var> on a distributed logical router
+          a priority-120 flow that matches <code>inport == <var>cr-GW</var>
+          &amp;&amp; !is_chassis_resident(<var>cr-GW</var>)</code> where
+          <var>cr-GW</var> is the chassis resident port of <var>GW</var>,
+          stores <var>GW</var> as inport.
+        </p>
+
         <p>
           For a distributed logical router or for gateway router where
           the port is configured with <code>options:gateway_mtu</code>
diff --git a/tests/multinode.at b/tests/multinode.at
index 2b199b4bc..a051ea8c2 100644
--- a/tests/multinode.at
+++ b/tests/multinode.at
@@ -72,3 +72,72 @@  M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 20.0.0.3 | F
 ])
 
 AT_CLEANUP
+
+AT_SETUP([ovn multinode geneve pmtu])
+
+# Check that ovn-fake-multinode setup is up and running
+check_fake_multinode_setup
+
+# Delete the multinode NB and OVS resources before starting the test.
+cleanup_multinode_resources
+
+# Test East-West switching
+check multinode_nbctl ls-add sw0
+check multinode_nbctl lsp-add sw0 sw0-port1
+check multinode_nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:03 10.0.0.3 1000::3"
+check multinode_nbctl lsp-add sw0 sw0-port2
+check multinode_nbctl lsp-set-addresses sw0-port2 "50:54:00:00:00:04 10.0.0.4 1000::4"
+
+m_as ovn-chassis-1 /data/create_fake_vm.sh sw0-port1 sw0p1 50:54:00:00:00:03 10.0.0.3 24 10.0.0.1 1000::3/64 1000::a
+m_as ovn-chassis-2 /data/create_fake_vm.sh sw0-port2 sw0p2 50:54:00:00:00:04 10.0.0.4 24 10.0.0.1 1000::4/64 1000::a
+
+m_wait_for_ports_up
+
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.4 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Create the second logical switch with one port
+check multinode_nbctl ls-add sw1
+check multinode_nbctl lsp-add sw1 sw1-port1
+check multinode_nbctl lsp-set-addresses sw1-port1 "40:54:00:00:00:03 20.0.0.3 2000::3"
+
+# Create a logical router and attach both logical switches
+check multinode_nbctl lr-add lr0
+check multinode_nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01 10.0.0.1/24 1000::a/64
+check multinode_nbctl lsp-add sw0 sw0-lr0
+check multinode_nbctl lsp-set-type sw0-lr0 router
+check multinode_nbctl lsp-set-addresses sw0-lr0 router
+check multinode_nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0
+
+check multinode_nbctl lrp-add lr0 lr0-sw1 00:00:00:00:ff:02 20.0.0.1/24 2000::a/64
+check multinode_nbctl lsp-add sw1 sw1-lr0
+check multinode_nbctl lsp-set-type sw1-lr0 router
+check multinode_nbctl lsp-set-addresses sw1-lr0 router
+check multinode_nbctl lsp-set-options sw1-lr0 router-port=lr0-sw1
+
+m_as ovn-chassis-2 /data/create_fake_vm.sh sw1-port1 sw1p1 40:54:00:00:00:03 20.0.0.3 24 20.0.0.1 2000::4/64 1000::a
+
+m_wait_for_ports_up sw1-port1
+
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 20.0.0.3 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Change ptmu for the geneve tunnel
+m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 |grep -q "message too long, mtu=1142"])
+
+check multinode_nbctl lrp-set-gateway-chassis lr0-sw0 ovn-chassis-1 10
+check multinode_nbctl lrp-set-gateway-chassis lr0-sw1 ovn-chassis-2 10
+
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
+
+m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 |grep -q "message too long, mtu=1142"])
+
+AT_CLEANUP
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index a8099786c..e7910e83c 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -6480,6 +6480,9 @@  AT_CAPTURE_FILE([lrflows])
 
 # Check the flows in lr_in_admission stage
 AT_CHECK([grep lr_in_admission lrflows | grep cr-DR | sort], [0], [dnl
+  table=0 (lr_in_admission    ), priority=120  , match=(inport == "cr-DR-S1" && !is_chassis_resident("cr-DR-S1")), action=(inport = "DR-S1"; next;)
+  table=0 (lr_in_admission    ), priority=120  , match=(inport == "cr-DR-S2" && !is_chassis_resident("cr-DR-S2")), action=(inport = "DR-S2"; next;)
+  table=0 (lr_in_admission    ), priority=120  , match=(inport == "cr-DR-S3" && !is_chassis_resident("cr-DR-S3")), action=(inport = "DR-S3"; next;)
   table=0 (lr_in_admission    ), priority=50   , match=(eth.dst == 02:ac:10:01:00:01 && inport == "DR-S1" && is_chassis_resident("cr-DR-S1")), action=(xreg0[[0..47]] = 02:ac:10:01:00:01; next;)
   table=0 (lr_in_admission    ), priority=50   , match=(eth.dst == 03:ac:10:01:00:01 && inport == "DR-S2" && is_chassis_resident("cr-DR-S2")), action=(xreg0[[0..47]] = 03:ac:10:01:00:01; next;)
   table=0 (lr_in_admission    ), priority=50   , match=(eth.dst == 04:ac:10:01:00:01 && inport == "DR-S3" && is_chassis_resident("cr-DR-S3")), action=(xreg0[[0..47]] = 04:ac:10:01:00:01; next;)
@@ -6539,6 +6542,7 @@  AT_CAPTURE_FILE([lrflows])
 
 # Check the flows in lr_in_admission stage
 AT_CHECK([grep lr_in_admission lrflows | grep lrp1 | sed 's/table=../table=??/' | sort], [0], [dnl
+  table=??(lr_in_admission    ), priority=120  , match=(inport == "cr-lrp1" && !is_chassis_resident("cr-lrp1")), action=(inport = "lrp1"; next;)
   table=??(lr_in_admission    ), priority=50   , match=(eth.dst == 00:00:00:00:00:01 && inport == "lrp1" && is_chassis_resident("cr-lrp1")), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;)
   table=??(lr_in_admission    ), priority=50   , match=(eth.mcast && inport == "lrp1"), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;)
 ])
@@ -6560,6 +6564,7 @@  AT_CAPTURE_FILE([lrflows])
 
 # Check the flows in lr_in_admission stage
 AT_CHECK([grep lr_in_admission lrflows | grep lrp1 | sed 's/table=../table=??/' | sort], [0], [dnl
+  table=??(lr_in_admission    ), priority=120  , match=(inport == "cr-lrp1" && !is_chassis_resident("cr-lrp1")), action=(inport = "lrp1"; next;)
   table=??(lr_in_admission    ), priority=50   , match=(eth.dst == 00:00:00:00:00:01 && inport == "lrp1"), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;)
   table=??(lr_in_admission    ), priority=50   , match=(eth.mcast && inport == "lrp1"), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;)
 ])
@@ -6578,6 +6583,7 @@  AT_CAPTURE_FILE([lrflows])
 
 # Check the flows in lr_in_admission stage
 AT_CHECK([grep lr_in_admission lrflows | grep lrp1 | sed 's/table=../table=??/' | sort], [0], [dnl
+  table=??(lr_in_admission    ), priority=120  , match=(inport == "cr-lrp1" && !is_chassis_resident("cr-lrp1")), action=(inport = "lrp1"; next;)
   table=??(lr_in_admission    ), priority=50   , match=(eth.dst == 00:00:00:00:00:01 && inport == "lrp1" && is_chassis_resident("cr-lrp1")), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;)
   table=??(lr_in_admission    ), priority=50   , match=(eth.mcast && inport == "lrp1"), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;)
 ])