diff mbox series

[ovs-dev,RFC,4/4] ofctrl: Introduce ecmp_nexthop_monitor.

Message ID 0ffa55e773d692da73ce8af003d9d2edf840feb0.1726222583.git.lorenzo.bianconi@redhat.com
State New
Headers show
Series Introduce ECMP_nexthop monitor in ovn-controller | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Lorenzo Bianconi Sept. 13, 2024, 10:29 a.m. UTC
Introduce ecmp_nexthop_monitor in ovn-controller in order to track and
flush ecmp-symmetric reply ct entires when requested by the CMS (e.g
removing the related static ecmp routes). CT entries are flushed using
the ethernet mac address stored in ct_label.

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
---
 controller/ofctrl.c         |  73 ++++++++++++++
 controller/ofctrl.h         |   2 +
 controller/ovn-controller.c |   2 +
 tests/system-ovn.at         | 185 ++++++++++++++++++++++++++++++++++++
 4 files changed, 262 insertions(+)
diff mbox series

Patch

diff --git a/controller/ofctrl.c b/controller/ofctrl.c
index f9387d375..5987e9be3 100644
--- a/controller/ofctrl.c
+++ b/controller/ofctrl.c
@@ -45,6 +45,7 @@ 
 #include "ovn/actions.h"
 #include "lib/extend-table.h"
 #include "lib/lb.h"
+#include "lib/ovn-util.h"
 #include "openvswitch/poll-loop.h"
 #include "physical.h"
 #include "openvswitch/rconn.h"
@@ -392,6 +393,11 @@  static struct shash meter_bands;
 static void ofctrl_meter_bands_destroy(void);
 static void ofctrl_meter_bands_clear(void);
 
+static struct smap ecmp_nexthop;
+static void ecmp_nexthop_monitor_run(
+        const struct sbrec_ecmp_nexthop_table *enh_table,
+        struct ovs_list *msgs);
+
 /* MFF_* field ID for our Geneve option.  In S_TLV_TABLE_MOD_SENT, this is
  * the option we requested (we don't know whether we obtained it yet).  In
  * S_CLEAR_FLOWS or S_UPDATE_FLOWS, this is really the option we have. */
@@ -425,6 +431,7 @@  ofctrl_init(struct ovn_extend_table *group_table,
     tx_counter = rconn_packet_counter_create();
     hmap_init(&installed_lflows);
     hmap_init(&installed_pflows);
+    smap_init(&ecmp_nexthop);
     ovs_list_init(&flow_updates);
     ovn_init_symtab(&symtab);
     groups = group_table;
@@ -877,6 +884,7 @@  ofctrl_destroy(void)
     expr_symtab_destroy(&symtab);
     shash_destroy(&symtab);
     ofctrl_meter_bands_destroy();
+    smap_destroy(&ecmp_nexthop);
 }
 
 uint64_t
@@ -2306,6 +2314,68 @@  add_meter(struct ovn_extend_table_info *m_desired,
     ofctrl_meter_bands_alloc(sb_meter, m_desired, msgs);
 }
 
+static void
+ecmp_nexthop_monitor_flush_ct_entry(const char *mac, struct ovs_list *msgs)
+{
+    struct eth_addr ea;
+    if (!ovs_scan(mac, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(ea))) {
+        return;
+    }
+
+    ovs_u128 mask = {
+        /* ct_label.ecmp_reply_eth BITS[32-79] */
+        .u64.hi = 0x000000000000ffff,
+        .u64.lo = 0xffffffff00000000,
+    };
+
+    ovs_be32 lo = get_unaligned_be32((void *)&ea.be16[1]);
+    ovs_u128 nexthop = {
+        .u64.hi = ntohs(ea.be16[0]),
+        .u64.lo = (uint64_t) ntohl(lo) << 32,
+    };
+
+    struct ofp_ct_match match = {
+        .labels = nexthop,
+        .labels_mask = mask,
+    };
+    struct ofpbuf *msg = ofp_ct_match_encode(&match, NULL,
+                                             rconn_get_version(swconn));
+    ovs_list_push_back(msgs, &msg->list_node);
+}
+
+static void
+ecmp_nexthop_monitor_run(const struct sbrec_ecmp_nexthop_table *enh_table,
+                         struct ovs_list *msgs)
+{
+    struct sset ecmp_sb_only = SSET_INITIALIZER(&ecmp_sb_only);
+    struct simap mac_count = SIMAP_INITIALIZER(&mac_count);
+
+    struct smap_node *node;
+    SMAP_FOR_EACH_SAFE (node, &ecmp_nexthop) {
+        simap_increase(&mac_count, node->value, 1);
+    }
+
+    const struct sbrec_ecmp_nexthop *sbrec_ecmp_nexthop;
+    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sbrec_ecmp_nexthop, enh_table) {
+        smap_replace(&ecmp_nexthop, sbrec_ecmp_nexthop->nexthop,
+                     sbrec_ecmp_nexthop->mac);
+        sset_add(&ecmp_sb_only, sbrec_ecmp_nexthop->nexthop);
+    }
+
+    SMAP_FOR_EACH_SAFE (node, &ecmp_nexthop) {
+        /* Do not flush CT entries if the share the same mac address. */
+        if (!sset_contains(&ecmp_sb_only, node->key)) {
+            if (simap_get(&mac_count, node->value) == 1) {
+                ecmp_nexthop_monitor_flush_ct_entry(node->value, msgs);
+            }
+            smap_remove_node(&ecmp_nexthop, node);
+        }
+    }
+
+    sset_destroy(&ecmp_sb_only);
+    simap_destroy(&mac_count);
+}
+
 static void
 installed_flow_add(struct ovn_flow *d,
                    struct ofputil_bundle_ctrl_msg *bc,
@@ -2664,6 +2734,7 @@  ofctrl_put(struct ovn_desired_flow_table *lflow_table,
            struct shash *pending_ct_zones,
            struct hmap *pending_lb_tuples,
            struct ovsdb_idl_index *sbrec_meter_by_name,
+           const struct sbrec_ecmp_nexthop_table *enh_table,
            uint64_t req_cfg,
            bool lflows_changed,
            bool pflows_changed)
@@ -2704,6 +2775,8 @@  ofctrl_put(struct ovn_desired_flow_table *lflow_table,
     /* OpenFlow messages to send to the switch to bring it up-to-date. */
     struct ovs_list msgs = OVS_LIST_INITIALIZER(&msgs);
 
+    ecmp_nexthop_monitor_run(enh_table, &msgs);
+
     /* Iterate through ct zones that need to be flushed. */
     struct shash_node *iter;
     SHASH_FOR_EACH(iter, pending_ct_zones) {
diff --git a/controller/ofctrl.h b/controller/ofctrl.h
index 129e3b6ad..33953a8a4 100644
--- a/controller/ofctrl.h
+++ b/controller/ofctrl.h
@@ -31,6 +31,7 @@  struct ofpbuf;
 struct ovsrec_bridge;
 struct ovsrec_open_vswitch_table;
 struct sbrec_meter_table;
+struct sbrec_ecmp_nexthop_table;
 struct shash;
 
 struct ovn_desired_flow_table {
@@ -59,6 +60,7 @@  void ofctrl_put(struct ovn_desired_flow_table *lflow_table,
                 struct shash *pending_ct_zones,
                 struct hmap *pending_lb_tuples,
                 struct ovsdb_idl_index *sbrec_meter_by_name,
+                const struct sbrec_ecmp_nexthop_table *enh_table,
                 uint64_t nb_cfg,
                 bool lflow_changed,
                 bool pflow_changed);
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index b08faaf09..481baa52b 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -5728,6 +5728,8 @@  main(int argc, char *argv[])
                                    &ct_zones_data->ctx.pending,
                                    &lb_data->removed_tuples,
                                    sbrec_meter_by_name,
+                                   sbrec_ecmp_nexthop_table_get(
+                                        ovnsb_idl_loop.idl),
                                    ofctrl_seqno_get_req_cfg(),
                                    engine_node_changed(&en_lflow_output),
                                    engine_node_changed(&en_pflow_output));
diff --git a/tests/system-ovn.at b/tests/system-ovn.at
index e065b1079..240a2c1f9 100644
--- a/tests/system-ovn.at
+++ b/tests/system-ovn.at
@@ -13884,3 +13884,188 @@  OVS_TRAFFIC_VSWITCHD_STOP(["/.*error receiving.*/d
 /.*terminating with signal 15.*/d"])
 AT_CLEANUP
 ])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ECMP Flush CT entries])
+AT_KEYWORDS([ecmp])
+ovn_start
+OVS_TRAFFIC_VSWITCHD_START()
+
+ADD_BR([br-int])
+ADD_BR([br-ext])
+ADD_BR([br-ecmp])
+
+ovs-ofctl add-flow br-ext action=normal
+ovs-ofctl add-flow br-ecmp action=normal
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+ovs-vsctl set Open_vSwitch . external-ids:arp-max-timeout-sec=1
+
+check ovn-nbctl lr-add R1
+check ovn-nbctl set logical_router R1 options:chassis=hv1
+
+check ovn-nbctl ls-add sw0
+check ovn-nbctl ls-add public
+
+check ovn-nbctl lrp-add R1 rp-sw0 00:00:01:01:02:03 192.168.1.1/24
+check ovn-nbctl lrp-add R1 rp-public 00:00:02:01:02:03 172.16.1.1/24
+
+ovn-nbctl lsp-add sw0 sw0-rp -- set Logical_Switch_Port sw0-rp \
+    type=router options:router-port=rp-sw0 \
+    -- lsp-set-addresses sw0-rp router
+
+ovn-nbctl lsp-add public public-rp -- set Logical_Switch_Port public-rp \
+    type=router options:router-port=rp-public \
+    -- lsp-set-addresses public-rp router
+
+ADD_NAMESPACES(alice)
+ADD_VETH(alice, alice, br-int, "192.168.1.2/24", "f0:00:00:01:02:03", \
+         "192.168.1.1")
+check ovn-nbctl lsp-add sw0 alice \
+    -- lsp-set-addresses alice "f0:00:00:01:02:03 192.168.1.2"
+
+AT_CHECK([ovs-vsctl set Open_vSwitch . external-ids:ovn-bridge-mappings=phynet:br-ext])
+check ovn-nbctl lsp-add public public1 \
+        -- lsp-set-addresses public1 unknown \
+        -- lsp-set-type public1 localnet \
+        -- lsp-set-options public1 network_name=phynet
+
+ADD_NAMESPACES(ecmp-path0)
+ADD_VETH(ecmp-p01, ecmp-path0, br-ext, "172.16.1.2/24", "f0:00:00:01:02:04", "172.16.1.1")
+ADD_VETH(ecmp-p02, ecmp-path0, br-ecmp, "172.16.2.2/24", "f0:00:00:01:03:04")
+
+ADD_NAMESPACES(ecmp-path1)
+ADD_VETH(ecmp-p11, ecmp-path1, br-ext, "172.16.1.3/24", "f0:00:00:01:02:05", "172.16.1.1")
+ADD_VETH(ecmp-p12, ecmp-path1, br-ecmp, "172.16.2.3/24", "f0:00:00:01:03:05")
+
+ADD_NAMESPACES(bob)
+ADD_VETH(bob, bob, br-ecmp, "172.16.2.10/24", "f0:00:00:01:02:06", "172.16.2.2")
+
+check ovn-nbctl --ecmp-symmetric-reply lr-route-add R1 172.16.2.0/24 172.16.1.2
+check ovn-nbctl --ecmp-symmetric-reply lr-route-add R1 172.16.2.0/24 172.16.1.3
+
+ovn-nbctl --wait=hv sync
+NETNS_DAEMONIZE([alice], [nc -l -k 80], [alice.pid])
+
+NS_CHECK_EXEC([bob], [ping -q -c 3 -i 0.3 -w 2 192.168.1.2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+NS_CHECK_EXEC([bob], [nc -z 192.168.1.2 80], [0])
+
+wait_row_count ECMP_Nexthop 2
+wait_column 'f0:00:00:01:02:04' ECMP_Nexthop mac nexthop='172.16.1.2'
+wait_column 'f0:00:00:01:02:05' ECMP_Nexthop mac nexthop='172.16.1.3'
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.2.10) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' | sort], [0], [dnl
+icmp,orig=(src=172.16.2.10,dst=192.168.1.2,id=<cleared>,type=8,code=0),reply=(src=192.168.1.2,dst=172.16.2.10,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0xf0000001020400000000
+tcp,orig=(src=172.16.2.10,dst=192.168.1.2,sport=<cleared>,dport=<cleared>),reply=(src=192.168.1.2,dst=172.16.2.10,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0xf0000001020400000000,protoinfo=(state=<cleared>)
+])
+
+# Change bob default IP address
+NS_CHECK_EXEC([bob], [ip route del 0.0.0.0/0 via 172.16.2.2])
+NS_CHECK_EXEC([bob], [ip route add 0.0.0.0/0 via 172.16.2.3])
+
+NS_CHECK_EXEC([bob], [ping -q -c 3 -i 0.3 -w 2 192.168.1.2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+NS_CHECK_EXEC([bob], [nc -z 192.168.1.2 80], [0])
+
+wait_row_count ECMP_Nexthop 2
+check_column 'f0:00:00:01:02:04' ECMP_Nexthop mac nexthop='172.16.1.2'
+check_column 'f0:00:00:01:02:05' ECMP_Nexthop mac nexthop='172.16.1.3'
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.2.10) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' | sort], [0], [dnl
+icmp,orig=(src=172.16.2.10,dst=192.168.1.2,id=<cleared>,type=8,code=0),reply=(src=192.168.1.2,dst=172.16.2.10,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0xf0000001020400000000
+icmp,orig=(src=172.16.2.10,dst=192.168.1.2,id=<cleared>,type=8,code=0),reply=(src=192.168.1.2,dst=172.16.2.10,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0xf0000001020500000000
+tcp,orig=(src=172.16.2.10,dst=192.168.1.2,sport=<cleared>,dport=<cleared>),reply=(src=192.168.1.2,dst=172.16.2.10,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0xf0000001020400000000,protoinfo=(state=<cleared>)
+tcp,orig=(src=172.16.2.10,dst=192.168.1.2,sport=<cleared>,dport=<cleared>),reply=(src=192.168.1.2,dst=172.16.2.10,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0xf0000001020500000000,protoinfo=(state=<cleared>)
+])
+
+# Remove first ECMP route
+check ovn-nbctl lr-route-del  R1 172.16.2.0/24 172.16.1.2
+wait_row_count ECMP_Nexthop 1
+check_column 'f0:00:00:01:02:05' ECMP_Nexthop mac nexthop='172.16.1.3'
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.2.10) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' | sort], [0], [dnl
+icmp,orig=(src=172.16.2.10,dst=192.168.1.2,id=<cleared>,type=8,code=0),reply=(src=192.168.1.2,dst=172.16.2.10,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0xf0000001020500000000
+tcp,orig=(src=172.16.2.10,dst=192.168.1.2,sport=<cleared>,dport=<cleared>),reply=(src=192.168.1.2,dst=172.16.2.10,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0xf0000001020500000000,protoinfo=(state=<cleared>)
+])
+
+# Add the route back and verify we do not flush if we have multiple next-hops with the same mac address
+check ovn-nbctl --ecmp-symmetric-reply lr-route-add R1 172.16.2.0/24 172.16.1.2
+wait_row_count ECMP_Nexthop 2
+wait_column 'f0:00:00:01:02:04' ECMP_Nexthop mac nexthop='172.16.1.2'
+wait_column 'f0:00:00:01:02:05' ECMP_Nexthop mac nexthop='172.16.1.3'
+
+NS_CHECK_EXEC([ecmp-path0], [ip link set dev ecmp-p01 address f0:00:00:01:02:05])
+wait_column 'f0:00:00:01:02:05' ECMP_Nexthop mac nexthop='172.16.1.2'
+
+# Change bob default IP address
+NS_CHECK_EXEC([bob], [ip route del 0.0.0.0/0 via 172.16.2.3])
+NS_CHECK_EXEC([bob], [ip route add 0.0.0.0/0 via 172.16.2.2])
+
+NS_CHECK_EXEC([bob], [ping -q -c 3 -i 0.3 -w 2 192.168.1.2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+NS_CHECK_EXEC([bob], [nc -z 192.168.1.2 80], [0])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.2.10) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' | sort], [0], [dnl
+icmp,orig=(src=172.16.2.10,dst=192.168.1.2,id=<cleared>,type=8,code=0),reply=(src=192.168.1.2,dst=172.16.2.10,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0xf0000001020500000000
+tcp,orig=(src=172.16.2.10,dst=192.168.1.2,sport=<cleared>,dport=<cleared>),reply=(src=192.168.1.2,dst=172.16.2.10,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0xf0000001020500000000,protoinfo=(state=<cleared>)
+])
+
+# Remove first ECMP route
+check ovn-nbctl lr-route-del R1 172.16.2.0/24 172.16.1.2
+wait_row_count ECMP_Nexthop 1
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.2.10) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' | sort], [0], [dnl
+icmp,orig=(src=172.16.2.10,dst=192.168.1.2,id=<cleared>,type=8,code=0),reply=(src=192.168.1.2,dst=172.16.2.10,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0xf0000001020500000000
+tcp,orig=(src=172.16.2.10,dst=192.168.1.2,sport=<cleared>,dport=<cleared>),reply=(src=192.168.1.2,dst=172.16.2.10,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0xf0000001020500000000,protoinfo=(state=<cleared>)
+])
+
+# Remove second ECMP route
+check ovn-nbctl lr-route-del R1
+wait_row_count ECMP_Nexthop 0
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.2.10) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' | sort], [0], [dnl
+])
+
+OVS_APP_EXIT_AND_WAIT([ovn-controller])
+
+as ovn-sb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as ovn-nb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as northd
+OVS_APP_EXIT_AND_WAIT([ovn-northd])
+
+as
+OVS_TRAFFIC_VSWITCHD_STOP(["/.*error receiving.*/d
+/.*terminating with signal 15.*/d"])
+AT_CLEANUP
+])