@@ -388,9 +388,24 @@ struct meter_band_entry {
static struct shash meter_bands;
+static struct hmap ecmp_nexthop_map; /* Of 'struct ecmp_nexthop_entry'. */
+struct ecmp_nexthop_entry { /* A next hop learned from the SB table. */
+ struct hmap_node node; /* In 'ecmp_nexthop_map', hashed on 'nexthop'. */
+ bool erase; /* Stale mark: set per run, cleared if still in SB. */
+
+ char *nexthop; /* Owned copy of the SB 'nexthop' string. */
+ int id; /* SB option "nexthop-id", -1 if unset. */
+};
+
static void ofctrl_meter_bands_destroy(void);
static void ofctrl_meter_bands_clear(void);
+static void ecmp_nexthop_monitor_destroy(void);
+static void ecmp_nexthop_monitor_run(
+ const struct sbrec_ecmp_nexthop_table *enh_table,
+ struct ovs_list *msgs);
+
+
/* MFF_* field ID for our Geneve option. In S_TLV_TABLE_MOD_SENT, this is
* the option we requested (we don't know whether we obtained it yet). In
* S_CLEAR_FLOWS or S_UPDATE_FLOWS, this is really the option we have. */
@@ -429,6 +444,7 @@ ofctrl_init(struct ovn_extend_table *group_table,
groups = group_table;
meters = meter_table;
shash_init(&meter_bands);
+ hmap_init(&ecmp_nexthop_map);
}
/* S_NEW, for a new connection.
@@ -883,6 +899,7 @@ ofctrl_destroy(void)
expr_symtab_destroy(&symtab);
shash_destroy(&symtab);
ofctrl_meter_bands_destroy();
+ ecmp_nexthop_monitor_destroy();
}
uint64_t
@@ -2306,6 +2323,88 @@ add_meter(struct ovn_extend_table_info *m_desired,
ofctrl_meter_bands_alloc(sb_meter, m_desired, msgs);
}
+/* Frees 'e' together with the next hop string it owns.  Callers in this file
+ * take 'e' out of 'ecmp_nexthop_map' before calling this function.
+ *
+ * If 'msgs' is nonnull, a conntrack flush request matching the entries whose
+ * ct_label carries 'e->id' in bits 96-127 is first appended to 'msgs'.  No
+ * request is queued when 'e->id' is negative, that is, when the SB row had no
+ * "nexthop-id" option: the -1 placeholder would otherwise be sign-extended to
+ * 64 bits and encoded as label 0xffffffff, which was never assigned to a next
+ * hop (NOTE(review): confirm that skipping the flush is the desired policy
+ * for rows without an ID). */
+static void
+ecmp_nexthop_monitor_free_entry(struct ecmp_nexthop_entry *e,
+                                struct ovs_list *msgs)
+{
+    if (msgs && e->id >= 0) {
+        ovs_u128 mask = {
+            /* ct_labels.label BITS[96-127] */
+            .u64.hi = 0xffffffff00000000,
+        };
+        ovs_u128 nexthop = {
+            /* Widen before shifting so the ID lands in the upper 32 bits. */
+            .u64.hi = (uint64_t) e->id << 32,
+        };
+        struct ofp_ct_match match = {
+            .labels = nexthop,
+            .labels_mask = mask,
+        };
+        struct ofpbuf *msg = ofp_ct_match_encode(&match, NULL,
+                                                 rconn_get_version(swconn));
+        ovs_list_push_back(msgs, &msg->list_node);
+    }
+    free(e->nexthop);
+    free(e);
+}
+
+/* Empties 'ecmp_nexthop_map', freeing every entry, and then destroys the map
+ * itself.  Entries are released with a null message list, so no conntrack
+ * flush request is generated here. */
+static void
+ecmp_nexthop_monitor_destroy(void)
+{
+    struct ecmp_nexthop_entry *entry;
+
+    HMAP_FOR_EACH_POP (entry, node, &ecmp_nexthop_map) {
+        ecmp_nexthop_monitor_free_entry(entry, NULL);
+    }
+    hmap_destroy(&ecmp_nexthop_map);
+}
+
+/* Returns the entry of 'ecmp_nexthop_map' whose next hop string is equal to
+ * 'nexthop', or NULL if the map has no such entry.  'nexthop' is only read,
+ * hence the const qualifier. */
+static struct ecmp_nexthop_entry *
+ecmp_nexthop_monitor_lookup(const char *nexthop)
+{
+    /* Must be the same hash function and basis used when inserting. */
+    uint32_t hash = hash_string(nexthop, 0);
+    struct ecmp_nexthop_entry *e;
+
+    HMAP_FOR_EACH_WITH_HASH (e, node, hash, &ecmp_nexthop_map) {
+        if (!strcmp(e->nexthop, nexthop)) {
+            return e;
+        }
+    }
+    return NULL;
+}
+
+/* Brings 'ecmp_nexthop_map' in line with the rows of 'enh_table'.
+ *
+ * A next hop present in 'enh_table' but not in the map is added, with its ID
+ * taken from the row's "nexthop-id" option (-1 if the option is missing).  A
+ * next hop present in the map but no longer in 'enh_table' is removed from
+ * the map and handed, together with 'msgs', to
+ * ecmp_nexthop_monitor_free_entry(). */
+static void
+ecmp_nexthop_monitor_run(const struct sbrec_ecmp_nexthop_table *enh_table,
+                         struct ovs_list *msgs)
+{
+    struct ecmp_nexthop_entry *entry;
+
+    /* Mark: until proven otherwise, every known next hop is stale. */
+    HMAP_FOR_EACH (entry, node, &ecmp_nexthop_map) {
+        entry->erase = true;
+    }
+
+    const struct sbrec_ecmp_nexthop *sb_nh;
+    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_nh, enh_table) {
+        entry = ecmp_nexthop_monitor_lookup(sb_nh->nexthop);
+        if (entry) {
+            /* Still configured: keep it. */
+            entry->erase = false;
+            continue;
+        }
+
+        /* First time this next hop is seen; xzalloc() leaves 'erase' false
+         * so the new entry survives the sweep below. */
+        entry = xzalloc(sizeof *entry);
+        entry->nexthop = xstrdup(sb_nh->nexthop);
+        entry->id = smap_get_int(&sb_nh->options, "nexthop-id", -1);
+        hmap_insert(&ecmp_nexthop_map, &entry->node,
+                    hash_string(entry->nexthop, 0));
+    }
+
+    /* Sweep: drop whatever was not matched by a row above. */
+    HMAP_FOR_EACH_SAFE (entry, node, &ecmp_nexthop_map) {
+        if (!entry->erase) {
+            continue;
+        }
+        hmap_remove(&ecmp_nexthop_map, &entry->node);
+        ecmp_nexthop_monitor_free_entry(entry, msgs);
+    }
+}
+
static void
installed_flow_add(struct ovn_flow *d,
struct ofputil_bundle_ctrl_msg *bc,
@@ -2664,6 +2763,7 @@ ofctrl_put(struct ovn_desired_flow_table *lflow_table,
struct shash *pending_ct_zones,
struct hmap *pending_lb_tuples,
struct ovsdb_idl_index *sbrec_meter_by_name,
+ const struct sbrec_ecmp_nexthop_table *enh_table,
uint64_t req_cfg,
bool lflows_changed,
bool pflows_changed)
@@ -2704,6 +2804,8 @@ ofctrl_put(struct ovn_desired_flow_table *lflow_table,
/* OpenFlow messages to send to the switch to bring it up-to-date. */
struct ovs_list msgs = OVS_LIST_INITIALIZER(&msgs);
+ ecmp_nexthop_monitor_run(enh_table, &msgs);
+
/* Iterate through ct zones that need to be flushed. */
struct shash_node *iter;
SHASH_FOR_EACH(iter, pending_ct_zones) {
@@ -31,6 +31,7 @@ struct ofpbuf;
struct ovsrec_bridge;
struct ovsrec_open_vswitch_table;
struct sbrec_meter_table;
+struct sbrec_ecmp_nexthop_table;
struct shash;
struct ovn_desired_flow_table {
@@ -59,6 +60,7 @@ void ofctrl_put(struct ovn_desired_flow_table *lflow_table,
struct shash *pending_ct_zones,
struct hmap *pending_lb_tuples,
struct ovsdb_idl_index *sbrec_meter_by_name,
+ const struct sbrec_ecmp_nexthop_table *enh_table,
uint64_t nb_cfg,
bool lflow_changed,
bool pflow_changed);
@@ -5945,6 +5945,8 @@ main(int argc, char *argv[])
&ct_zones_data->pending,
&lb_data->removed_tuples,
sbrec_meter_by_name,
+ sbrec_ecmp_nexthop_table_get(
+ ovnsb_idl_loop.idl),
ofctrl_seqno_get_req_cfg(),
engine_node_changed(&en_lflow_output),
engine_node_changed(&en_pflow_output));
@@ -1054,3 +1054,266 @@ OVS_TRAFFIC_VSWITCHD_STOP(["
"])
AT_CLEANUP
])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ECMP symmetric reply - kmod])
+AT_KEYWORDS([ecmp])
+
+CHECK_CONNTRACK()
+ovn_start
+
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+ -- set Open_vSwitch . external-ids:system-id=hv1 \
+ -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+ -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+ -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+ -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# Alice is connected to gateway router R1. R1 is connected to two "external"
+# routers, R2 and R3 via an "ext" switch.
+# Bob is connected to both R2 and R3. R1 contains two ECMP routes, one through R2
+# and one through R3, to Bob.
+#
+# alice -- R1 -- ext ---- R2
+# | \
+# | bob
+# | /
+# + ----- R3
+#
+# For this test, Bob sends request traffic through R2 to Alice. We want to ensure that
+# all response traffic from Alice is routed through R2 as well.
+
+ovn-nbctl create Logical_Router name=R1 options:chassis=hv1
+ovn-nbctl create Logical_Router name=R2
+ovn-nbctl create Logical_Router name=R3
+
+ovn-nbctl ls-add alice
+ovn-nbctl ls-add bob
+ovn-nbctl ls-add ext
+
+# connect alice to R1
+ovn-nbctl lrp-add R1 alice 00:00:01:01:02:03 10.0.0.1/24 fd01::1/64
+ovn-nbctl lsp-add alice rp-alice -- set Logical_Switch_Port rp-alice \
+ type=router options:router-port=alice addresses='"00:00:01:01:02:03"'
+
+# connect bob to R2
+ovn-nbctl lrp-add R2 R2_bob 00:00:02:01:02:03 172.16.0.2/16 fd07::2/64
+ovn-nbctl lsp-add bob rp2-bob -- set Logical_Switch_Port rp2-bob \
+ type=router options:router-port=R2_bob addresses='"00:00:02:01:02:03"'
+
+# connect bob to R3
+ovn-nbctl lrp-add R3 R3_bob 00:00:02:01:02:04 172.16.0.3/16 fd07::3/64
+ovn-nbctl lsp-add bob rp3-bob -- set Logical_Switch_Port rp3-bob \
+ type=router options:router-port=R3_bob addresses='"00:00:02:01:02:04"'
+
+# Connect R1 to ext
+ovn-nbctl lrp-add R1 R1_ext 00:00:04:01:02:03 20.0.0.1/24 fd02::1/64
+ovn-nbctl lsp-add ext r1-ext -- set Logical_Switch_Port r1-ext \
+ type=router options:router-port=R1_ext addresses='"00:00:04:01:02:03"'
+
+# Connect R2 to ext
+ovn-nbctl lrp-add R2 R2_ext 00:00:04:01:02:04 20.0.0.2/24 fd02::2/64
+ovn-nbctl lsp-add ext r2-ext -- set Logical_Switch_Port r2-ext \
+ type=router options:router-port=R2_ext addresses='"00:00:04:01:02:04"'
+
+# Connect R3 to ext
+ovn-nbctl lrp-add R3 R3_ext 00:00:04:01:02:05 20.0.0.3/24 fd02::3/64
+ovn-nbctl lsp-add ext r3-ext -- set Logical_Switch_Port r3-ext \
+ type=router options:router-port=R3_ext addresses='"00:00:04:01:02:05"'
+
+# Install ECMP routes for alice.
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 10.0.0.0/24 20.0.0.2
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 10.0.0.0/24 20.0.0.3
+
+# Static Routes
+ovn-nbctl lr-route-add R2 10.0.0.0/24 20.0.0.1
+ovn-nbctl lr-route-add R3 10.0.0.0/24 20.0.0.1
+
+# Logical port 'alice1' in switch 'alice'.
+ADD_NAMESPACES(alice1)
+# Only send 1 router solicitation as any additional ones can cause datapath
+# flows to get evicted, causing unexpected failures below.
+NS_CHECK_EXEC([alice1], [sysctl -w net.ipv6.conf.default.router_solicitations=1], [0], [dnl
+net.ipv6.conf.default.router_solicitations = 1
+])
+ADD_VETH(alice1, alice1, br-int, "10.0.0.2/24", "f0:00:00:01:02:04", \
+ "10.0.0.1")
+NS_CHECK_EXEC([alice1], [ip -6 addr add fd01::2/64 dev alice1 nodad])
+NS_CHECK_EXEC([alice1], [ip -6 route add default via fd01::1])
+NS_CHECK_EXEC([alice1], [ip -6 neigh add fd01::1 lladdr 00:00:01:01:02:03 dev alice1], [0])
+ovn-nbctl lsp-add alice alice1 \
+-- lsp-set-addresses alice1 "f0:00:00:01:02:04 10.0.0.2 fd01::2"
+
+# Logical port 'bob1' in switch 'bob'.
+ADD_NAMESPACES(bob1)
+# Only send 1 router solicitation as any additional ones can cause datapath
+# flows to get evicted, causing unexpected failures below.
+NS_CHECK_EXEC([bob1], [sysctl -w net.ipv6.conf.default.router_solicitations=1], [0], [dnl
+net.ipv6.conf.default.router_solicitations = 1
+])
+ADD_VETH(bob1, bob1, br-int, "172.16.0.1/16", "f0:00:00:01:02:06", \
+ "172.16.0.2")
+NS_CHECK_EXEC([bob1], [ip -6 addr add fd07::1/64 dev bob1 nodad])
+NS_CHECK_EXEC([bob1], [ip -6 route add default via fd07::2])
+NS_CHECK_EXEC([bob1], [ip -6 neigh add fd07::2 lladdr 00:00:02:01:02:03 dev bob1])
+NS_CHECK_EXEC([bob1], [ip -6 neigh add fd07::3 lladdr 00:00:01:01:02:04 dev bob1])
+
+# Add neighbour MAC addresses to avoid sending IPv6 NS messages which could
+# cause datapath flows to be evicted
+ovn-nbctl lsp-add bob bob1 \
+-- lsp-set-addresses bob1 "f0:00:00:01:02:06 172.16.0.1 fd07::1"
+
+# Ensure ovn-controller is caught up
+ovn-nbctl --wait=hv sync
+
+on_exit 'ovs-ofctl dump-flows br-int'
+
+NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
+NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
+NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Ensure conntrack entry is present. We should not try to predict
+# the tunnel key for the output port, so we strip it from the labels
+# and just ensure that the known ethernet address is present.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
+sed -e 's/labels=0x[[0-9]]/labels=0x?/'], [0], [dnl
+icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000000401020400000000
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000000401020400000000,protoinfo=(state=<cleared>)
+])
+
+# Ensure datapaths show conntrack states as expected
+# Like with conntrack entries, we shouldn't try to predict
+# port binding tunnel keys. So omit them from expected labels.
+ovs-appctl dpctl/dump-flows | grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x401020400000000/.*)'
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(+new-est-rpl+trk)' | grep '401020400000000' -c], [0], [dnl
+2
+])
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(-new+est+rpl+trk)' | grep '401020400000000)' -c], [0], [dnl
+2
+])
+
+# Flush conntrack entries for easier output parsing of next test.
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+# Change the R2_ext L2 address and check that the reply is properly updated.
+ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
+ovn-nbctl set Logical_Switch_Port r2-ext \
+ type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
+
+# Wait for ovn-controller before sending traffic
+ovn-nbctl --wait=hv sync
+
+NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
+NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(+new-est-rpl+trk)' | grep '1001020400000000/.*)' -c], [0], [dnl
+2
+])
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(-new+est+rpl+trk)' | grep '1001020400000000)' -c], [0], [dnl
+2
+])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 1001020400000000 | FORMAT_CT(172.16.0.1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
+sed -e 's/labels=0x[[0-9]]/labels=0x?/' | sort], [0], [dnl
+icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
+])
+# Check that entries in tables 76 and 77 expire w/o traffic
+OVS_WAIT_UNTIL([
+test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH_MAC, n_packets') -eq 0
+])
+OVS_WAIT_UNTIL([
+test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH, n_packets') -eq 0
+])
+
+# Flush connection tracking entries
+ovn-nbctl --wait=hv lr-route-del R1
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])
+
+# Install ECMP routes for alice.
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 fd01::/126 fd02::2
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 fd01::/126 fd02::3
+
+# Static Routes
+ovn-nbctl lr-route-add R2 fd01::/64 fd02::1
+ovn-nbctl lr-route-add R3 fd01::/64 fd02::1
+
+NETNS_DAEMONIZE([alice1], [nc -6 -l -k 8080], [alice2.pid])
+NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 8080], [0])
+NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 fd01::2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Ensure conntrack entry is present. We should not try to predict
+# the tunnel key for the output port, so we strip it from the labels
+# and just ensure that the known ethernet address is present.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
+sed -e 's/labels=0x[[0-9]]/labels=0x?/' | sort], [0], [dnl
+icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
+])
+
+# Flush conntrack entries for easier output parsing of next test.
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Change the R2_ext L2 address and check that the reply is properly updated.
+ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
+ovn-nbctl --wait=hv set Logical_Switch_Port r2-ext \
+ type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
+
+NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 8080], [0])
+NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 fd01::2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 1001020400000000 | FORMAT_CT(fd01::2) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
+sed -e 's/labels=0x[[0-9]]/labels=0x?/'], [0], [dnl
+icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
+])
+
+# Flush connection tracking entries
+ovn-nbctl --wait=hv lr-route-del R1
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2)])
+
+ovs-ofctl dump-flows br-int
+
+OVS_APP_EXIT_AND_WAIT([ovn-controller])
+
+as ovn-sb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as ovn-nb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as northd
+OVS_APP_EXIT_AND_WAIT([ovn-northd])
+
+as
+OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
+/connection dropped.*/d"])
+
+AT_CLEANUP
+])
@@ -6175,6 +6175,10 @@ OVS_WAIT_UNTIL([
test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH, n_packets') -eq 0
])
+# Flush connection tracking entries
+ovn-nbctl --wait=hv lr-route-del R1
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])
+
ovs-ofctl dump-flows br-int
OVS_APP_EXIT_AND_WAIT([ovn-controller])
Introduce ecmp_nexthop_monitor in ovn-controller in order to track and flush ecmp-symmetric reply ct entries when requested by the CMS (e.g. removing the related static routes). Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com> --- controller/ofctrl.c | 102 ++++++++++++++ controller/ofctrl.h | 2 + controller/ovn-controller.c | 2 + tests/system-ovn-kmod.at | 263 ++++++++++++++++++++++++++++++++++++ tests/system-ovn.at | 4 + 5 files changed, 373 insertions(+)