Message ID | 20240117054745.4027120-4-hzhou@ovn.org |
---|---|
State | Accepted |
Headers | show |
Series | Support VIF-based local encap IPs selection. | expand |
Context | Check | Description |
---|---|---|
ovsrobot/apply-robot | success | apply and check: success |
ovsrobot/github-robot-_Build_and_Test | success | github build: passed |
ovsrobot/github-robot-_ovn-kubernetes | success | github build: passed |
On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote: > Commit dd527a283cd8 partially supported multiple encap IPs. It supported > remote encap IP selection based on the destination VIF's encap_ip > configuration. This patch adds the support for selecting local encap IP > based on the source VIF's encap_ip configuration. > > Co-authored-by: Lei Huang <leih@nvidia.com> > Signed-off-by: Lei Huang <leih@nvidia.com> > Signed-off-by: Han Zhou <hzhou@ovn.org> > --- > Hi Han and Lei, thank you for the patch, I have a couple of comments/questions down below. NEWS | 3 + > controller/chassis.c | 2 +- > controller/local_data.c | 2 +- > controller/local_data.h | 2 +- > controller/ovn-controller.8.xml | 30 ++++++- > controller/ovn-controller.c | 49 ++++++++++++ > controller/physical.c | 134 ++++++++++++++++++++++---------- > controller/physical.h | 2 + > include/ovn/logical-fields.h | 4 +- > ovn-architecture.7.xml | 18 ++++- > tests/ovn.at | 51 +++++++++++- > 11 files changed, 243 insertions(+), 54 deletions(-) > > diff --git a/NEWS b/NEWS > index 5f267b4c64cc..5a3eed608617 100644 > --- a/NEWS > +++ b/NEWS > @@ -14,6 +14,9 @@ Post v23.09.0 > - ovn-northd-ddlog has been removed. > - A new LSP option "enable_router_port_acl" has been added to enable > conntrack for the router port whose peer is l3dgw_port if set it true. > + - Support selecting encapsulation IP based on the source/destination > VIF's > + settting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for more > + details. > > OVN v23.09.0 - 15 Sep 2023 > -------------------------- > diff --git a/controller/chassis.c b/controller/chassis.c > index a6f13ccc42d5..55f2beb37674 100644 > --- a/controller/chassis.c > +++ b/controller/chassis.c > @@ -61,7 +61,7 @@ struct ovs_chassis_cfg { > > /* Set of encap types parsed from the 'ovn-encap-type' external-id. */ > struct sset encap_type_set; > - /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. 
*/ > + /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */ > struct sset encap_ip_set; > /* Interface type list formatted in the OVN-SB Chassis required > format. */ > struct ds iface_types; > diff --git a/controller/local_data.c b/controller/local_data.c > index a9092783958f..8606414f8728 100644 > --- a/controller/local_data.c > +++ b/controller/local_data.c > @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap *chassis_tunnels) > */ > struct chassis_tunnel * > chassis_tunnel_find(const struct hmap *chassis_tunnels, const char > *chassis_id, > - char *remote_encap_ip, char *local_encap_ip) > + char *remote_encap_ip, const char *local_encap_ip) > nit: Unrelated change. { > /* > * If the specific encap_ip is given, look for the chassisid_ip entry, > diff --git a/controller/local_data.h b/controller/local_data.h > index bab95bcc3824..ca3905bd20e6 100644 > --- a/controller/local_data.h > +++ b/controller/local_data.h > @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes( > struct chassis_tunnel *chassis_tunnel_find(const struct hmap > *chassis_tunnels, > const char *chassis_id, > char *remote_encap_ip, > - char *local_encap_ip); > + const char *local_encap_ip); > Same as above. > bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels, > const char *chassis_name, > diff --git a/controller/ovn-controller.8.xml > b/controller/ovn-controller.8.xml > index efa65e3fd927..5ebef048d721 100644 > --- a/controller/ovn-controller.8.xml > +++ b/controller/ovn-controller.8.xml > @@ -176,10 +176,32 @@ > > <dt><code>external_ids:ovn-encap-ip</code></dt> > <dd> > - The IP address that a chassis should use to connect to this node > - using encapsulation types specified by > - <code>external_ids:ovn-encap-type</code>. Multiple encapsulation > IPs > - may be specified with a comma-separated list. 
> + <p> > + The IP address that a chassis should use to connect to this node > + using encapsulation types specified by > + <code>external_ids:ovn-encap-type</code>. Multiple > encapsulation IPs > + may be specified with a comma-separated list. > + </p> > + <p> > + In scenarios where multiple encapsulation IPs are present, > distinct > + tunnels are established for each remote chassis. These tunnels > are > + differentiated by setting unique <code>options:local_ip</code> > and > + <code>options:remote_ip</code> values in the tunnel interface. > When > + transmitting a packet to a remote chassis, the selection of > local_ip > + is guided by the <code>Interface:external_ids:encap-ip</code> > from > + the local OVSDB, corresponding to the VIF originating the > packet, if > + specified. The <code>Interface:external_ids:encap-ip</code> > setting > + of the VIF is also populated to the <code>Port_Binding</code> > + table in the OVN SB database via the <code>encap</code> column. > + Consequently, when a remote chassis needs to send a packet to a > + port-binding associated with this VIF, it utilizes the tunnel > with > + the appropriate <code>options:remote_ip</code> that matches the > + <code>ip</code> in <code>Port_Binding:encap</code>. This > mechanism > + is particularly beneficial for chassis with multiple physical > + interfaces designated for tunneling, where each interface is > + optimized for handling specific traffic associated with > particular > + VIFs. 
> + </p> > </dd> > > <dt><code>external_ids:ovn-encap-df_default</code></dt> > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > index 856e5e270822..4ab57fe970fe 100644 > --- a/controller/ovn-controller.c > +++ b/controller/ovn-controller.c > @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output { > struct physical_debug debug; > }; > > +static void > +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, > + size_t *n_encap_ips, const char * **encap_ips) > +{ > + const struct ovsrec_open_vswitch *cfg = > + ovsrec_open_vswitch_table_first(ovs_table); > + const char *chassis_id = get_ovs_chassis_id(ovs_table); > + const char *encap_ips_str = > + get_chassis_external_id_value(&cfg->external_ids, chassis_id, > + "ovn-encap-ip", NULL); > + struct sset encap_ip_set; > + sset_init(&encap_ip_set); > + sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); > + > + /* Sort the ips so that their index is deterministic. */ > + *encap_ips = sset_sort(&encap_ip_set); > + > + /* Copy the strings so that we can destroy the sset. */ > + for (size_t i = 0; (*encap_ips)[i]; i++) { > + (*encap_ips)[i] = xstrdup((*encap_ips)[i]); > + } > + *n_encap_ips = sset_count(&encap_ip_set); > + sset_destroy(&encap_ip_set); > +} > + > I wonder, could we store this array or maybe preferably svec in "en_non_vif_data" I-P node? This way it would be handled in the node and we could avoid the destroy calls after any pflow processing WDYT? 
> static void init_physical_ctx(struct engine_node *node, > struct ed_type_runtime_data *rt_data, > struct ed_type_non_vif_data *non_vif_data, > @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node > *node, > engine_get_input_data("ct_zones", node); > struct simap *ct_zones = &ct_zones_data->current; > > + parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips); > p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; > p_ctx->sbrec_port_binding_by_datapath = > sbrec_port_binding_by_datapath; > p_ctx->port_binding_table = port_binding_table; > @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct engine_node > *node, > p_ctx->patch_ofports = &non_vif_data->patch_ofports; > p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; > > + > + > > nit: Unrelated change. > struct controller_engine_ctx *ctrl_ctx = > engine_get_context()->client_ctx; > p_ctx->if_mgr = ctrl_ctx->if_mgr; > > pflow_output_get_debug(node, &p_ctx->debug); > } > > +static void > +destroy_physical_ctx(struct physical_ctx *p_ctx) > +{ > + for (size_t i = 0; i < p_ctx->n_encap_ips; i++) { > + free((char *)(p_ctx->encap_ips[i])); > + } > + free(p_ctx->encap_ips); > +} > + > static void * > en_pflow_output_init(struct engine_node *node OVS_UNUSED, > struct engine_arg *arg OVS_UNUSED) > @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, void > *data) > struct physical_ctx p_ctx; > init_physical_ctx(node, rt_data, non_vif_data, &p_ctx); > physical_run(&p_ctx, pflow_table); > + destroy_physical_ctx(&p_ctx); > > engine_set_node_state(node, EN_UPDATED); > } > @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct > engine_node *node, > bool removed = sbrec_port_binding_is_deleted(binding); > if (!physical_handle_flows_for_lport(binding, removed, > &p_ctx, > &pfo->flow_table)) { > + destroy_physical_ctx(&p_ctx); > return false; > } > } > @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct > engine_node *node, > bool 
removed = sbrec_port_binding_is_deleted(pb); > if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, > &pfo->flow_table)) { > + destroy_physical_ctx(&p_ctx); > return false; > } > } > engine_set_node_state(node, EN_UPDATED); > } > + destroy_physical_ctx(&p_ctx); > return true; > } > > @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct > engine_node *node, > bool removed = sbrec_port_binding_is_deleted(pb); > if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, > &pfo->flow_table)) { > + destroy_physical_ctx(&p_ctx); > return false; > } > } > > engine_set_node_state(node, EN_UPDATED); > + destroy_physical_ctx(&p_ctx); > return true; > } > > @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct > engine_node *node, void *data) > physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table); > > engine_set_node_state(node, EN_UPDATED); > + destroy_physical_ctx(&p_ctx); > return true; > } > > @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct engine_node > *node, void *data) > if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) { > /* Fall back to full recompute when a local datapath > * is added or deleted. */ > + destroy_physical_ctx(&p_ctx); > return false; > } > > @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct > engine_node *node, void *data) > lport->tracked_type == TRACKED_RESOURCE_REMOVED ? 
true: > false; > if (!physical_handle_flows_for_lport(lport->pb, removed, > &p_ctx, > &pfo->flow_table)) { > + destroy_physical_ctx(&p_ctx); > return false; > } > } > } > > engine_set_node_state(node, EN_UPDATED); > + destroy_physical_ctx(&p_ctx); > return true; > } > > @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct > engine_node *node, void *data) > if (pb) { > if (!physical_handle_flows_for_lport(pb, false, &p_ctx, > &pfo->flow_table)) { > + destroy_physical_ctx(&p_ctx); > return false; > } > tag_port_as_activated_in_engine(pp); > } > } > engine_set_node_state(node, EN_UPDATED); > + destroy_physical_ctx(&p_ctx); > return true; > } > > diff --git a/controller/physical.c b/controller/physical.c > index e93bfbd2cffb..c192aed751d5 100644 > --- a/controller/physical.c > +++ b/controller/physical.c > @@ -71,6 +71,8 @@ struct tunnel { > static void > load_logical_ingress_metadata(const struct sbrec_port_binding *binding, > const struct zone_ids *zone_ids, > + size_t n_encap_ips, > + const char **encap_ips, > struct ofpbuf *ofpacts_p); > static int64_t get_vxlan_port_key(int64_t port_key); > > @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf > *ofpacts) > */ > static struct chassis_tunnel * > get_port_binding_tun(const struct sbrec_encap *remote_encap, > - const struct sbrec_encap *local_encap, > const struct sbrec_chassis *chassis, > - const struct hmap *chassis_tunnels) > + const struct hmap *chassis_tunnels, > + const char *local_encap_ip) > { > struct chassis_tunnel *tun = NULL; > tun = chassis_tunnel_find(chassis_tunnels, chassis->name, > remote_encap ? remote_encap->ip : NULL, > - local_encap ? 
local_encap->ip : NULL); > + local_encap_ip); > > if (!tun) { > tun = chassis_tunnel_find(chassis_tunnels, chassis->name, NULL, > NULL); > @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct > sbrec_port_binding *pb, > static struct ovs_list * > get_remote_tunnels(const struct sbrec_port_binding *binding, > const struct sbrec_chassis *chassis, > - const struct hmap *chassis_tunnels) > + const struct hmap *chassis_tunnels, > + const char *local_encap_ip) > { > const struct chassis_tunnel *tun; > > @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding > *binding, > ovs_list_init(tunnels); > > if (binding->chassis && binding->chassis != chassis) { > - tun = get_port_binding_tun(binding->encap, NULL, binding->chassis, > - chassis_tunnels); > + tun = get_port_binding_tun(binding->encap, binding->chassis, > + chassis_tunnels, local_encap_ip); > if (!tun) { > static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); > VLOG_WARN_RL( > @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding > *binding, > } > const struct sbrec_encap *additional_encap; > additional_encap = find_additional_encap_for_chassis(binding, > chassis); > - tun = get_port_binding_tun(additional_encap, NULL, > + tun = get_port_binding_tun(additional_encap, > binding->additional_chassis[i], > - chassis_tunnels); > + chassis_tunnels, local_encap_ip); > if (!tun) { > static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); > VLOG_WARN_RL( > @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct > sbrec_port_binding *binding, > struct ofpbuf *ofpacts_p, > const struct sbrec_chassis *chassis, > const struct hmap *chassis_tunnels, > + size_t n_encap_ips, > + const char **encap_ips, > struct ovn_desired_flow_table > *flow_table) > { > /* Setup encapsulation */ > - struct ovs_list *tuns = get_remote_tunnels(binding, chassis, > - chassis_tunnels); > - if (!ovs_list_is_empty(tuns)) { > - bool is_vtep_port = !strcmp(binding->type, "vtep"); > - 
/* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for > ARP/ND > - * responder in L3 networks. */ > - if (is_vtep_port) { > - put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, > ofpacts_p); > - } > + for (size_t i = 0; i < n_encap_ips; i++) { > + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); > + > + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16, > + (uint32_t) 0xFFFF << 16); > + struct ovs_list *tuns = get_remote_tunnels(binding, chassis, > + chassis_tunnels, > + encap_ips[i]); > + if (!ovs_list_is_empty(tuns)) { > + bool is_vtep_port = !strcmp(binding->type, "vtep"); > + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for > ARP/ND > + * responder in L3 networks. */ > + if (is_vtep_port) { > + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, > + ofpacts_clone); > + } > > - struct tunnel *tun; > - LIST_FOR_EACH (tun, list_node, tuns) { > - put_encapsulation(mff_ovn_geneve, tun->tun, > - binding->datapath, port_key, is_vtep_port, > - ofpacts_p); > - ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport; > + struct tunnel *tun; > + LIST_FOR_EACH (tun, list_node, tuns) { > + put_encapsulation(mff_ovn_geneve, tun->tun, > + binding->datapath, port_key, > is_vtep_port, > + ofpacts_clone); > + ofpact_put_OUTPUT(ofpacts_clone)->port = tun->tun->ofport; > + } > + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); > + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, > + binding->header_.uuid.parts[0], match, > + ofpacts_clone, &binding->header_.uuid); > } > - put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p); > - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, > - binding->header_.uuid.parts[0], match, ofpacts_p, > - &binding->header_.uuid); > - } > - struct tunnel *tun_elem; > - LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { > - free(tun_elem); > + struct tunnel *tun_elem; > + LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { > + free(tun_elem); > + } > + free(tuns); > + > + ofpbuf_delete(ofpacts_clone); > } > - 
free(tuns); > } > > static void > @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap > *ct_zones, > if (tag) { > ofpact_put_STRIP_VLAN(ofpacts_p); > } > - load_logical_ingress_metadata(localnet_port, &zone_ids, > ofpacts_p); > + load_logical_ingress_metadata(localnet_port, &zone_ids, 0, NULL, > + ofpacts_p); > replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p); > replace_mac->mac = router_port_mac; > > @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids, > struct ofpbuf *ofpacts_p) > { > if (zone_ids) { > if (zone_ids->ct) { > - put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); > + put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); > } > if (zone_ids->dnat) { > put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); > @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key, > } > } > > +static size_t > +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips, > + const char *ip) > +{ > + for (size_t i = 0; i < n_encap_ips; i++) { > + if (!strcmp(ip, encap_ips[i])) { > + return i; > + } > + } > + return 0; > +} > + > static void > load_logical_ingress_metadata(const struct sbrec_port_binding *binding, > const struct zone_ids *zone_ids, > + size_t n_encap_ips, > + const char **encap_ips, > struct ofpbuf *ofpacts_p) > { > put_zones_ofpacts(zone_ids, ofpacts_p); > @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct > sbrec_port_binding *binding, > uint32_t port_key = binding->tunnel_key; > put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p); > put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p); > + > + /* Default encap_id 0. 
*/ > + size_t encap_id = 0; > + if (encap_ips && binding->encap) { > + encap_id = encap_ip_to_id(n_encap_ips, encap_ips, > + binding->encap->ip); > + } > + put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p); > } > > static const struct sbrec_port_binding * > @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct > sbrec_port_binding *binding, > match_set_dl_type(&match, htons(ETH_TYPE_RARP)); > match_set_in_port(&match, ofport); > > - load_logical_ingress_metadata(binding, zone_ids, &ofpacts); > + load_logical_ingress_metadata(binding, zone_ids, 0, NULL, &ofpacts); > > encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP, > NX_CTLR_NO_METER, &ofpacts); > @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports( > } > > struct ovs_list *tuns = get_remote_tunnels(binding, chassis, > - chassis_tunnels); > + chassis_tunnels, NULL); > if (ovs_list_is_empty(tuns)) { > free(tuns); > return; > @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > const struct sbrec_chassis *chassis, > const struct physical_debug *debug, > const struct if_status_mgr *if_mgr, > + size_t n_encap_ips, > + const char **encap_ips, > struct ovn_desired_flow_table *flow_table, > struct ofpbuf *ofpacts_p) > { > @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > ofpact_put_CT_CLEAR(ofpacts_p); > put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); > put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p); > - put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); > + put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); > struct zone_ids peer_zones = get_zone_ids(peer, ct_zones); > - load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p); > + load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips, > + encap_ips, ofpacts_p); > put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p); > put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p); > for (int i = 0; i < MFF_N_LOG_REGS; i++) { > @@ -1697,7 +1738,8 @@ 
consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > * as we're going to remove this with ofpbuf_pull() later. */ > uint32_t ofpacts_orig_size = ofpacts_p->size; > > - load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p); > + load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips, > + encap_ips, ofpacts_p); > > if (!strcmp(binding->type, "localport")) { > /* mark the packet as incoming from a localport */ > @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > } else { > put_remote_port_redirect_overlay( > binding, mff_ovn_geneve, port_key, &match, ofpacts_p, > - chassis, chassis_tunnels, flow_table); > + chassis, chassis_tunnels, n_encap_ips, encap_ips, flow_table); > } > out: > if (ha_ch_ordered) { > @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > > int zone_id = simap_get(ct_zones, port->logical_port); > if (zone_id) { > - put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts); > + put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts); > } > > const char *lport_name = (port->parent_port && > *port->parent_port) ? > @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx > *p_ctx, > p_ctx->patch_ofports, > p_ctx->chassis_tunnels, > pb, p_ctx->chassis, &p_ctx->debug, > - p_ctx->if_mgr, flow_table, &ofpacts); > + p_ctx->if_mgr, > + p_ctx->n_encap_ips, > + p_ctx->encap_ips, > + flow_table, &ofpacts); > ofpbuf_uninit(&ofpacts); > } > > @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx, > p_ctx->patch_ofports, > p_ctx->chassis_tunnels, binding, > p_ctx->chassis, &p_ctx->debug, > - p_ctx->if_mgr, flow_table, &ofpacts); > + p_ctx->if_mgr, > + p_ctx->n_encap_ips, > + p_ctx->encap_ips, > + flow_table, &ofpacts); > } > > /* Handle output to multicast groups, in tables 40 and 41. 
*/ > diff --git a/controller/physical.h b/controller/physical.h > index 1f1ed55efa16..7fe8ee3c18ed 100644 > --- a/controller/physical.h > +++ b/controller/physical.h > @@ -66,6 +66,8 @@ struct physical_ctx { > struct shash *local_bindings; > struct simap *patch_ofports; > struct hmap *chassis_tunnels; > + size_t n_encap_ips; > + const char **encap_ips; > struct physical_debug debug; > }; > > diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h > index 272277ec4fa0..d3455a462a0d 100644 > --- a/include/ovn/logical-fields.h > +++ b/include/ovn/logical-fields.h > @@ -37,7 +37,9 @@ enum ovn_controller_event { > #define MFF_LOG_SNAT_ZONE MFF_REG12 /* conntrack snat zone for gateway > router > * (32 bits). */ > #define MFF_LOG_CT_ZONE MFF_REG13 /* Logical conntrack zone for lports > - * (32 bits). */ > + * (0..15 of the 32 bits). */ > +#define MFF_LOG_ENCAP_ID MFF_REG13 /* Encap ID for lports > + * (16..31 of the 32 bits). */ > #define MFF_LOG_INPORT MFF_REG14 /* Logical input port (32 bits). */ > #define MFF_LOG_OUTPORT MFF_REG15 /* Logical output port (32 bits). */ > > diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml > index 96294fe2b795..bfd8680cedfc 100644 > --- a/ovn-architecture.7.xml > +++ b/ovn-architecture.7.xml > @@ -1247,8 +1247,8 @@ > chassis. This is initialized to 0 at the beginning of the logical > <!-- Keep the following in sync with MFF_LOG_CT_ZONE in > ovn/lib/logical-fields.h. --> > - ingress pipeline. OVN stores this in Open vSwitch extension > register > - number 13. > + ingress pipeline. OVN stores this in the lower 16 bits of the Open > + vSwitch extension register number 13. > </dd> > > <dt>conntrack zone fields for routers</dt> > @@ -1263,6 +1263,20 @@ > traffic (for SNATing) in Open vSwitch extension register number 12. 
> </dd> > > + <dt>Encap ID for logical ports</dt> > + <dd> > + A field that records an ID that indicates which encapsulation IP > should > + be used when sending packets to a remote chassis, according to the > + original input logical port. This is useful when there are multiple > IPs > + available for encapsulation. The value only has local significance > and is > + not meaningful between chassis. This is initialized to 0 at the > beginning > + of the logical > + <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in > + ovn/lib/logical-fields.h. --> > + ingress pipeline. OVN stores this in the higher 16 bits of the Open > + vSwitch extension register number 13. > + </dd> > + > <dt>logical flow flags</dt> > <dd> > The logical flags are intended to handle keeping context between > diff --git a/tests/ovn.at b/tests/ovn.at > index 243fe0b8246c..e7f0c9681f60 100644 > --- a/tests/ovn.at > +++ b/tests/ovn.at > @@ -30335,19 +30335,33 @@ AT_CLEANUP > > > OVN_FOR_EACH_NORTHD([ > -AT_SETUP([multiple encap ips tunnel creation]) > +AT_SETUP([multiple encap ips selection based on VIF's encap_ip]) > +AT_SKIP_IF([test $HAVE_SCAPY = no]) > ovn_start > net_add n1 > > +ovn-nbctl ls-add ls1 > + > # 2 HVs, each with 2 encap-ips. > +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY. > for i in 1 2; do > sim_add hv$i > as hv$i > ovs-vsctl add-br br-phys-$j > ovn_attach n1 br-phys-$j 192.168.0.${i}1 > ovs-vsctl set open . 
> external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 > + > + for j in 1 2; do > + ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ > + external_ids:iface-id=lsp$i$j \ > + external_ids:encap-ip=192.168.0.$i$j \ > + options:tx_pcap=hv$i/vif$i$j-tx.pcap > options:rxq_pcap=hv$i/vif$i$j-rx.pcap > + ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j > "f0:00:00:00:00:$i$j 10.0.0.$i$j" > + > + done > done > > +wait_for_ports_up > check ovn-nbctl --wait=hv sync > > check_tunnel_port() { > @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12 > %192.168.0.21 > check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22 > check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22 > > +vif_to_hv() { > + case $1 in dnl ( > + vif[[1]]?) echo hv1 ;; dnl ( > + vif[[2]]?) echo hv2 ;; dnl ( > + *) AT_FAIL_IF([:]) ;; > + esac > +} > + > +# check_packet_tunnel SRC_VIF DST_VIF > +# Verify that a packet from SRC_VIF to DST_VIF goes through the > corresponding > +# tunnel that matches the VIFs' encap_ip configurations. 
> +check_packet_tunnel() { > + local src=$1 dst=$2 > + local src_mac=f0:00:00:00:00:$src > + local dst_mac=f0:00:00:00:00:$dst > + local src_ip=10.0.0.$src > + local dst_ip=10.0.0.$dst > + local local_encap_ip=192.168.0.$src > + local remote_encap_ip=192.168.0.$dst > + local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ > + IP(dst='${dst_ip}', src='${src_ip}')/ \ > + ICMP(type=8)") > + hv=`vif_to_hv vif$src` > + as $hv > + echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> > $remote_encap_ip" > + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > + AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src > $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) > +} > + > +for i in 1 2; do > + for j in 1 2; do > + check_packet_tunnel 1$i 2$j > + done > +done > + > OVN_CLEANUP([hv1],[hv2]) > AT_CLEANUP > ]) > -- > 2.38.1 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > > Thanks, Ales
On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote: > > > > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote: >> >> Commit dd527a283cd8 partially supported multiple encap IPs. It supported >> remote encap IP selection based on the destination VIF's encap_ip >> configuration. This patch adds the support for selecting local encap IP >> based on the source VIF's encap_ip configuration. >> >> Co-authored-by: Lei Huang <leih@nvidia.com> >> Signed-off-by: Lei Huang <leih@nvidia.com> >> Signed-off-by: Han Zhou <hzhou@ovn.org> >> --- > > > Hi Han and Lei, > > thank you for the patch, I have a couple of comments/questions down below. Thanks Ales. > > >> NEWS | 3 + >> controller/chassis.c | 2 +- >> controller/local_data.c | 2 +- >> controller/local_data.h | 2 +- >> controller/ovn-controller.8.xml | 30 ++++++- >> controller/ovn-controller.c | 49 ++++++++++++ >> controller/physical.c | 134 ++++++++++++++++++++++---------- >> controller/physical.h | 2 + >> include/ovn/logical-fields.h | 4 +- >> ovn-architecture.7.xml | 18 ++++- >> tests/ovn.at | 51 +++++++++++- >> 11 files changed, 243 insertions(+), 54 deletions(-) >> >> diff --git a/NEWS b/NEWS >> index 5f267b4c64cc..5a3eed608617 100644 >> --- a/NEWS >> +++ b/NEWS >> @@ -14,6 +14,9 @@ Post v23.09.0 >> - ovn-northd-ddlog has been removed. >> - A new LSP option "enable_router_port_acl" has been added to enable >> conntrack for the router port whose peer is l3dgw_port if set it true. >> + - Support selecting encapsulation IP based on the source/destination VIF's >> + settting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for more >> + details. >> >> OVN v23.09.0 - 15 Sep 2023 >> -------------------------- >> diff --git a/controller/chassis.c b/controller/chassis.c >> index a6f13ccc42d5..55f2beb37674 100644 >> --- a/controller/chassis.c >> +++ b/controller/chassis.c >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg { >> >> /* Set of encap types parsed from the 'ovn-encap-type' external-id. 
*/ >> struct sset encap_type_set; >> - /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. */ >> + /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */ >> struct sset encap_ip_set; >> /* Interface type list formatted in the OVN-SB Chassis required format. */ >> struct ds iface_types; >> diff --git a/controller/local_data.c b/controller/local_data.c >> index a9092783958f..8606414f8728 100644 >> --- a/controller/local_data.c >> +++ b/controller/local_data.c >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap *chassis_tunnels) >> */ >> struct chassis_tunnel * >> chassis_tunnel_find(const struct hmap *chassis_tunnels, const char *chassis_id, >> - char *remote_encap_ip, char *local_encap_ip) >> + char *remote_encap_ip, const char *local_encap_ip) > > > nit: Unrelated change. Ack > > >> { >> /* >> * If the specific encap_ip is given, look for the chassisid_ip entry, >> diff --git a/controller/local_data.h b/controller/local_data.h >> index bab95bcc3824..ca3905bd20e6 100644 >> --- a/controller/local_data.h >> +++ b/controller/local_data.h >> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes( >> struct chassis_tunnel *chassis_tunnel_find(const struct hmap *chassis_tunnels, >> const char *chassis_id, >> char *remote_encap_ip, >> - char *local_encap_ip); >> + const char *local_encap_ip); > > > Same as above. Ack > > >> >> bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels, >> const char *chassis_name, >> diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml >> index efa65e3fd927..5ebef048d721 100644 >> --- a/controller/ovn-controller.8.xml >> +++ b/controller/ovn-controller.8.xml >> @@ -176,10 +176,32 @@ >> >> <dt><code>external_ids:ovn-encap-ip</code></dt> >> <dd> >> - The IP address that a chassis should use to connect to this node >> - using encapsulation types specified by >> - <code>external_ids:ovn-encap-type</code>. 
Multiple encapsulation IPs >> - may be specified with a comma-separated list. >> + <p> >> + The IP address that a chassis should use to connect to this node >> + using encapsulation types specified by >> + <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs >> + may be specified with a comma-separated list. >> + </p> >> + <p> >> + In scenarios where multiple encapsulation IPs are present, distinct >> + tunnels are established for each remote chassis. These tunnels are >> + differentiated by setting unique <code>options:local_ip</code> and >> + <code>options:remote_ip</code> values in the tunnel interface. When >> + transmitting a packet to a remote chassis, the selection of local_ip >> + is guided by the <code>Interface:external_ids:encap-ip</code> from >> + the local OVSDB, corresponding to the VIF originating the packet, if >> + specified. The <code>Interface:external_ids:encap-ip</code> setting >> + of the VIF is also populated to the <code>Port_Binding</code> >> + table in the OVN SB database via the <code>encap</code> column. >> + Consequently, when a remote chassis needs to send a packet to a >> + port-binding associated with this VIF, it utilizes the tunnel with >> + the appropriate <code>options:remote_ip</code> that matches the >> + <code>ip</code> in <code>Port_Binding:encap</code>. This mechanism >> + is particularly beneficial for chassis with multiple physical >> + interfaces designated for tunneling, where each interface is >> + optimized for handling specific traffic associated with particular >> + VIFs. 
>> + </p> >> </dd> >> >> <dt><code>external_ids:ovn-encap-df_default</code></dt> >> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c >> index 856e5e270822..4ab57fe970fe 100644 >> --- a/controller/ovn-controller.c >> +++ b/controller/ovn-controller.c >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output { >> struct physical_debug debug; >> }; >> >> +static void >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, >> + size_t *n_encap_ips, const char * **encap_ips) >> +{ >> + const struct ovsrec_open_vswitch *cfg = >> + ovsrec_open_vswitch_table_first(ovs_table); >> + const char *chassis_id = get_ovs_chassis_id(ovs_table); >> + const char *encap_ips_str = >> + get_chassis_external_id_value(&cfg->external_ids, chassis_id, >> + "ovn-encap-ip", NULL); >> + struct sset encap_ip_set; >> + sset_init(&encap_ip_set); >> + sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); >> + >> + /* Sort the ips so that their index is deterministic. */ >> + *encap_ips = sset_sort(&encap_ip_set); >> + >> + /* Copy the strings so that we can destroy the sset. */ >> + for (size_t i = 0; (*encap_ips)[i]; i++) { >> + (*encap_ips)[i] = xstrdup((*encap_ips)[i]); >> + } >> + *n_encap_ips = sset_count(&encap_ip_set); >> + sset_destroy(&encap_ip_set); >> +} >> + > > > I wonder, could we store this array or maybe preferably svec in "en_non_vif_data" I-P node? This way it would be handled in the node and we could avoid the destroy calls after any pflow processing WDYT? Yes we could do the same in en_non_vif_data, but I think it is not really necessary and it seems more straightforward just parsing it here, because: 1. We don't need I-P for this ovn-encap-ip configuration change, so we don't have to persist this data. 2. It should be very rare to have more than 5 (or even 3) encap IPs per node, so the list should be very small and the cost of this parsing (and sset construct/destroy) is negligible. 
Using svec is also a valid option, but the sset (and array) is used here just to reuse the sset_from_delimited_string and sset_sort for convenience. I noticed that the sset_init() call can in fact be removed because sset_from_delimited_string already does that. I can remove it. Does this sound reasonable to you? Thanks, Han > >> >> static void init_physical_ctx(struct engine_node *node, >> struct ed_type_runtime_data *rt_data, >> struct ed_type_non_vif_data *non_vif_data, >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node *node, >> engine_get_input_data("ct_zones", node); >> struct simap *ct_zones = &ct_zones_data->current; >> >> + parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips); >> p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; >> p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath; >> p_ctx->port_binding_table = port_binding_table; >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct engine_node *node, >> p_ctx->patch_ofports = &non_vif_data->patch_ofports; >> p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; >> >> + >> + >> > > nit: Unrelated change. 
Ack > > >> >> struct controller_engine_ctx *ctrl_ctx = engine_get_context()->client_ctx; >> p_ctx->if_mgr = ctrl_ctx->if_mgr; >> >> pflow_output_get_debug(node, &p_ctx->debug); >> } >> >> +static void >> +destroy_physical_ctx(struct physical_ctx *p_ctx) >> +{ >> + for (size_t i = 0; i < p_ctx->n_encap_ips; i++) { >> + free((char *)(p_ctx->encap_ips[i])); >> + } >> + free(p_ctx->encap_ips); >> +} >> + >> static void * >> en_pflow_output_init(struct engine_node *node OVS_UNUSED, >> struct engine_arg *arg OVS_UNUSED) >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, void *data) >> struct physical_ctx p_ctx; >> init_physical_ctx(node, rt_data, non_vif_data, &p_ctx); >> physical_run(&p_ctx, pflow_table); >> + destroy_physical_ctx(&p_ctx); >> >> engine_set_node_state(node, EN_UPDATED); >> } >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct engine_node *node, >> bool removed = sbrec_port_binding_is_deleted(binding); >> if (!physical_handle_flows_for_lport(binding, removed, &p_ctx, >> &pfo->flow_table)) { >> + destroy_physical_ctx(&p_ctx); >> return false; >> } >> } >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct engine_node *node, >> bool removed = sbrec_port_binding_is_deleted(pb); >> if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, >> &pfo->flow_table)) { >> + destroy_physical_ctx(&p_ctx); >> return false; >> } >> } >> engine_set_node_state(node, EN_UPDATED); >> } >> + destroy_physical_ctx(&p_ctx); >> return true; >> } >> >> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct engine_node *node, >> bool removed = sbrec_port_binding_is_deleted(pb); >> if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, >> &pfo->flow_table)) { >> + destroy_physical_ctx(&p_ctx); >> return false; >> } >> } >> >> engine_set_node_state(node, EN_UPDATED); >> + destroy_physical_ctx(&p_ctx); >> return true; >> } >> >> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct engine_node *node, 
void *data) >> physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table); >> >> engine_set_node_state(node, EN_UPDATED); >> + destroy_physical_ctx(&p_ctx); >> return true; >> } >> >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct engine_node *node, void *data) >> if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) { >> /* Fall back to full recompute when a local datapath >> * is added or deleted. */ >> + destroy_physical_ctx(&p_ctx); >> return false; >> } >> >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct engine_node *node, void *data) >> lport->tracked_type == TRACKED_RESOURCE_REMOVED ? true: false; >> if (!physical_handle_flows_for_lport(lport->pb, removed, &p_ctx, >> &pfo->flow_table)) { >> + destroy_physical_ctx(&p_ctx); >> return false; >> } >> } >> } >> >> engine_set_node_state(node, EN_UPDATED); >> + destroy_physical_ctx(&p_ctx); >> return true; >> } >> >> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct engine_node *node, void *data) >> if (pb) { >> if (!physical_handle_flows_for_lport(pb, false, &p_ctx, >> &pfo->flow_table)) { >> + destroy_physical_ctx(&p_ctx); >> return false; >> } >> tag_port_as_activated_in_engine(pp); >> } >> } >> engine_set_node_state(node, EN_UPDATED); >> + destroy_physical_ctx(&p_ctx); >> return true; >> } >> >> diff --git a/controller/physical.c b/controller/physical.c >> index e93bfbd2cffb..c192aed751d5 100644 >> --- a/controller/physical.c >> +++ b/controller/physical.c >> @@ -71,6 +71,8 @@ struct tunnel { >> static void >> load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> const struct zone_ids *zone_ids, >> + size_t n_encap_ips, >> + const char **encap_ips, >> struct ofpbuf *ofpacts_p); >> static int64_t get_vxlan_port_key(int64_t port_key); >> >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf *ofpacts) >> */ >> static struct chassis_tunnel * >> get_port_binding_tun(const struct sbrec_encap *remote_encap, >> - const struct 
sbrec_encap *local_encap, >> const struct sbrec_chassis *chassis, >> - const struct hmap *chassis_tunnels) >> + const struct hmap *chassis_tunnels, >> + const char *local_encap_ip) >> { >> struct chassis_tunnel *tun = NULL; >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, >> remote_encap ? remote_encap->ip : NULL, >> - local_encap ? local_encap->ip : NULL); >> + local_encap_ip); >> >> if (!tun) { >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, NULL, NULL); >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct sbrec_port_binding *pb, >> static struct ovs_list * >> get_remote_tunnels(const struct sbrec_port_binding *binding, >> const struct sbrec_chassis *chassis, >> - const struct hmap *chassis_tunnels) >> + const struct hmap *chassis_tunnels, >> + const char *local_encap_ip) >> { >> const struct chassis_tunnel *tun; >> >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, >> ovs_list_init(tunnels); >> >> if (binding->chassis && binding->chassis != chassis) { >> - tun = get_port_binding_tun(binding->encap, NULL, binding->chassis, >> - chassis_tunnels); >> + tun = get_port_binding_tun(binding->encap, binding->chassis, >> + chassis_tunnels, local_encap_ip); >> if (!tun) { >> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); >> VLOG_WARN_RL( >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, >> } >> const struct sbrec_encap *additional_encap; >> additional_encap = find_additional_encap_for_chassis(binding, chassis); >> - tun = get_port_binding_tun(additional_encap, NULL, >> + tun = get_port_binding_tun(additional_encap, >> binding->additional_chassis[i], >> - chassis_tunnels); >> + chassis_tunnels, local_encap_ip); >> if (!tun) { >> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); >> VLOG_WARN_RL( >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct sbrec_port_binding *binding, >> struct ofpbuf *ofpacts_p, >> const struct 
sbrec_chassis *chassis, >> const struct hmap *chassis_tunnels, >> + size_t n_encap_ips, >> + const char **encap_ips, >> struct ovn_desired_flow_table *flow_table) >> { >> /* Setup encapsulation */ >> - struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> - chassis_tunnels); >> - if (!ovs_list_is_empty(tuns)) { >> - bool is_vtep_port = !strcmp(binding->type, "vtep"); >> - /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND >> - * responder in L3 networks. */ >> - if (is_vtep_port) { >> - put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_p); >> - } >> + for (size_t i = 0; i < n_encap_ips; i++) { >> + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); >> >> + >> + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16, >> + (uint32_t) 0xFFFF << 16); >> + struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> + chassis_tunnels, >> + encap_ips[i]); >> + if (!ovs_list_is_empty(tuns)) { >> + bool is_vtep_port = !strcmp(binding->type, "vtep"); >> + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND >> + * responder in L3 networks. 
*/ >> + if (is_vtep_port) { >> + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, >> + ofpacts_clone); >> + } >> >> - struct tunnel *tun; >> - LIST_FOR_EACH (tun, list_node, tuns) { >> - put_encapsulation(mff_ovn_geneve, tun->tun, >> - binding->datapath, port_key, is_vtep_port, >> - ofpacts_p); >> - ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport; >> + struct tunnel *tun; >> + LIST_FOR_EACH (tun, list_node, tuns) { >> + put_encapsulation(mff_ovn_geneve, tun->tun, >> + binding->datapath, port_key, is_vtep_port, >> + ofpacts_clone); >> + ofpact_put_OUTPUT(ofpacts_clone)->port = tun->tun->ofport; >> + } >> + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); >> + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, >> + binding->header_.uuid.parts[0], match, >> + ofpacts_clone, &binding->header_.uuid); >> } >> - put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p); >> - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, >> - binding->header_.uuid.parts[0], match, ofpacts_p, >> - &binding->header_.uuid); >> - } >> - struct tunnel *tun_elem; >> - LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { >> - free(tun_elem); >> + struct tunnel *tun_elem; >> + LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { >> + free(tun_elem); >> + } >> + free(tuns); >> + >> + ofpbuf_delete(ofpacts_clone); >> } >> - free(tuns); >> } >> >> static void >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap *ct_zones, >> if (tag) { >> ofpact_put_STRIP_VLAN(ofpacts_p); >> } >> - load_logical_ingress_metadata(localnet_port, &zone_ids, ofpacts_p); >> + load_logical_ingress_metadata(localnet_port, &zone_ids, 0, NULL, >> + ofpacts_p); >> replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p); >> replace_mac->mac = router_port_mac; >> >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids, struct ofpbuf *ofpacts_p) >> { >> if (zone_ids) { >> if (zone_ids->ct) { >> - put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); >> + put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 
0, 16, ofpacts_p); >> } >> if (zone_ids->dnat) { >> put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key, >> } >> } >> >> +static size_t >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips, >> + const char *ip) >> +{ >> + for (size_t i = 0; i < n_encap_ips; i++) { >> + if (!strcmp(ip, encap_ips[i])) { >> + return i; >> + } >> + } >> + return 0; >> +} >> + >> static void >> load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> const struct zone_ids *zone_ids, >> + size_t n_encap_ips, >> + const char **encap_ips, >> struct ofpbuf *ofpacts_p) >> { >> put_zones_ofpacts(zone_ids, ofpacts_p); >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> uint32_t port_key = binding->tunnel_key; >> put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p); >> put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p); >> + >> + /* Default encap_id 0. */ >> + size_t encap_id = 0; >> + if (encap_ips && binding->encap) { >> + encap_id = encap_ip_to_id(n_encap_ips, encap_ips, >> + binding->encap->ip); >> + } >> + put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p); >> } >> >> static const struct sbrec_port_binding * >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct sbrec_port_binding *binding, >> match_set_dl_type(&match, htons(ETH_TYPE_RARP)); >> match_set_in_port(&match, ofport); >> >> - load_logical_ingress_metadata(binding, zone_ids, &ofpacts); >> + load_logical_ingress_metadata(binding, zone_ids, 0, NULL, &ofpacts); >> >> encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP, >> NX_CTLR_NO_METER, &ofpacts); >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports( >> } >> >> struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> - chassis_tunnels); >> + chassis_tunnels, NULL); >> if (ovs_list_is_empty(tuns)) { >> free(tuns); >> return; >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct 
ovsdb_idl_index *sbrec_port_binding_by_name, >> const struct sbrec_chassis *chassis, >> const struct physical_debug *debug, >> const struct if_status_mgr *if_mgr, >> + size_t n_encap_ips, >> + const char **encap_ips, >> struct ovn_desired_flow_table *flow_table, >> struct ofpbuf *ofpacts_p) >> { >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> ofpact_put_CT_CLEAR(ofpacts_p); >> put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); >> put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p); >> - put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); >> + put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); >> struct zone_ids peer_zones = get_zone_ids(peer, ct_zones); >> - load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p); >> + load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips, >> + encap_ips, ofpacts_p); >> put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p); >> put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p); >> for (int i = 0; i < MFF_N_LOG_REGS; i++) { >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> * as we're going to remove this with ofpbuf_pull() later. 
*/ >> uint32_t ofpacts_orig_size = ofpacts_p->size; >> >> - load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p); >> + load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips, >> + encap_ips, ofpacts_p); >> >> if (!strcmp(binding->type, "localport")) { >> /* mark the packet as incoming from a localport */ >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> } else { >> put_remote_port_redirect_overlay( >> binding, mff_ovn_geneve, port_key, &match, ofpacts_p, >> - chassis, chassis_tunnels, flow_table); >> + chassis, chassis_tunnels, n_encap_ips, encap_ips, flow_table); >> } >> out: >> if (ha_ch_ordered) { >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> int zone_id = simap_get(ct_zones, port->logical_port); >> if (zone_id) { >> - put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts); >> + put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts); >> } >> >> const char *lport_name = (port->parent_port && *port->parent_port) ? >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx *p_ctx, >> p_ctx->patch_ofports, >> p_ctx->chassis_tunnels, >> pb, p_ctx->chassis, &p_ctx->debug, >> - p_ctx->if_mgr, flow_table, &ofpacts); >> + p_ctx->if_mgr, >> + p_ctx->n_encap_ips, >> + p_ctx->encap_ips, >> + flow_table, &ofpacts); >> ofpbuf_uninit(&ofpacts); >> } >> >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx, >> p_ctx->patch_ofports, >> p_ctx->chassis_tunnels, binding, >> p_ctx->chassis, &p_ctx->debug, >> - p_ctx->if_mgr, flow_table, &ofpacts); >> + p_ctx->if_mgr, >> + p_ctx->n_encap_ips, >> + p_ctx->encap_ips, >> + flow_table, &ofpacts); >> } >> >> /* Handle output to multicast groups, in tables 40 and 41. 
*/ >> diff --git a/controller/physical.h b/controller/physical.h >> index 1f1ed55efa16..7fe8ee3c18ed 100644 >> --- a/controller/physical.h >> +++ b/controller/physical.h >> @@ -66,6 +66,8 @@ struct physical_ctx { >> struct shash *local_bindings; >> struct simap *patch_ofports; >> struct hmap *chassis_tunnels; >> + size_t n_encap_ips; >> + const char **encap_ips; >> struct physical_debug debug; >> }; >> >> diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h >> index 272277ec4fa0..d3455a462a0d 100644 >> --- a/include/ovn/logical-fields.h >> +++ b/include/ovn/logical-fields.h >> @@ -37,7 +37,9 @@ enum ovn_controller_event { >> #define MFF_LOG_SNAT_ZONE MFF_REG12 /* conntrack snat zone for gateway router >> * (32 bits). */ >> #define MFF_LOG_CT_ZONE MFF_REG13 /* Logical conntrack zone for lports >> - * (32 bits). */ >> + * (0..15 of the 32 bits). */ >> +#define MFF_LOG_ENCAP_ID MFF_REG13 /* Encap ID for lports >> + * (16..31 of the 32 bits). */ >> #define MFF_LOG_INPORT MFF_REG14 /* Logical input port (32 bits). */ >> #define MFF_LOG_OUTPORT MFF_REG15 /* Logical output port (32 bits). */ >> >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml >> index 96294fe2b795..bfd8680cedfc 100644 >> --- a/ovn-architecture.7.xml >> +++ b/ovn-architecture.7.xml >> @@ -1247,8 +1247,8 @@ >> chassis. This is initialized to 0 at the beginning of the logical >> <!-- Keep the following in sync with MFF_LOG_CT_ZONE in >> ovn/lib/logical-fields.h. --> >> - ingress pipeline. OVN stores this in Open vSwitch extension register >> - number 13. >> + ingress pipeline. OVN stores this in the lower 16 bits of the Open >> + vSwitch extension register number 13. >> </dd> >> >> <dt>conntrack zone fields for routers</dt> >> @@ -1263,6 +1263,20 @@ >> traffic (for SNATing) in Open vSwitch extension register number 12. 
>> </dd> >> >> + <dt>Encap ID for logical ports</dt> >> + <dd> >> + A field that records an ID that indicates which encapsulation IP should >> + be used when sending packets to a remote chassis, according to the >> + original input logical port. This is useful when there are multiple IPs >> + available for encapsulation. The value only has local significance and is >> + not meaningful between chassis. This is initialized to 0 at the beginning >> + of the logical >> + <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in >> + ovn/lib/logical-fields.h. --> >> + ingress pipeline. OVN stores this in the higher 16 bits of the Open >> + vSwitch extension register number 13. >> + </dd> >> + >> <dt>logical flow flags</dt> >> <dd> >> The logical flags are intended to handle keeping context between >> diff --git a/tests/ovn.at b/tests/ovn.at >> index 243fe0b8246c..e7f0c9681f60 100644 >> --- a/tests/ovn.at >> +++ b/tests/ovn.at >> @@ -30335,19 +30335,33 @@ AT_CLEANUP >> >> >> OVN_FOR_EACH_NORTHD([ >> -AT_SETUP([multiple encap ips tunnel creation]) >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip]) >> +AT_SKIP_IF([test $HAVE_SCAPY = no]) >> ovn_start >> net_add n1 >> >> +ovn-nbctl ls-add ls1 >> + >> # 2 HVs, each with 2 encap-ips. >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY. >> for i in 1 2; do >> sim_add hv$i >> as hv$i >> ovs-vsctl add-br br-phys-$j >> ovn_attach n1 br-phys-$j 192.168.0.${i}1 >> ovs-vsctl set open . 
external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 >> + >> + for j in 1 2; do >> + ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ >> + external_ids:iface-id=lsp$i$j \ >> + external_ids:encap-ip=192.168.0.$i$j \ >> + options:tx_pcap=hv$i/vif$i$j-tx.pcap options:rxq_pcap=hv$i/vif$i$j-rx.pcap >> + ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j "f0:00:00:00:00:$i$j 10.0.0.$i$j" >> + >> + done >> done >> >> +wait_for_ports_up >> check ovn-nbctl --wait=hv sync >> >> check_tunnel_port() { >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12 %192.168.0.21 >> check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22 >> check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22 >> >> +vif_to_hv() { >> + case $1 in dnl ( >> + vif[[1]]?) echo hv1 ;; dnl ( >> + vif[[2]]?) echo hv2 ;; dnl ( >> + *) AT_FAIL_IF([:]) ;; >> + esac >> +} >> + >> +# check_packet_tunnel SRC_VIF DST_VIF >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the corresponding >> +# tunnel that matches the VIFs' encap_ip configurations. 
>> +check_packet_tunnel() { >> + local src=$1 dst=$2 >> + local src_mac=f0:00:00:00:00:$src >> + local dst_mac=f0:00:00:00:00:$dst >> + local src_ip=10.0.0.$src >> + local dst_ip=10.0.0.$dst >> + local local_encap_ip=192.168.0.$src >> + local remote_encap_ip=192.168.0.$dst >> + local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ >> + IP(dst='${dst_ip}', src='${src_ip}')/ \ >> + ICMP(type=8)") >> + hv=`vif_to_hv vif$src` >> + as $hv >> + echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> $remote_encap_ip" >> + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) >> + AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) >> +} >> + >> +for i in 1 2; do >> + for j in 1 2; do >> + check_packet_tunnel 1$i 2$j >> + done >> +done >> + >> OVN_CLEANUP([hv1],[hv2]) >> AT_CLEANUP >> ]) >> -- >> 2.38.1 >> >> _______________________________________________ >> dev mailing list >> dev@openvswitch.org >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >> > > Thanks, > Ales > > -- > > Ales Musil > > Senior Software Engineer - OVN Core > > Red Hat EMEA > > amusil@redhat.com
On Fri, Jan 26, 2024 at 4:07 AM Han Zhou <hzhou@ovn.org> wrote: > > > On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote: > > > > > > > > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote: > >> > >> Commit dd527a283cd8 partially supported multiple encap IPs. It supported > >> remote encap IP selection based on the destination VIF's encap_ip > >> configuration. This patch adds the support for selecting local encap IP > >> based on the source VIF's encap_ip configuration. > >> > >> Co-authored-by: Lei Huang <leih@nvidia.com> > >> Signed-off-by: Lei Huang <leih@nvidia.com> > >> Signed-off-by: Han Zhou <hzhou@ovn.org> > >> --- > > > > > > Hi Han and Lei, > > > > thank you for the patch, I have a couple of comments/questions down > below. > > > Thanks Ales. > > > > > > >> NEWS | 3 + > >> controller/chassis.c | 2 +- > >> controller/local_data.c | 2 +- > >> controller/local_data.h | 2 +- > >> controller/ovn-controller.8.xml | 30 ++++++- > >> controller/ovn-controller.c | 49 ++++++++++++ > >> controller/physical.c | 134 ++++++++++++++++++++++---------- > >> controller/physical.h | 2 + > >> include/ovn/logical-fields.h | 4 +- > >> ovn-architecture.7.xml | 18 ++++- > >> tests/ovn.at | 51 +++++++++++- > >> 11 files changed, 243 insertions(+), 54 deletions(-) > >> > >> diff --git a/NEWS b/NEWS > >> index 5f267b4c64cc..5a3eed608617 100644 > >> --- a/NEWS > >> +++ b/NEWS > >> @@ -14,6 +14,9 @@ Post v23.09.0 > >> - ovn-northd-ddlog has been removed. > >> - A new LSP option "enable_router_port_acl" has been added to enable > >> conntrack for the router port whose peer is l3dgw_port if set it > true. > >> + - Support selecting encapsulation IP based on the source/destination > VIF's > >> + setting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for > more > >> + details. 
> >> > >> OVN v23.09.0 - 15 Sep 2023 > >> -------------------------- > >> diff --git a/controller/chassis.c b/controller/chassis.c > >> index a6f13ccc42d5..55f2beb37674 100644 > >> --- a/controller/chassis.c > >> +++ b/controller/chassis.c > >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg { > >> > >> /* Set of encap types parsed from the 'ovn-encap-type' > external-id. */ > >> struct sset encap_type_set; > >> - /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. > */ > >> + /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */ > >> struct sset encap_ip_set; > >> /* Interface type list formatted in the OVN-SB Chassis required > format. */ > >> struct ds iface_types; > >> diff --git a/controller/local_data.c b/controller/local_data.c > >> index a9092783958f..8606414f8728 100644 > >> --- a/controller/local_data.c > >> +++ b/controller/local_data.c > >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap > *chassis_tunnels) > >> */ > >> struct chassis_tunnel * > >> chassis_tunnel_find(const struct hmap *chassis_tunnels, const char > *chassis_id, > >> - char *remote_encap_ip, char *local_encap_ip) > >> + char *remote_encap_ip, const char *local_encap_ip) > > > > > > nit: Unrelated change. > > > Ack > > > > > > >> { > >> /* > >> * If the specific encap_ip is given, look for the chassisid_ip > entry, > >> diff --git a/controller/local_data.h b/controller/local_data.h > >> index bab95bcc3824..ca3905bd20e6 100644 > >> --- a/controller/local_data.h > >> +++ b/controller/local_data.h > >> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes( > >> struct chassis_tunnel *chassis_tunnel_find(const struct hmap > *chassis_tunnels, > >> const char *chassis_id, > >> char *remote_encap_ip, > >> - char *local_encap_ip); > >> + const char *local_encap_ip); > > > > > > Same as above. 
> > > Ack > > > > > > >> > >> bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels, > >> const char *chassis_name, > >> diff --git a/controller/ovn-controller.8.xml > b/controller/ovn-controller.8.xml > >> index efa65e3fd927..5ebef048d721 100644 > >> --- a/controller/ovn-controller.8.xml > >> +++ b/controller/ovn-controller.8.xml > >> @@ -176,10 +176,32 @@ > >> > >> <dt><code>external_ids:ovn-encap-ip</code></dt> > >> <dd> > >> - The IP address that a chassis should use to connect to this > node > >> - using encapsulation types specified by > >> - <code>external_ids:ovn-encap-type</code>. Multiple > encapsulation IPs > >> - may be specified with a comma-separated list. > >> + <p> > >> + The IP address that a chassis should use to connect to this > node > >> + using encapsulation types specified by > >> + <code>external_ids:ovn-encap-type</code>. Multiple > encapsulation IPs > >> + may be specified with a comma-separated list. > >> + </p> > >> + <p> > >> + In scenarios where multiple encapsulation IPs are present, > distinct > >> + tunnels are established for each remote chassis. These > tunnels are > >> + differentiated by setting unique > <code>options:local_ip</code> and > >> + <code>options:remote_ip</code> values in the tunnel > interface. When > >> + transmitting a packet to a remote chassis, the selection of > local_ip > >> + is guided by the > <code>Interface:external_ids:encap-ip</code> from > >> + the local OVSDB, corresponding to the VIF originating the > packet, if > >> + specified. The <code>Interface:external_ids:encap-ip</code> > setting > >> + of the VIF is also populated to the <code>Port_Binding</code> > >> + table in the OVN SB database via the <code>encap</code> > column. 
> >> + Consequently, when a remote chassis needs to send a packet > to a > >> + port-binding associated with this VIF, it utilizes the > tunnel with > >> + the appropriate <code>options:remote_ip</code> that matches > the > >> + <code>ip</code> in <code>Port_Binding:encap</code>. This > mechanism > >> + is particularly beneficial for chassis with multiple physical > >> + interfaces designated for tunneling, where each interface is > >> + optimized for handling specific traffic associated with > particular > >> + VIFs. > >> + </p> > >> </dd> > >> > >> <dt><code>external_ids:ovn-encap-df_default</code></dt> > >> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > >> index 856e5e270822..4ab57fe970fe 100644 > >> --- a/controller/ovn-controller.c > >> +++ b/controller/ovn-controller.c > >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output { > >> struct physical_debug debug; > >> }; > >> > >> +static void > >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, > >> + size_t *n_encap_ips, const char * **encap_ips) > >> +{ > >> + const struct ovsrec_open_vswitch *cfg = > >> + ovsrec_open_vswitch_table_first(ovs_table); > >> + const char *chassis_id = get_ovs_chassis_id(ovs_table); > >> + const char *encap_ips_str = > >> + get_chassis_external_id_value(&cfg->external_ids, chassis_id, > >> + "ovn-encap-ip", NULL); > >> + struct sset encap_ip_set; > >> + sset_init(&encap_ip_set); > >> + sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); > >> + > >> + /* Sort the ips so that their index is deterministic. */ > >> + *encap_ips = sset_sort(&encap_ip_set); > >> + > >> + /* Copy the strings so that we can destroy the sset. 
*/ > >> + for (size_t i = 0; (*encap_ips)[i]; i++) { > >> + (*encap_ips)[i] = xstrdup((*encap_ips)[i]); > >> + } > >> + *n_encap_ips = sset_count(&encap_ip_set); > >> + sset_destroy(&encap_ip_set); > >> +} > >> + > > > > > > I wonder, could we store this array or maybe preferably svec in > "en_non_vif_data" I-P node? This way it would be handled in the node and we > could avoid the destroy calls after any pflow processing WDYT? > > > Yes we could do the same in en_non_vif_data, but I think it is not really > necessary and it seems more straightforward just parsing it here, because: > 1. We don't need I-P for this ovn-encap-ip configuration change, so we > don't have to persist this data. > 2. It should be very rare to have more than 5 (or even 3) encap IPs per > node, so the list should be very small and the cost of this parsing (and > sset construct/destroy) is negligible. > Using svec is also a valid option, but the sset (and array) is used here > just to reuse the sset_from_delimited_string and sset_sort for convenience. > I noticed that the sset_init() call can in fact be removed because > sset_from_delimited_string already does that. I can remove it. > Does this sound reasonable to you? > It makes sense, the main thing it would help with is the need to destroy the ctx in kinda unexpected places which might be slightly error-prone. However, it being functionally the same, it's fine either way. 
Thanks, Ales > > Thanks, > Han > > > > >> > >> static void init_physical_ctx(struct engine_node *node, > >> struct ed_type_runtime_data *rt_data, > >> struct ed_type_non_vif_data > *non_vif_data, > >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node > *node, > >> engine_get_input_data("ct_zones", node); > >> struct simap *ct_zones = &ct_zones_data->current; > >> > >> + parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips); > >> p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; > >> p_ctx->sbrec_port_binding_by_datapath = > sbrec_port_binding_by_datapath; > >> p_ctx->port_binding_table = port_binding_table; > >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct > engine_node *node, > >> p_ctx->patch_ofports = &non_vif_data->patch_ofports; > >> p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; > >> > >> + > >> + > >> > > > > nit: Unrelated change. > > > Ack > > > > > >> > >> struct controller_engine_ctx *ctrl_ctx = > engine_get_context()->client_ctx; > >> p_ctx->if_mgr = ctrl_ctx->if_mgr; > >> > >> pflow_output_get_debug(node, &p_ctx->debug); > >> } > >> > >> +static void > >> +destroy_physical_ctx(struct physical_ctx *p_ctx) > >> +{ > >> + for (size_t i = 0; i < p_ctx->n_encap_ips; i++) { > >> + free((char *)(p_ctx->encap_ips[i])); > >> + } > >> + free(p_ctx->encap_ips); > >> +} > >> + > >> static void * > >> en_pflow_output_init(struct engine_node *node OVS_UNUSED, > >> struct engine_arg *arg OVS_UNUSED) > >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, > void *data) > >> struct physical_ctx p_ctx; > >> init_physical_ctx(node, rt_data, non_vif_data, &p_ctx); > >> physical_run(&p_ctx, pflow_table); > >> + destroy_physical_ctx(&p_ctx); > >> > >> engine_set_node_state(node, EN_UPDATED); > >> } > >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct > engine_node *node, > >> bool removed = sbrec_port_binding_is_deleted(binding); > >> if 
(!physical_handle_flows_for_lport(binding, removed, > &p_ctx, > >> > &pfo->flow_table)) { > >> + destroy_physical_ctx(&p_ctx); > >> return false; > >> } > >> } > >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct > engine_node *node, > >> bool removed = sbrec_port_binding_is_deleted(pb); > >> if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, > >> &pfo->flow_table)) { > >> + destroy_physical_ctx(&p_ctx); > >> return false; > >> } > >> } > >> engine_set_node_state(node, EN_UPDATED); > >> } > >> + destroy_physical_ctx(&p_ctx); > >> return true; > >> } > >> > >> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct > engine_node *node, > >> bool removed = sbrec_port_binding_is_deleted(pb); > >> if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, > >> &pfo->flow_table)) { > >> + destroy_physical_ctx(&p_ctx); > >> return false; > >> } > >> } > >> > >> engine_set_node_state(node, EN_UPDATED); > >> + destroy_physical_ctx(&p_ctx); > >> return true; > >> } > >> > >> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct > engine_node *node, void *data) > >> physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table); > >> > >> engine_set_node_state(node, EN_UPDATED); > >> + destroy_physical_ctx(&p_ctx); > >> return true; > >> } > >> > >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct > engine_node *node, void *data) > >> if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) { > >> /* Fall back to full recompute when a local datapath > >> * is added or deleted. */ > >> + destroy_physical_ctx(&p_ctx); > >> return false; > >> } > >> > >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct > engine_node *node, void *data) > >> lport->tracked_type == TRACKED_RESOURCE_REMOVED ? 
> true: false; > >> if (!physical_handle_flows_for_lport(lport->pb, removed, > &p_ctx, > >> &pfo->flow_table)) { > >> + destroy_physical_ctx(&p_ctx); > >> return false; > >> } > >> } > >> } > >> > >> engine_set_node_state(node, EN_UPDATED); > >> + destroy_physical_ctx(&p_ctx); > >> return true; > >> } > >> > >> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct > engine_node *node, void *data) > >> if (pb) { > >> if (!physical_handle_flows_for_lport(pb, false, &p_ctx, > >> &pfo->flow_table)) { > >> + destroy_physical_ctx(&p_ctx); > >> return false; > >> } > >> tag_port_as_activated_in_engine(pp); > >> } > >> } > >> engine_set_node_state(node, EN_UPDATED); > >> + destroy_physical_ctx(&p_ctx); > >> return true; > >> } > >> > >> diff --git a/controller/physical.c b/controller/physical.c > >> index e93bfbd2cffb..c192aed751d5 100644 > >> --- a/controller/physical.c > >> +++ b/controller/physical.c > >> @@ -71,6 +71,8 @@ struct tunnel { > >> static void > >> load_logical_ingress_metadata(const struct sbrec_port_binding *binding, > >> const struct zone_ids *zone_ids, > >> + size_t n_encap_ips, > >> + const char **encap_ips, > >> struct ofpbuf *ofpacts_p); > >> static int64_t get_vxlan_port_key(int64_t port_key); > >> > >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf > *ofpacts) > >> */ > >> static struct chassis_tunnel * > >> get_port_binding_tun(const struct sbrec_encap *remote_encap, > >> - const struct sbrec_encap *local_encap, > >> const struct sbrec_chassis *chassis, > >> - const struct hmap *chassis_tunnels) > >> + const struct hmap *chassis_tunnels, > >> + const char *local_encap_ip) > >> { > >> struct chassis_tunnel *tun = NULL; > >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, > >> remote_encap ? remote_encap->ip : NULL, > >> - local_encap ? 
local_encap->ip : NULL); > >> + local_encap_ip); > >> > >> if (!tun) { > >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, > NULL, NULL); > >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct > sbrec_port_binding *pb, > >> static struct ovs_list * > >> get_remote_tunnels(const struct sbrec_port_binding *binding, > >> const struct sbrec_chassis *chassis, > >> - const struct hmap *chassis_tunnels) > >> + const struct hmap *chassis_tunnels, > >> + const char *local_encap_ip) > >> { > >> const struct chassis_tunnel *tun; > >> > >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding > *binding, > >> ovs_list_init(tunnels); > >> > >> if (binding->chassis && binding->chassis != chassis) { > >> - tun = get_port_binding_tun(binding->encap, NULL, > binding->chassis, > >> - chassis_tunnels); > >> + tun = get_port_binding_tun(binding->encap, binding->chassis, > >> + chassis_tunnels, local_encap_ip); > >> if (!tun) { > >> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, > 1); > >> VLOG_WARN_RL( > >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding > *binding, > >> } > >> const struct sbrec_encap *additional_encap; > >> additional_encap = find_additional_encap_for_chassis(binding, > chassis); > >> - tun = get_port_binding_tun(additional_encap, NULL, > >> + tun = get_port_binding_tun(additional_encap, > >> binding->additional_chassis[i], > >> - chassis_tunnels); > >> + chassis_tunnels, local_encap_ip); > >> if (!tun) { > >> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, > 1); > >> VLOG_WARN_RL( > >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct > sbrec_port_binding *binding, > >> struct ofpbuf *ofpacts_p, > >> const struct sbrec_chassis *chassis, > >> const struct hmap *chassis_tunnels, > >> + size_t n_encap_ips, > >> + const char **encap_ips, > >> struct ovn_desired_flow_table > *flow_table) > >> { > >> /* Setup encapsulation */ > >> - struct ovs_list *tuns = 
get_remote_tunnels(binding, chassis, > >> - chassis_tunnels); > >> - if (!ovs_list_is_empty(tuns)) { > >> - bool is_vtep_port = !strcmp(binding->type, "vtep"); > >> - /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for > ARP/ND > >> - * responder in L3 networks. */ > >> - if (is_vtep_port) { > >> - put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, > ofpacts_p); > >> - } > >> + for (size_t i = 0; i < n_encap_ips; i++) { > >> + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); > >> > >> + > >> + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << > 16, > >> + (uint32_t) 0xFFFF << 16); > >> + struct ovs_list *tuns = get_remote_tunnels(binding, chassis, > >> + chassis_tunnels, > >> + encap_ips[i]); > >> + if (!ovs_list_is_empty(tuns)) { > >> + bool is_vtep_port = !strcmp(binding->type, "vtep"); > >> + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check > for ARP/ND > >> + * responder in L3 networks. */ > >> + if (is_vtep_port) { > >> + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, > >> + ofpacts_clone); > >> + } > >> > >> - struct tunnel *tun; > >> - LIST_FOR_EACH (tun, list_node, tuns) { > >> - put_encapsulation(mff_ovn_geneve, tun->tun, > >> - binding->datapath, port_key, > is_vtep_port, > >> - ofpacts_p); > >> - ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport; > >> + struct tunnel *tun; > >> + LIST_FOR_EACH (tun, list_node, tuns) { > >> + put_encapsulation(mff_ovn_geneve, tun->tun, > >> + binding->datapath, port_key, > is_vtep_port, > >> + ofpacts_clone); > >> + ofpact_put_OUTPUT(ofpacts_clone)->port = > tun->tun->ofport; > >> + } > >> + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); > >> + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, > >> + binding->header_.uuid.parts[0], match, > >> + ofpacts_clone, &binding->header_.uuid); > >> } > >> - put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p); > >> - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, > >> - binding->header_.uuid.parts[0], match, > ofpacts_p, > 
>> - &binding->header_.uuid); > >> - } > >> - struct tunnel *tun_elem; > >> - LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { > >> - free(tun_elem); > >> + struct tunnel *tun_elem; > >> + LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { > >> + free(tun_elem); > >> + } > >> + free(tuns); > >> + > >> + ofpbuf_delete(ofpacts_clone); > >> } > >> - free(tuns); > >> } > >> > >> static void > >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap > *ct_zones, > >> if (tag) { > >> ofpact_put_STRIP_VLAN(ofpacts_p); > >> } > >> - load_logical_ingress_metadata(localnet_port, &zone_ids, > ofpacts_p); > >> + load_logical_ingress_metadata(localnet_port, &zone_ids, 0, > NULL, > >> + ofpacts_p); > >> replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p); > >> replace_mac->mac = router_port_mac; > >> > >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids, > struct ofpbuf *ofpacts_p) > >> { > >> if (zone_ids) { > >> if (zone_ids->ct) { > >> - put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); > >> + put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); > >> } > >> if (zone_ids->dnat) { > >> put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, > ofpacts_p); > >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key, > >> } > >> } > >> > >> +static size_t > >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips, > >> + const char *ip) > >> +{ > >> + for (size_t i = 0; i < n_encap_ips; i++) { > >> + if (!strcmp(ip, encap_ips[i])) { > >> + return i; > >> + } > >> + } > >> + return 0; > >> +} > >> + > >> static void > >> load_logical_ingress_metadata(const struct sbrec_port_binding *binding, > >> const struct zone_ids *zone_ids, > >> + size_t n_encap_ips, > >> + const char **encap_ips, > >> struct ofpbuf *ofpacts_p) > >> { > >> put_zones_ofpacts(zone_ids, ofpacts_p); > >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct > sbrec_port_binding *binding, > >> uint32_t port_key = binding->tunnel_key; > >> 
put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p); > >> put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p); > >> + > >> + /* Default encap_id 0. */ > >> + size_t encap_id = 0; > >> + if (encap_ips && binding->encap) { > >> + encap_id = encap_ip_to_id(n_encap_ips, encap_ips, > >> + binding->encap->ip); > >> + } > >> + put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p); > >> } > >> > >> static const struct sbrec_port_binding * > >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct > sbrec_port_binding *binding, > >> match_set_dl_type(&match, htons(ETH_TYPE_RARP)); > >> match_set_in_port(&match, ofport); > >> > >> - load_logical_ingress_metadata(binding, zone_ids, &ofpacts); > >> + load_logical_ingress_metadata(binding, zone_ids, 0, NULL, > &ofpacts); > >> > >> encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP, > >> NX_CTLR_NO_METER, &ofpacts); > >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports( > >> } > >> > >> struct ovs_list *tuns = get_remote_tunnels(binding, chassis, > >> - chassis_tunnels); > >> + chassis_tunnels, NULL); > >> if (ovs_list_is_empty(tuns)) { > >> free(tuns); > >> return; > >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> const struct sbrec_chassis *chassis, > >> const struct physical_debug *debug, > >> const struct if_status_mgr *if_mgr, > >> + size_t n_encap_ips, > >> + const char **encap_ips, > >> struct ovn_desired_flow_table *flow_table, > >> struct ofpbuf *ofpacts_p) > >> { > >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> ofpact_put_CT_CLEAR(ofpacts_p); > >> put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); > >> put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p); > >> - put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); > >> + put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); > >> struct zone_ids peer_zones = get_zone_ids(peer, ct_zones); > >> - load_logical_ingress_metadata(peer, 
&peer_zones, ofpacts_p); > >> + load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips, > >> + encap_ips, ofpacts_p); > >> put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p); > >> put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p); > >> for (int i = 0; i < MFF_N_LOG_REGS; i++) { > >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> * as we're going to remove this with ofpbuf_pull() later. */ > >> uint32_t ofpacts_orig_size = ofpacts_p->size; > >> > >> - load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p); > >> + load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips, > >> + encap_ips, ofpacts_p); > >> > >> if (!strcmp(binding->type, "localport")) { > >> /* mark the packet as incoming from a localport */ > >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> } else { > >> put_remote_port_redirect_overlay( > >> binding, mff_ovn_geneve, port_key, &match, ofpacts_p, > >> - chassis, chassis_tunnels, flow_table); > >> + chassis, chassis_tunnels, n_encap_ips, encap_ips, > flow_table); > >> } > >> out: > >> if (ha_ch_ordered) { > >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> > >> int zone_id = simap_get(ct_zones, port->logical_port); > >> if (zone_id) { > >> - put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts); > >> + put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts); > >> } > >> > >> const char *lport_name = (port->parent_port && > *port->parent_port) ? 
> >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx > *p_ctx, > >> p_ctx->patch_ofports, > >> p_ctx->chassis_tunnels, > >> pb, p_ctx->chassis, &p_ctx->debug, > >> - p_ctx->if_mgr, flow_table, &ofpacts); > >> + p_ctx->if_mgr, > >> + p_ctx->n_encap_ips, > >> + p_ctx->encap_ips, > >> + flow_table, &ofpacts); > >> ofpbuf_uninit(&ofpacts); > >> } > >> > >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx, > >> p_ctx->patch_ofports, > >> p_ctx->chassis_tunnels, binding, > >> p_ctx->chassis, &p_ctx->debug, > >> - p_ctx->if_mgr, flow_table, &ofpacts); > >> + p_ctx->if_mgr, > >> + p_ctx->n_encap_ips, > >> + p_ctx->encap_ips, > >> + flow_table, &ofpacts); > >> } > >> > >> /* Handle output to multicast groups, in tables 40 and 41. */ > >> diff --git a/controller/physical.h b/controller/physical.h > >> index 1f1ed55efa16..7fe8ee3c18ed 100644 > >> --- a/controller/physical.h > >> +++ b/controller/physical.h > >> @@ -66,6 +66,8 @@ struct physical_ctx { > >> struct shash *local_bindings; > >> struct simap *patch_ofports; > >> struct hmap *chassis_tunnels; > >> + size_t n_encap_ips; > >> + const char **encap_ips; > >> struct physical_debug debug; > >> }; > >> > >> diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h > >> index 272277ec4fa0..d3455a462a0d 100644 > >> --- a/include/ovn/logical-fields.h > >> +++ b/include/ovn/logical-fields.h > >> @@ -37,7 +37,9 @@ enum ovn_controller_event { > >> #define MFF_LOG_SNAT_ZONE MFF_REG12 /* conntrack snat zone for > gateway router > >> * (32 bits). */ > >> #define MFF_LOG_CT_ZONE MFF_REG13 /* Logical conntrack zone for > lports > >> - * (32 bits). */ > >> + * (0..15 of the 32 bits). */ > >> +#define MFF_LOG_ENCAP_ID MFF_REG13 /* Encap ID for lports > >> + * (16..31 of the 32 bits). */ > >> #define MFF_LOG_INPORT MFF_REG14 /* Logical input port (32 bits). > */ > >> #define MFF_LOG_OUTPORT MFF_REG15 /* Logical output port (32 > bits). 
*/ > >> > >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml > >> index 96294fe2b795..bfd8680cedfc 100644 > >> --- a/ovn-architecture.7.xml > >> +++ b/ovn-architecture.7.xml > >> @@ -1247,8 +1247,8 @@ > >> chassis. This is initialized to 0 at the beginning of the > logical > >> <!-- Keep the following in sync with MFF_LOG_CT_ZONE in > >> ovn/lib/logical-fields.h. --> > >> - ingress pipeline. OVN stores this in Open vSwitch extension > register > >> - number 13. > >> + ingress pipeline. OVN stores this in the lower 16 bits of the > Open > >> + vSwitch extension register number 13. > >> </dd> > >> > >> <dt>conntrack zone fields for routers</dt> > >> @@ -1263,6 +1263,20 @@ > >> traffic (for SNATing) in Open vSwitch extension register number > 12. > >> </dd> > >> > >> + <dt>Encap ID for logical ports</dt> > >> + <dd> > >> + A field that records an ID that indicates which encapsulation IP > should > >> + be used when sending packets to a remote chassis, according to > the > >> + original input logical port. This is useful when there are > multiple IPs > >> + available for encapsulation. The value only has local > significance and is > >> + not meaningful between chassis. This is initialized to 0 at the > beginning > >> + of the logical > >> + <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in > >> + ovn/lib/logical-fields.h. --> > >> + ingress pipeline. OVN stores this in the higher 16 bits of the > Open > >> + vSwitch extension register number 13. 
> >> + </dd> > >> + > >> <dt>logical flow flags</dt> > >> <dd> > >> The logical flags are intended to handle keeping context between > >> diff --git a/tests/ovn.at b/tests/ovn.at > >> index 243fe0b8246c..e7f0c9681f60 100644 > >> --- a/tests/ovn.at > >> +++ b/tests/ovn.at > >> @@ -30335,19 +30335,33 @@ AT_CLEANUP > >> > >> > >> OVN_FOR_EACH_NORTHD([ > >> -AT_SETUP([multiple encap ips tunnel creation]) > >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip]) > >> +AT_SKIP_IF([test $HAVE_SCAPY = no]) > >> ovn_start > >> net_add n1 > >> > >> +ovn-nbctl ls-add ls1 > >> + > >> # 2 HVs, each with 2 encap-ips. > >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY. > >> for i in 1 2; do > >> sim_add hv$i > >> as hv$i > >> ovs-vsctl add-br br-phys-$j > >> ovn_attach n1 br-phys-$j 192.168.0.${i}1 > >> ovs-vsctl set open . > external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 > >> + > >> + for j in 1 2; do > >> + ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ > >> + external_ids:iface-id=lsp$i$j \ > >> + external_ids:encap-ip=192.168.0.$i$j \ > >> + options:tx_pcap=hv$i/vif$i$j-tx.pcap > options:rxq_pcap=hv$i/vif$i$j-rx.pcap > >> + ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j > "f0:00:00:00:00:$i$j 10.0.0.$i$j" > >> + > >> + done > >> done > >> > >> +wait_for_ports_up > >> check ovn-nbctl --wait=hv sync > >> > >> check_tunnel_port() { > >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12 > %192.168.0.21 > >> check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22 > >> check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22 > >> > >> +vif_to_hv() { > >> + case $1 in dnl ( > >> + vif[[1]]?) echo hv1 ;; dnl ( > >> + vif[[2]]?) echo hv2 ;; dnl ( > >> + *) AT_FAIL_IF([:]) ;; > >> + esac > >> +} > >> + > >> +# check_packet_tunnel SRC_VIF DST_VIF > >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the > corresponding > >> +# tunnel that matches the VIFs' encap_ip configurations. 
> >> +check_packet_tunnel() { > >> + local src=$1 dst=$2 > >> + local src_mac=f0:00:00:00:00:$src > >> + local dst_mac=f0:00:00:00:00:$dst > >> + local src_ip=10.0.0.$src > >> + local dst_ip=10.0.0.$dst > >> + local local_encap_ip=192.168.0.$src > >> + local remote_encap_ip=192.168.0.$dst > >> + local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ > \ > >> + IP(dst='${dst_ip}', src='${src_ip}')/ \ > >> + ICMP(type=8)") > >> + hv=`vif_to_hv vif$src` > >> + as $hv > >> + echo "vif$src -> vif$dst should go through tunnel $local_encap_ip > -> $remote_encap_ip" > >> + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > >> + AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src > $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) > >> +} > >> + > >> +for i in 1 2; do > >> + for j in 1 2; do > >> + check_packet_tunnel 1$i 2$j > >> + done > >> +done > >> + > >> OVN_CLEANUP([hv1],[hv2]) > >> AT_CLEANUP > >> ]) > >> -- > >> 2.38.1 > >> > >> _______________________________________________ > >> dev mailing list > >> dev@openvswitch.org > >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev > >> > > > > Thanks, > > Ales > > > > -- > > > > Ales Musil > > > > Senior Software Engineer - OVN Core > > > > Red Hat EMEA > > > > amusil@redhat.com >
On Thu, Jan 25, 2024 at 10:54 PM Ales Musil <amusil@redhat.com> wrote: > > > > On Fri, Jan 26, 2024 at 4:07 AM Han Zhou <hzhou@ovn.org> wrote: >> >> >> >> On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote: >> > >> > >> > >> > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote: >> >> >> >> Commit dd527a283cd8 partially supported multiple encap IPs. It supported >> >> remote encap IP selection based on the destination VIF's encap_ip >> >> configuration. This patch adds the support for selecting local encap IP >> >> based on the source VIF's encap_ip configuration. >> >> >> >> Co-authored-by: Lei Huang <leih@nvidia.com> >> >> Signed-off-by: Lei Huang <leih@nvidia.com> >> >> Signed-off-by: Han Zhou <hzhou@ovn.org> >> >> --- >> > >> > >> > Hi Han and Lei, >> > >> > thank you for the patch, I have a couple of comments/questions down below. >> >> >> Thanks Ales. >> >> > >> > >> >> NEWS | 3 + >> >> controller/chassis.c | 2 +- >> >> controller/local_data.c | 2 +- >> >> controller/local_data.h | 2 +- >> >> controller/ovn-controller.8.xml | 30 ++++++- >> >> controller/ovn-controller.c | 49 ++++++++++++ >> >> controller/physical.c | 134 ++++++++++++++++++++++---------- >> >> controller/physical.h | 2 + >> >> include/ovn/logical-fields.h | 4 +- >> >> ovn-architecture.7.xml | 18 ++++- >> >> tests/ovn.at | 51 +++++++++++- >> >> 11 files changed, 243 insertions(+), 54 deletions(-) >> >> >> >> diff --git a/NEWS b/NEWS >> >> index 5f267b4c64cc..5a3eed608617 100644 >> >> --- a/NEWS >> >> +++ b/NEWS >> >> @@ -14,6 +14,9 @@ Post v23.09.0 >> >> - ovn-northd-ddlog has been removed. >> >> - A new LSP option "enable_router_port_acl" has been added to enable >> >> conntrack for the router port whose peer is l3dgw_port if set it true. >> >> + - Support selecting encapsulation IP based on the source/destination VIF's >> >> + settting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for more >> >> + details. 
>> >> >> >> OVN v23.09.0 - 15 Sep 2023 >> >> -------------------------- >> >> diff --git a/controller/chassis.c b/controller/chassis.c >> >> index a6f13ccc42d5..55f2beb37674 100644 >> >> --- a/controller/chassis.c >> >> +++ b/controller/chassis.c >> >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg { >> >> >> >> /* Set of encap types parsed from the 'ovn-encap-type' external-id. */ >> >> struct sset encap_type_set; >> >> - /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. */ >> >> + /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */ >> >> struct sset encap_ip_set; >> >> /* Interface type list formatted in the OVN-SB Chassis required format. */ >> >> struct ds iface_types; >> >> diff --git a/controller/local_data.c b/controller/local_data.c >> >> index a9092783958f..8606414f8728 100644 >> >> --- a/controller/local_data.c >> >> +++ b/controller/local_data.c >> >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap *chassis_tunnels) >> >> */ >> >> struct chassis_tunnel * >> >> chassis_tunnel_find(const struct hmap *chassis_tunnels, const char *chassis_id, >> >> - char *remote_encap_ip, char *local_encap_ip) >> >> + char *remote_encap_ip, const char *local_encap_ip) >> > >> > >> > nit: Unrelated change. >> >> >> Ack Hi Ales, sorry that I just realized this change is related, which is because of the const char * array introduced in this patch that stores the parsed encap_ips, and it makes sense to use const char * since we should never expect this string to be modified in the function. 
>> >> > >> > >> >> { >> >> /* >> >> * If the specific encap_ip is given, look for the chassisid_ip entry, >> >> diff --git a/controller/local_data.h b/controller/local_data.h >> >> index bab95bcc3824..ca3905bd20e6 100644 >> >> --- a/controller/local_data.h >> >> +++ b/controller/local_data.h >> >> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes( >> >> struct chassis_tunnel *chassis_tunnel_find(const struct hmap *chassis_tunnels, >> >> const char *chassis_id, >> >> char *remote_encap_ip, >> >> - char *local_encap_ip); >> >> + const char *local_encap_ip); >> > >> > >> > Same as above. >> >> >> Ack >> >> > >> > >> >> >> >> bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels, >> >> const char *chassis_name, >> >> diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml >> >> index efa65e3fd927..5ebef048d721 100644 >> >> --- a/controller/ovn-controller.8.xml >> >> +++ b/controller/ovn-controller.8.xml >> >> @@ -176,10 +176,32 @@ >> >> >> >> <dt><code>external_ids:ovn-encap-ip</code></dt> >> >> <dd> >> >> - The IP address that a chassis should use to connect to this node >> >> - using encapsulation types specified by >> >> - <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs >> >> - may be specified with a comma-separated list. >> >> + <p> >> >> + The IP address that a chassis should use to connect to this node >> >> + using encapsulation types specified by >> >> + <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs >> >> + may be specified with a comma-separated list. >> >> + </p> >> >> + <p> >> >> + In scenarios where multiple encapsulation IPs are present, distinct >> >> + tunnels are established for each remote chassis. These tunnels are >> >> + differentiated by setting unique <code>options:local_ip</code> and >> >> + <code>options:remote_ip</code> values in the tunnel interface. 
When >> >> + transmitting a packet to a remote chassis, the selection of local_ip >> >> + is guided by the <code>Interface:external_ids:encap-ip</code> from >> >> + the local OVSDB, corresponding to the VIF originating the packet, if >> >> + specified. The <code>Interface:external_ids:encap-ip</code> setting >> >> + of the VIF is also populated to the <code>Port_Binding</code> >> >> + table in the OVN SB database via the <code>encap</code> column. >> >> + Consequently, when a remote chassis needs to send a packet to a >> >> + port-binding associated with this VIF, it utilizes the tunnel with >> >> + the appropriate <code>options:remote_ip</code> that matches the >> >> + <code>ip</code> in <code>Port_Binding:encap</code>. This mechanism >> >> + is particularly beneficial for chassis with multiple physical >> >> + interfaces designated for tunneling, where each interface is >> >> + optimized for handling specific traffic associated with particular >> >> + VIFs. >> >> + </p> >> >> </dd> >> >> >> >> <dt><code>external_ids:ovn-encap-df_default</code></dt> >> >> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c >> >> index 856e5e270822..4ab57fe970fe 100644 >> >> --- a/controller/ovn-controller.c >> >> +++ b/controller/ovn-controller.c >> >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output { >> >> struct physical_debug debug; >> >> }; >> >> >> >> +static void >> >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, >> >> + size_t *n_encap_ips, const char * **encap_ips) >> >> +{ >> >> + const struct ovsrec_open_vswitch *cfg = >> >> + ovsrec_open_vswitch_table_first(ovs_table); >> >> + const char *chassis_id = get_ovs_chassis_id(ovs_table); >> >> + const char *encap_ips_str = >> >> + get_chassis_external_id_value(&cfg->external_ids, chassis_id, >> >> + "ovn-encap-ip", NULL); >> >> + struct sset encap_ip_set; >> >> + sset_init(&encap_ip_set); >> >> + sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); >> >> + >> >> + /* Sort 
the ips so that their index is deterministic. */ >> >> + *encap_ips = sset_sort(&encap_ip_set); >> >> + >> >> + /* Copy the strings so that we can destroy the sset. */ >> >> + for (size_t i = 0; (*encap_ips)[i]; i++) { >> >> + (*encap_ips)[i] = xstrdup((*encap_ips)[i]); >> >> + } >> >> + *n_encap_ips = sset_count(&encap_ip_set); >> >> + sset_destroy(&encap_ip_set); >> >> +} >> >> + >> > >> > >> > I wonder, could we store this array or maybe preferably svec in "en_non_vif_data" I-P node? This way it would be handled in the node and we could avoid the destroy calls after any pflow processing WDYT? >> >> >> Yes we could do the same in en_non_vif_data, but I think it is not really necessary and it seems more straightforward just parsing it here, because: >> 1. We don't need I-P for this ovn-encap-ip configuration change, so we don't have to persist this data. >> 2. It should be very rare to have more than 5 (or even 3) encap IPs per node, so the list should be very small and the cost of this parsing (and sset construct/destroy) is negligible. >> Using svec is also a valid option, but the sset (and array) is used here just to reuse the sset_from_delimited_string and sset_sort for convenience. I noticed that the sset_init() call can in fact be removed because sset_from_delimited_string already does that. I can remove it. >> Does this sound reasonable to you? > > > It makes sense, the main thing it would help with is the need to destroy the ctx in kinda unexpected places which might be slightly error prone. However, it being functionally the same it's fine either way. > Thanks Ales. So I will add the below small change. Would you give your Ack if it looks good to you? 
----------------------- diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index 4ab57fe970fe..6873c02288c6 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -4532,7 +4532,6 @@ parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, get_chassis_external_id_value(&cfg->external_ids, chassis_id, "ovn-encap-ip", NULL); struct sset encap_ip_set; - sset_init(&encap_ip_set); sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); /* Sort the ips so that their index is deterministic. */ @@ -4615,8 +4614,6 @@ static void init_physical_ctx(struct engine_node *node, p_ctx->patch_ofports = &non_vif_data->patch_ofports; p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; - - struct controller_engine_ctx *ctrl_ctx = engine_get_context()->client_ctx; p_ctx->if_mgr = ctrl_ctx->if_mgr; ------------------------- Thanks, Han > Thanks, > Ales > >> >> >> Thanks, >> Han >> >> > >> >> >> >> static void init_physical_ctx(struct engine_node *node, >> >> struct ed_type_runtime_data *rt_data, >> >> struct ed_type_non_vif_data *non_vif_data, >> >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node *node, >> >> engine_get_input_data("ct_zones", node); >> >> struct simap *ct_zones = &ct_zones_data->current; >> >> >> >> + parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips); >> >> p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; >> >> p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath; >> >> p_ctx->port_binding_table = port_binding_table; >> >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct engine_node *node, >> >> p_ctx->patch_ofports = &non_vif_data->patch_ofports; >> >> p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; >> >> >> >> + >> >> + >> >> >> > >> > nit: Unrelated change. 
>> >> >> Ack >> > >> > >> >> >> >> struct controller_engine_ctx *ctrl_ctx = engine_get_context()->client_ctx; >> >> p_ctx->if_mgr = ctrl_ctx->if_mgr; >> >> >> >> pflow_output_get_debug(node, &p_ctx->debug); >> >> } >> >> >> >> +static void >> >> +destroy_physical_ctx(struct physical_ctx *p_ctx) >> >> +{ >> >> + for (size_t i = 0; i < p_ctx->n_encap_ips; i++) { >> >> + free((char *)(p_ctx->encap_ips[i])); >> >> + } >> >> + free(p_ctx->encap_ips); >> >> +} >> >> + >> >> static void * >> >> en_pflow_output_init(struct engine_node *node OVS_UNUSED, >> >> struct engine_arg *arg OVS_UNUSED) >> >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, void *data) >> >> struct physical_ctx p_ctx; >> >> init_physical_ctx(node, rt_data, non_vif_data, &p_ctx); >> >> physical_run(&p_ctx, pflow_table); >> >> + destroy_physical_ctx(&p_ctx); >> >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> } >> >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct engine_node *node, >> >> bool removed = sbrec_port_binding_is_deleted(binding); >> >> if (!physical_handle_flows_for_lport(binding, removed, &p_ctx, >> >> &pfo->flow_table)) { >> >> + destroy_physical_ctx(&p_ctx); >> >> return false; >> >> } >> >> } >> >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct engine_node *node, >> >> bool removed = sbrec_port_binding_is_deleted(pb); >> >> if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, >> >> &pfo->flow_table)) { >> >> + destroy_physical_ctx(&p_ctx); >> >> return false; >> >> } >> >> } >> >> engine_set_node_state(node, EN_UPDATED); >> >> } >> >> + destroy_physical_ctx(&p_ctx); >> >> return true; >> >> } >> >> >> >> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct engine_node *node, >> >> bool removed = sbrec_port_binding_is_deleted(pb); >> >> if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, >> >> &pfo->flow_table)) { >> >> + destroy_physical_ctx(&p_ctx); >> >> return false; >> >> } >> >> } >> >> >> >> 
engine_set_node_state(node, EN_UPDATED); >> >> + destroy_physical_ctx(&p_ctx); >> >> return true; >> >> } >> >> >> >> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct engine_node *node, void *data) >> >> physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table); >> >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> + destroy_physical_ctx(&p_ctx); >> >> return true; >> >> } >> >> >> >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct engine_node *node, void *data) >> >> if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) { >> >> /* Fall back to full recompute when a local datapath >> >> * is added or deleted. */ >> >> + destroy_physical_ctx(&p_ctx); >> >> return false; >> >> } >> >> >> >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct engine_node *node, void *data) >> >> lport->tracked_type == TRACKED_RESOURCE_REMOVED ? true: false; >> >> if (!physical_handle_flows_for_lport(lport->pb, removed, &p_ctx, >> >> &pfo->flow_table)) { >> >> + destroy_physical_ctx(&p_ctx); >> >> return false; >> >> } >> >> } >> >> } >> >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> + destroy_physical_ctx(&p_ctx); >> >> return true; >> >> } >> >> >> >> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct engine_node *node, void *data) >> >> if (pb) { >> >> if (!physical_handle_flows_for_lport(pb, false, &p_ctx, >> >> &pfo->flow_table)) { >> >> + destroy_physical_ctx(&p_ctx); >> >> return false; >> >> } >> >> tag_port_as_activated_in_engine(pp); >> >> } >> >> } >> >> engine_set_node_state(node, EN_UPDATED); >> >> + destroy_physical_ctx(&p_ctx); >> >> return true; >> >> } >> >> >> >> diff --git a/controller/physical.c b/controller/physical.c >> >> index e93bfbd2cffb..c192aed751d5 100644 >> >> --- a/controller/physical.c >> >> +++ b/controller/physical.c >> >> @@ -71,6 +71,8 @@ struct tunnel { >> >> static void >> >> load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> >> const struct 
zone_ids *zone_ids, >> >> + size_t n_encap_ips, >> >> + const char **encap_ips, >> >> struct ofpbuf *ofpacts_p); >> >> static int64_t get_vxlan_port_key(int64_t port_key); >> >> >> >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf *ofpacts) >> >> */ >> >> static struct chassis_tunnel * >> >> get_port_binding_tun(const struct sbrec_encap *remote_encap, >> >> - const struct sbrec_encap *local_encap, >> >> const struct sbrec_chassis *chassis, >> >> - const struct hmap *chassis_tunnels) >> >> + const struct hmap *chassis_tunnels, >> >> + const char *local_encap_ip) >> >> { >> >> struct chassis_tunnel *tun = NULL; >> >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, >> >> remote_encap ? remote_encap->ip : NULL, >> >> - local_encap ? local_encap->ip : NULL); >> >> + local_encap_ip); >> >> >> >> if (!tun) { >> >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, NULL, NULL); >> >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct sbrec_port_binding *pb, >> >> static struct ovs_list * >> >> get_remote_tunnels(const struct sbrec_port_binding *binding, >> >> const struct sbrec_chassis *chassis, >> >> - const struct hmap *chassis_tunnels) >> >> + const struct hmap *chassis_tunnels, >> >> + const char *local_encap_ip) >> >> { >> >> const struct chassis_tunnel *tun; >> >> >> >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, >> >> ovs_list_init(tunnels); >> >> >> >> if (binding->chassis && binding->chassis != chassis) { >> >> - tun = get_port_binding_tun(binding->encap, NULL, binding->chassis, >> >> - chassis_tunnels); >> >> + tun = get_port_binding_tun(binding->encap, binding->chassis, >> >> + chassis_tunnels, local_encap_ip); >> >> if (!tun) { >> >> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); >> >> VLOG_WARN_RL( >> >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, >> >> } >> >> const struct sbrec_encap *additional_encap; >> >> 
additional_encap = find_additional_encap_for_chassis(binding, chassis); >> >> - tun = get_port_binding_tun(additional_encap, NULL, >> >> + tun = get_port_binding_tun(additional_encap, >> >> binding->additional_chassis[i], >> >> - chassis_tunnels); >> >> + chassis_tunnels, local_encap_ip); >> >> if (!tun) { >> >> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); >> >> VLOG_WARN_RL( >> >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct sbrec_port_binding *binding, >> >> struct ofpbuf *ofpacts_p, >> >> const struct sbrec_chassis *chassis, >> >> const struct hmap *chassis_tunnels, >> >> + size_t n_encap_ips, >> >> + const char **encap_ips, >> >> struct ovn_desired_flow_table *flow_table) >> >> { >> >> /* Setup encapsulation */ >> >> - struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> >> - chassis_tunnels); >> >> - if (!ovs_list_is_empty(tuns)) { >> >> - bool is_vtep_port = !strcmp(binding->type, "vtep"); >> >> - /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND >> >> - * responder in L3 networks. */ >> >> - if (is_vtep_port) { >> >> - put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_p); >> >> - } >> >> + for (size_t i = 0; i < n_encap_ips; i++) { >> >> + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); >> >> >> >> + >> >> + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16, >> >> + (uint32_t) 0xFFFF << 16); >> >> + struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> >> + chassis_tunnels, >> >> + encap_ips[i]); >> >> + if (!ovs_list_is_empty(tuns)) { >> >> + bool is_vtep_port = !strcmp(binding->type, "vtep"); >> >> + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND >> >> + * responder in L3 networks. 
*/ >> >> + if (is_vtep_port) { >> >> + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, >> >> + ofpacts_clone); >> >> + } >> >> >> >> - struct tunnel *tun; >> >> - LIST_FOR_EACH (tun, list_node, tuns) { >> >> - put_encapsulation(mff_ovn_geneve, tun->tun, >> >> - binding->datapath, port_key, is_vtep_port, >> >> - ofpacts_p); >> >> - ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport; >> >> + struct tunnel *tun; >> >> + LIST_FOR_EACH (tun, list_node, tuns) { >> >> + put_encapsulation(mff_ovn_geneve, tun->tun, >> >> + binding->datapath, port_key, is_vtep_port, >> >> + ofpacts_clone); >> >> + ofpact_put_OUTPUT(ofpacts_clone)->port = tun->tun->ofport; >> >> + } >> >> + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); >> >> + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, >> >> + binding->header_.uuid.parts[0], match, >> >> + ofpacts_clone, &binding->header_.uuid); >> >> } >> >> - put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p); >> >> - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, >> >> - binding->header_.uuid.parts[0], match, ofpacts_p, >> >> - &binding->header_.uuid); >> >> - } >> >> - struct tunnel *tun_elem; >> >> - LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { >> >> - free(tun_elem); >> >> + struct tunnel *tun_elem; >> >> + LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { >> >> + free(tun_elem); >> >> + } >> >> + free(tuns); >> >> + >> >> + ofpbuf_delete(ofpacts_clone); >> >> } >> >> - free(tuns); >> >> } >> >> >> >> static void >> >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap *ct_zones, >> >> if (tag) { >> >> ofpact_put_STRIP_VLAN(ofpacts_p); >> >> } >> >> - load_logical_ingress_metadata(localnet_port, &zone_ids, ofpacts_p); >> >> + load_logical_ingress_metadata(localnet_port, &zone_ids, 0, NULL, >> >> + ofpacts_p); >> >> replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p); >> >> replace_mac->mac = router_port_mac; >> >> >> >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids, struct ofpbuf 
*ofpacts_p) >> >> { >> >> if (zone_ids) { >> >> if (zone_ids->ct) { >> >> - put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); >> >> + put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); >> >> } >> >> if (zone_ids->dnat) { >> >> put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); >> >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key, >> >> } >> >> } >> >> >> >> +static size_t >> >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips, >> >> + const char *ip) >> >> +{ >> >> + for (size_t i = 0; i < n_encap_ips; i++) { >> >> + if (!strcmp(ip, encap_ips[i])) { >> >> + return i; >> >> + } >> >> + } >> >> + return 0; >> >> +} >> >> + >> >> static void >> >> load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> >> const struct zone_ids *zone_ids, >> >> + size_t n_encap_ips, >> >> + const char **encap_ips, >> >> struct ofpbuf *ofpacts_p) >> >> { >> >> put_zones_ofpacts(zone_ids, ofpacts_p); >> >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> >> uint32_t port_key = binding->tunnel_key; >> >> put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p); >> >> put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p); >> >> + >> >> + /* Default encap_id 0. 
*/ >> >> + size_t encap_id = 0; >> >> + if (encap_ips && binding->encap) { >> >> + encap_id = encap_ip_to_id(n_encap_ips, encap_ips, >> >> + binding->encap->ip); >> >> + } >> >> + put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p); >> >> } >> >> >> >> static const struct sbrec_port_binding * >> >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct sbrec_port_binding *binding, >> >> match_set_dl_type(&match, htons(ETH_TYPE_RARP)); >> >> match_set_in_port(&match, ofport); >> >> >> >> - load_logical_ingress_metadata(binding, zone_ids, &ofpacts); >> >> + load_logical_ingress_metadata(binding, zone_ids, 0, NULL, &ofpacts); >> >> >> >> encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP, >> >> NX_CTLR_NO_METER, &ofpacts); >> >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports( >> >> } >> >> >> >> struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> >> - chassis_tunnels); >> >> + chassis_tunnels, NULL); >> >> if (ovs_list_is_empty(tuns)) { >> >> free(tuns); >> >> return; >> >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> const struct sbrec_chassis *chassis, >> >> const struct physical_debug *debug, >> >> const struct if_status_mgr *if_mgr, >> >> + size_t n_encap_ips, >> >> + const char **encap_ips, >> >> struct ovn_desired_flow_table *flow_table, >> >> struct ofpbuf *ofpacts_p) >> >> { >> >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> ofpact_put_CT_CLEAR(ofpacts_p); >> >> put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); >> >> put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p); >> >> - put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); >> >> + put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); >> >> struct zone_ids peer_zones = get_zone_ids(peer, ct_zones); >> >> - load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p); >> >> + load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips, >> >> + encap_ips, 
ofpacts_p); >> >> put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p); >> >> put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p); >> >> for (int i = 0; i < MFF_N_LOG_REGS; i++) { >> >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> * as we're going to remove this with ofpbuf_pull() later. */ >> >> uint32_t ofpacts_orig_size = ofpacts_p->size; >> >> >> >> - load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p); >> >> + load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips, >> >> + encap_ips, ofpacts_p); >> >> >> >> if (!strcmp(binding->type, "localport")) { >> >> /* mark the packet as incoming from a localport */ >> >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> } else { >> >> put_remote_port_redirect_overlay( >> >> binding, mff_ovn_geneve, port_key, &match, ofpacts_p, >> >> - chassis, chassis_tunnels, flow_table); >> >> + chassis, chassis_tunnels, n_encap_ips, encap_ips, flow_table); >> >> } >> >> out: >> >> if (ha_ch_ordered) { >> >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> >> >> int zone_id = simap_get(ct_zones, port->logical_port); >> >> if (zone_id) { >> >> - put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts); >> >> + put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts); >> >> } >> >> >> >> const char *lport_name = (port->parent_port && *port->parent_port) ? 
>> >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx *p_ctx, >> >> p_ctx->patch_ofports, >> >> p_ctx->chassis_tunnels, >> >> pb, p_ctx->chassis, &p_ctx->debug, >> >> - p_ctx->if_mgr, flow_table, &ofpacts); >> >> + p_ctx->if_mgr, >> >> + p_ctx->n_encap_ips, >> >> + p_ctx->encap_ips, >> >> + flow_table, &ofpacts); >> >> ofpbuf_uninit(&ofpacts); >> >> } >> >> >> >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx, >> >> p_ctx->patch_ofports, >> >> p_ctx->chassis_tunnels, binding, >> >> p_ctx->chassis, &p_ctx->debug, >> >> - p_ctx->if_mgr, flow_table, &ofpacts); >> >> + p_ctx->if_mgr, >> >> + p_ctx->n_encap_ips, >> >> + p_ctx->encap_ips, >> >> + flow_table, &ofpacts); >> >> } >> >> >> >> /* Handle output to multicast groups, in tables 40 and 41. */ >> >> diff --git a/controller/physical.h b/controller/physical.h >> >> index 1f1ed55efa16..7fe8ee3c18ed 100644 >> >> --- a/controller/physical.h >> >> +++ b/controller/physical.h >> >> @@ -66,6 +66,8 @@ struct physical_ctx { >> >> struct shash *local_bindings; >> >> struct simap *patch_ofports; >> >> struct hmap *chassis_tunnels; >> >> + size_t n_encap_ips; >> >> + const char **encap_ips; >> >> struct physical_debug debug; >> >> }; >> >> >> >> diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h >> >> index 272277ec4fa0..d3455a462a0d 100644 >> >> --- a/include/ovn/logical-fields.h >> >> +++ b/include/ovn/logical-fields.h >> >> @@ -37,7 +37,9 @@ enum ovn_controller_event { >> >> #define MFF_LOG_SNAT_ZONE MFF_REG12 /* conntrack snat zone for gateway router >> >> * (32 bits). */ >> >> #define MFF_LOG_CT_ZONE MFF_REG13 /* Logical conntrack zone for lports >> >> - * (32 bits). */ >> >> + * (0..15 of the 32 bits). */ >> >> +#define MFF_LOG_ENCAP_ID MFF_REG13 /* Encap ID for lports >> >> + * (16..31 of the 32 bits). */ >> >> #define MFF_LOG_INPORT MFF_REG14 /* Logical input port (32 bits). */ >> >> #define MFF_LOG_OUTPORT MFF_REG15 /* Logical output port (32 bits). 
*/ >> >> >> >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml >> >> index 96294fe2b795..bfd8680cedfc 100644 >> >> --- a/ovn-architecture.7.xml >> >> +++ b/ovn-architecture.7.xml >> >> @@ -1247,8 +1247,8 @@ >> >> chassis. This is initialized to 0 at the beginning of the logical >> >> <!-- Keep the following in sync with MFF_LOG_CT_ZONE in >> >> ovn/lib/logical-fields.h. --> >> >> - ingress pipeline. OVN stores this in Open vSwitch extension register >> >> - number 13. >> >> + ingress pipeline. OVN stores this in the lower 16 bits of the Open >> >> + vSwitch extension register number 13. >> >> </dd> >> >> >> >> <dt>conntrack zone fields for routers</dt> >> >> @@ -1263,6 +1263,20 @@ >> >> traffic (for SNATing) in Open vSwitch extension register number 12. >> >> </dd> >> >> >> >> + <dt>Encap ID for logical ports</dt> >> >> + <dd> >> >> + A field that records an ID that indicates which encapsulation IP should >> >> + be used when sending packets to a remote chassis, according to the >> >> + original input logical port. This is useful when there are multiple IPs >> >> + available for encapsulation. The value only has local significance and is >> >> + not meaningful between chassis. This is initialized to 0 at the beginning >> >> + of the logical >> >> + <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in >> >> + ovn/lib/logical-fields.h. --> >> >> + ingress pipeline. OVN stores this in the higher 16 bits of the Open >> >> + vSwitch extension register number 13. 
>> >> + </dd> >> >> + >> >> <dt>logical flow flags</dt> >> >> <dd> >> >> The logical flags are intended to handle keeping context between >> >> diff --git a/tests/ovn.at b/tests/ovn.at >> >> index 243fe0b8246c..e7f0c9681f60 100644 >> >> --- a/tests/ovn.at >> >> +++ b/tests/ovn.at >> >> @@ -30335,19 +30335,33 @@ AT_CLEANUP >> >> >> >> >> >> OVN_FOR_EACH_NORTHD([ >> >> -AT_SETUP([multiple encap ips tunnel creation]) >> >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip]) >> >> +AT_SKIP_IF([test $HAVE_SCAPY = no]) >> >> ovn_start >> >> net_add n1 >> >> >> >> +ovn-nbctl ls-add ls1 >> >> + >> >> # 2 HVs, each with 2 encap-ips. >> >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY. >> >> for i in 1 2; do >> >> sim_add hv$i >> >> as hv$i >> >> ovs-vsctl add-br br-phys-$j >> >> ovn_attach n1 br-phys-$j 192.168.0.${i}1 >> >> ovs-vsctl set open . external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 >> >> + >> >> + for j in 1 2; do >> >> + ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ >> >> + external_ids:iface-id=lsp$i$j \ >> >> + external_ids:encap-ip=192.168.0.$i$j \ >> >> + options:tx_pcap=hv$i/vif$i$j-tx.pcap options:rxq_pcap=hv$i/vif$i$j-rx.pcap >> >> + ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j "f0:00:00:00:00:$i$j 10.0.0.$i$j" >> >> + >> >> + done >> >> done >> >> >> >> +wait_for_ports_up >> >> check ovn-nbctl --wait=hv sync >> >> >> >> check_tunnel_port() { >> >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12 %192.168.0.21 >> >> check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22 >> >> check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22 >> >> >> >> +vif_to_hv() { >> >> + case $1 in dnl ( >> >> + vif[[1]]?) echo hv1 ;; dnl ( >> >> + vif[[2]]?) 
echo hv2 ;; dnl ( >> >> + *) AT_FAIL_IF([:]) ;; >> >> + esac >> >> +} >> >> + >> >> +# check_packet_tunnel SRC_VIF DST_VIF >> >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the corresponding >> >> +# tunnel that matches the VIFs' encap_ip configurations. >> >> +check_packet_tunnel() { >> >> + local src=$1 dst=$2 >> >> + local src_mac=f0:00:00:00:00:$src >> >> + local dst_mac=f0:00:00:00:00:$dst >> >> + local src_ip=10.0.0.$src >> >> + local dst_ip=10.0.0.$dst >> >> + local local_encap_ip=192.168.0.$src >> >> + local remote_encap_ip=192.168.0.$dst >> >> + local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ >> >> + IP(dst='${dst_ip}', src='${src_ip}')/ \ >> >> + ICMP(type=8)") >> >> + hv=`vif_to_hv vif$src` >> >> + as $hv >> >> + echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> $remote_encap_ip" >> >> + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) >> >> + AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) >> >> +} >> >> + >> >> +for i in 1 2; do >> >> + for j in 1 2; do >> >> + check_packet_tunnel 1$i 2$j >> >> + done >> >> +done >> >> + >> >> OVN_CLEANUP([hv1],[hv2]) >> >> AT_CLEANUP >> >> ]) >> >> -- >> >> 2.38.1 >> >> >> >> _______________________________________________ >> >> dev mailing list >> >> dev@openvswitch.org >> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >> >> >> > >> > Thanks, >> > Ales >> > >> > -- >> > >> > Ales Musil >> > >> > Senior Software Engineer - OVN Core >> > >> > Red Hat EMEA >> > >> > amusil@redhat.com > > > > -- > > Ales Musil > > Senior Software Engineer - OVN Core > > Red Hat EMEA > > amusil@redhat.com
On Fri, Jan 26, 2024 at 8:05 PM Han Zhou <hzhou@ovn.org> wrote: > > > On Thu, Jan 25, 2024 at 10:54 PM Ales Musil <amusil@redhat.com> wrote: > > > > > > > > On Fri, Jan 26, 2024 at 4:07 AM Han Zhou <hzhou@ovn.org> wrote: > >> > >> > >> > >> On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote: > >> > > >> > > >> > > >> > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote: > >> >> > >> >> Commit dd527a283cd8 partially supported multiple encap IPs. It > supported > >> >> remote encap IP selection based on the destination VIF's encap_ip > >> >> configuration. This patch adds the support for selecting local encap > IP > >> >> based on the source VIF's encap_ip configuration. > >> >> > >> >> Co-authored-by: Lei Huang <leih@nvidia.com> > >> >> Signed-off-by: Lei Huang <leih@nvidia.com> > >> >> Signed-off-by: Han Zhou <hzhou@ovn.org> > >> >> --- > >> > > >> > > >> > Hi Han and Lei, > >> > > >> > thank you for the patch, I have a couple of comments/questions down > below. > >> > >> > >> Thanks Ales. > >> > >> > > >> > > >> >> NEWS | 3 + > >> >> controller/chassis.c | 2 +- > >> >> controller/local_data.c | 2 +- > >> >> controller/local_data.h | 2 +- > >> >> controller/ovn-controller.8.xml | 30 ++++++- > >> >> controller/ovn-controller.c | 49 ++++++++++++ > >> >> controller/physical.c | 134 > ++++++++++++++++++++++---------- > >> >> controller/physical.h | 2 + > >> >> include/ovn/logical-fields.h | 4 +- > >> >> ovn-architecture.7.xml | 18 ++++- > >> >> tests/ovn.at | 51 +++++++++++- > >> >> 11 files changed, 243 insertions(+), 54 deletions(-) > >> >> > >> >> diff --git a/NEWS b/NEWS > >> >> index 5f267b4c64cc..5a3eed608617 100644 > >> >> --- a/NEWS > >> >> +++ b/NEWS > >> >> @@ -14,6 +14,9 @@ Post v23.09.0 > >> >> - ovn-northd-ddlog has been removed. > >> >> - A new LSP option "enable_router_port_acl" has been added to > enable > >> >> conntrack for the router port whose peer is l3dgw_port if set > it true. 
> >> >> + - Support selecting encapsulation IP based on the > source/destination VIF's > >> >> + settting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for > more > >> >> + details. > >> >> > >> >> OVN v23.09.0 - 15 Sep 2023 > >> >> -------------------------- > >> >> diff --git a/controller/chassis.c b/controller/chassis.c > >> >> index a6f13ccc42d5..55f2beb37674 100644 > >> >> --- a/controller/chassis.c > >> >> +++ b/controller/chassis.c > >> >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg { > >> >> > >> >> /* Set of encap types parsed from the 'ovn-encap-type' > external-id. */ > >> >> struct sset encap_type_set; > >> >> - /* Set of encap IPs parsed from the 'ovn-encap-type' > external-id. */ > >> >> + /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. > */ > >> >> struct sset encap_ip_set; > >> >> /* Interface type list formatted in the OVN-SB Chassis required > format. */ > >> >> struct ds iface_types; > >> >> diff --git a/controller/local_data.c b/controller/local_data.c > >> >> index a9092783958f..8606414f8728 100644 > >> >> --- a/controller/local_data.c > >> >> +++ b/controller/local_data.c > >> >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap > *chassis_tunnels) > >> >> */ > >> >> struct chassis_tunnel * > >> >> chassis_tunnel_find(const struct hmap *chassis_tunnels, const char > *chassis_id, > >> >> - char *remote_encap_ip, char *local_encap_ip) > >> >> + char *remote_encap_ip, const char > *local_encap_ip) > >> > > >> > > >> > nit: Unrelated change. > >> > >> > >> Ack > > Hi Ales, sorry that I just realized this change is related, which is > because of the const char * array introduced in this patch that stores the > parsed encap_ips, and it makes sense to use const char * since we should > never expect this string to be modified in the function. 
> > >> > >> > > >> > > >> >> { > >> >> /* > >> >> * If the specific encap_ip is given, look for the chassisid_ip > entry, > >> >> diff --git a/controller/local_data.h b/controller/local_data.h > >> >> index bab95bcc3824..ca3905bd20e6 100644 > >> >> --- a/controller/local_data.h > >> >> +++ b/controller/local_data.h > >> >> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes( > >> >> struct chassis_tunnel *chassis_tunnel_find(const struct hmap > *chassis_tunnels, > >> >> const char *chassis_id, > >> >> char *remote_encap_ip, > >> >> - char *local_encap_ip); > >> >> + const char > *local_encap_ip); > >> > > >> > > >> > Same as above. > >> > >> > >> Ack > >> > >> > > >> > > >> >> > >> >> bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels, > >> >> const char *chassis_name, > >> >> diff --git a/controller/ovn-controller.8.xml > b/controller/ovn-controller.8.xml > >> >> index efa65e3fd927..5ebef048d721 100644 > >> >> --- a/controller/ovn-controller.8.xml > >> >> +++ b/controller/ovn-controller.8.xml > >> >> @@ -176,10 +176,32 @@ > >> >> > >> >> <dt><code>external_ids:ovn-encap-ip</code></dt> > >> >> <dd> > >> >> - The IP address that a chassis should use to connect to this > node > >> >> - using encapsulation types specified by > >> >> - <code>external_ids:ovn-encap-type</code>. Multiple > encapsulation IPs > >> >> - may be specified with a comma-separated list. > >> >> + <p> > >> >> + The IP address that a chassis should use to connect to > this node > >> >> + using encapsulation types specified by > >> >> + <code>external_ids:ovn-encap-type</code>. Multiple > encapsulation IPs > >> >> + may be specified with a comma-separated list. > >> >> + </p> > >> >> + <p> > >> >> + In scenarios where multiple encapsulation IPs are > present, distinct > >> >> + tunnels are established for each remote chassis. 
These > tunnels are > >> >> + differentiated by setting unique > <code>options:local_ip</code> and > >> >> + <code>options:remote_ip</code> values in the tunnel > interface. When > >> >> + transmitting a packet to a remote chassis, the selection > of local_ip > >> >> + is guided by the > <code>Interface:external_ids:encap-ip</code> from > >> >> + the local OVSDB, corresponding to the VIF originating the > packet, if > >> >> + specified. The > <code>Interface:external_ids:encap-ip</code> setting > >> >> + of the VIF is also populated to the > <code>Port_Binding</code> > >> >> + table in the OVN SB database via the <code>encap</code> > column. > >> >> + Consequently, when a remote chassis needs to send a > packet to a > >> >> + port-binding associated with this VIF, it utilizes the > tunnel with > >> >> + the appropriate <code>options:remote_ip</code> that > matches the > >> >> + <code>ip</code> in <code>Port_Binding:encap</code>. This > mechanism > >> >> + is particularly beneficial for chassis with multiple > physical > >> >> + interfaces designated for tunneling, where each interface > is > >> >> + optimized for handling specific traffic associated with > particular > >> >> + VIFs. 
> >> >> + </p> > >> >> </dd> > >> >> > >> >> <dt><code>external_ids:ovn-encap-df_default</code></dt> > >> >> diff --git a/controller/ovn-controller.c > b/controller/ovn-controller.c > >> >> index 856e5e270822..4ab57fe970fe 100644 > >> >> --- a/controller/ovn-controller.c > >> >> +++ b/controller/ovn-controller.c > >> >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output { > >> >> struct physical_debug debug; > >> >> }; > >> >> > >> >> +static void > >> >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, > >> >> + size_t *n_encap_ips, const char * **encap_ips) > >> >> +{ > >> >> + const struct ovsrec_open_vswitch *cfg = > >> >> + ovsrec_open_vswitch_table_first(ovs_table); > >> >> + const char *chassis_id = get_ovs_chassis_id(ovs_table); > >> >> + const char *encap_ips_str = > >> >> + get_chassis_external_id_value(&cfg->external_ids, > chassis_id, > >> >> + "ovn-encap-ip", NULL); > >> >> + struct sset encap_ip_set; > >> >> + sset_init(&encap_ip_set); > >> >> + sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); > >> >> + > >> >> + /* Sort the ips so that their index is deterministic. */ > >> >> + *encap_ips = sset_sort(&encap_ip_set); > >> >> + > >> >> + /* Copy the strings so that we can destroy the sset. */ > >> >> + for (size_t i = 0; (*encap_ips)[i]; i++) { > >> >> + (*encap_ips)[i] = xstrdup((*encap_ips)[i]); > >> >> + } > >> >> + *n_encap_ips = sset_count(&encap_ip_set); > >> >> + sset_destroy(&encap_ip_set); > >> >> +} > >> >> + > >> > > >> > > >> > I wonder, could we store this array or maybe preferably svec in > "en_non_vif_data" I-P node? This way it would be handled in the node and we > could avoid the destroy calls after any pflow processing WDYT? > >> > >> > >> Yes we could do the same in en_non_vif_data, but I think it is not > really necessary and it seems more straightforward just parsing it here, > because: > >> 1. We don't need I-P for this ovn-encap-ip configuration change, so we > don't have to persist this data. 
> >> 2. It should be very rare to have more than 5 (or even 3) encap IPs per > node, so the list should be very small and the cost of this parsing (and > sset construct/destroy) is negligible. > >> Using svec is also a valid option, but the sset (and array) is used > here just to reuse the sset_from_delimited_string and sset_sort for > convenience. I noticed that the sset_init() call can in fact be removed > because sset_from_delimited_string already does that. I can remove it. > >> Does this sound reasonable to you? > > > > > > It makes sense, the main thing it would help with is the need to destroy > the ctx in kinda unexpected places which might be slightly error prone. > However, it being functionally the same it's fine either way. > > > > Thanks Ales. So I will add the below small change. Would you give your Ack > if it looks good to you? > ----------------------- > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > index 4ab57fe970fe..6873c02288c6 100644 > --- a/controller/ovn-controller.c > +++ b/controller/ovn-controller.c > @@ -4532,7 +4532,6 @@ parse_encap_ips(const struct > ovsrec_open_vswitch_table *ovs_table, > get_chassis_external_id_value(&cfg->external_ids, chassis_id, > "ovn-encap-ip", NULL); > struct sset encap_ip_set; > - sset_init(&encap_ip_set); > sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); > > /* Sort the ips so that their index is deterministic. */ > @@ -4615,8 +4614,6 @@ static void init_physical_ctx(struct engine_node > *node, > p_ctx->patch_ofports = &non_vif_data->patch_ofports; > p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; > > - > - > struct controller_engine_ctx *ctrl_ctx = > engine_get_context()->client_ctx; > p_ctx->if_mgr = ctrl_ctx->if_mgr; > ------------------------- > > Thanks, > Han > Yeah the diff looks ok. 
Acked-by: Ales Musil <amusil@redhat.com> Thanks, Ales > > > Thanks, > > Ales > > > >> > >> > >> Thanks, > >> Han > >> > >> > > >> >> > >> >> static void init_physical_ctx(struct engine_node *node, > >> >> struct ed_type_runtime_data *rt_data, > >> >> struct ed_type_non_vif_data > *non_vif_data, > >> >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct > engine_node *node, > >> >> engine_get_input_data("ct_zones", node); > >> >> struct simap *ct_zones = &ct_zones_data->current; > >> >> > >> >> + parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, > &p_ctx->encap_ips); > >> >> p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; > >> >> p_ctx->sbrec_port_binding_by_datapath = > sbrec_port_binding_by_datapath; > >> >> p_ctx->port_binding_table = port_binding_table; > >> >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct > engine_node *node, > >> >> p_ctx->patch_ofports = &non_vif_data->patch_ofports; > >> >> p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; > >> >> > >> >> + > >> >> + > >> >> > >> > > >> > nit: Unrelated change. 
> >> > >> > >> Ack > >> > > >> > > >> >> > >> >> struct controller_engine_ctx *ctrl_ctx = > engine_get_context()->client_ctx; > >> >> p_ctx->if_mgr = ctrl_ctx->if_mgr; > >> >> > >> >> pflow_output_get_debug(node, &p_ctx->debug); > >> >> } > >> >> > >> >> +static void > >> >> +destroy_physical_ctx(struct physical_ctx *p_ctx) > >> >> +{ > >> >> + for (size_t i = 0; i < p_ctx->n_encap_ips; i++) { > >> >> + free((char *)(p_ctx->encap_ips[i])); > >> >> + } > >> >> + free(p_ctx->encap_ips); > >> >> +} > >> >> + > >> >> static void * > >> >> en_pflow_output_init(struct engine_node *node OVS_UNUSED, > >> >> struct engine_arg *arg OVS_UNUSED) > >> >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, > void *data) > >> >> struct physical_ctx p_ctx; > >> >> init_physical_ctx(node, rt_data, non_vif_data, &p_ctx); > >> >> physical_run(&p_ctx, pflow_table); > >> >> + destroy_physical_ctx(&p_ctx); > >> >> > >> >> engine_set_node_state(node, EN_UPDATED); > >> >> } > >> >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct > engine_node *node, > >> >> bool removed = > sbrec_port_binding_is_deleted(binding); > >> >> if (!physical_handle_flows_for_lport(binding, > removed, &p_ctx, > >> >> > &pfo->flow_table)) { > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return false; > >> >> } > >> >> } > >> >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct > engine_node *node, > >> >> bool removed = sbrec_port_binding_is_deleted(pb); > >> >> if (!physical_handle_flows_for_lport(pb, removed, > &p_ctx, > >> >> &pfo->flow_table)) > { > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return false; > >> >> } > >> >> } > >> >> engine_set_node_state(node, EN_UPDATED); > >> >> } > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return true; > >> >> } > >> >> > >> >> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct > engine_node *node, > >> >> bool removed = sbrec_port_binding_is_deleted(pb); > >> >> if 
(!physical_handle_flows_for_lport(pb, removed, &p_ctx, > >> >> &pfo->flow_table)) { > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return false; > >> >> } > >> >> } > >> >> > >> >> engine_set_node_state(node, EN_UPDATED); > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return true; > >> >> } > >> >> > >> >> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct > engine_node *node, void *data) > >> >> physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table); > >> >> > >> >> engine_set_node_state(node, EN_UPDATED); > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return true; > >> >> } > >> >> > >> >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct > engine_node *node, void *data) > >> >> if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) { > >> >> /* Fall back to full recompute when a local datapath > >> >> * is added or deleted. */ > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return false; > >> >> } > >> >> > >> >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct > engine_node *node, void *data) > >> >> lport->tracked_type == TRACKED_RESOURCE_REMOVED ? 
> true: false; > >> >> if (!physical_handle_flows_for_lport(lport->pb, > removed, &p_ctx, > >> >> &pfo->flow_table)) > { > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return false; > >> >> } > >> >> } > >> >> } > >> >> > >> >> engine_set_node_state(node, EN_UPDATED); > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return true; > >> >> } > >> >> > >> >> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct > engine_node *node, void *data) > >> >> if (pb) { > >> >> if (!physical_handle_flows_for_lport(pb, false, &p_ctx, > >> >> &pfo->flow_table)) > { > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return false; > >> >> } > >> >> tag_port_as_activated_in_engine(pp); > >> >> } > >> >> } > >> >> engine_set_node_state(node, EN_UPDATED); > >> >> + destroy_physical_ctx(&p_ctx); > >> >> return true; > >> >> } > >> >> > >> >> diff --git a/controller/physical.c b/controller/physical.c > >> >> index e93bfbd2cffb..c192aed751d5 100644 > >> >> --- a/controller/physical.c > >> >> +++ b/controller/physical.c > >> >> @@ -71,6 +71,8 @@ struct tunnel { > >> >> static void > >> >> load_logical_ingress_metadata(const struct sbrec_port_binding > *binding, > >> >> const struct zone_ids *zone_ids, > >> >> + size_t n_encap_ips, > >> >> + const char **encap_ips, > >> >> struct ofpbuf *ofpacts_p); > >> >> static int64_t get_vxlan_port_key(int64_t port_key); > >> >> > >> >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf > *ofpacts) > >> >> */ > >> >> static struct chassis_tunnel * > >> >> get_port_binding_tun(const struct sbrec_encap *remote_encap, > >> >> - const struct sbrec_encap *local_encap, > >> >> const struct sbrec_chassis *chassis, > >> >> - const struct hmap *chassis_tunnels) > >> >> + const struct hmap *chassis_tunnels, > >> >> + const char *local_encap_ip) > >> >> { > >> >> struct chassis_tunnel *tun = NULL; > >> >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, > >> >> remote_encap ? remote_encap->ip : > NULL, > >> >> - local_encap ? 
local_encap->ip : NULL); > >> >> + local_encap_ip); > >> >> > >> >> if (!tun) { > >> >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, > NULL, NULL); > >> >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct > sbrec_port_binding *pb, > >> >> static struct ovs_list * > >> >> get_remote_tunnels(const struct sbrec_port_binding *binding, > >> >> const struct sbrec_chassis *chassis, > >> >> - const struct hmap *chassis_tunnels) > >> >> + const struct hmap *chassis_tunnels, > >> >> + const char *local_encap_ip) > >> >> { > >> >> const struct chassis_tunnel *tun; > >> >> > >> >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct > sbrec_port_binding *binding, > >> >> ovs_list_init(tunnels); > >> >> > >> >> if (binding->chassis && binding->chassis != chassis) { > >> >> - tun = get_port_binding_tun(binding->encap, NULL, > binding->chassis, > >> >> - chassis_tunnels); > >> >> + tun = get_port_binding_tun(binding->encap, binding->chassis, > >> >> + chassis_tunnels, local_encap_ip); > >> >> if (!tun) { > >> >> static struct vlog_rate_limit rl = > VLOG_RATE_LIMIT_INIT(5, 1); > >> >> VLOG_WARN_RL( > >> >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct > sbrec_port_binding *binding, > >> >> } > >> >> const struct sbrec_encap *additional_encap; > >> >> additional_encap = > find_additional_encap_for_chassis(binding, chassis); > >> >> - tun = get_port_binding_tun(additional_encap, NULL, > >> >> + tun = get_port_binding_tun(additional_encap, > >> >> binding->additional_chassis[i], > >> >> - chassis_tunnels); > >> >> + chassis_tunnels, local_encap_ip); > >> >> if (!tun) { > >> >> static struct vlog_rate_limit rl = > VLOG_RATE_LIMIT_INIT(5, 1); > >> >> VLOG_WARN_RL( > >> >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct > sbrec_port_binding *binding, > >> >> struct ofpbuf *ofpacts_p, > >> >> const struct sbrec_chassis > *chassis, > >> >> const struct hmap *chassis_tunnels, > >> >> + size_t n_encap_ips, > >> >> + const char 
**encap_ips, > >> >> struct ovn_desired_flow_table > *flow_table) > >> >> { > >> >> /* Setup encapsulation */ > >> >> - struct ovs_list *tuns = get_remote_tunnels(binding, chassis, > >> >> - chassis_tunnels); > >> >> - if (!ovs_list_is_empty(tuns)) { > >> >> - bool is_vtep_port = !strcmp(binding->type, "vtep"); > >> >> - /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check > for ARP/ND > >> >> - * responder in L3 networks. */ > >> >> - if (is_vtep_port) { > >> >> - put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, > ofpacts_p); > >> >> - } > >> >> + for (size_t i = 0; i < n_encap_ips; i++) { > >> >> + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); > >> >> > >> >> + > >> >> + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i > << 16, > >> >> + (uint32_t) 0xFFFF << 16); > >> >> + struct ovs_list *tuns = get_remote_tunnels(binding, chassis, > >> >> + chassis_tunnels, > >> >> + encap_ips[i]); > >> >> + if (!ovs_list_is_empty(tuns)) { > >> >> + bool is_vtep_port = !strcmp(binding->type, "vtep"); > >> >> + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback > check for ARP/ND > >> >> + * responder in L3 networks. 
*/ > >> >> + if (is_vtep_port) { > >> >> + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, > >> >> + ofpacts_clone); > >> >> + } > >> >> > >> >> - struct tunnel *tun; > >> >> - LIST_FOR_EACH (tun, list_node, tuns) { > >> >> - put_encapsulation(mff_ovn_geneve, tun->tun, > >> >> - binding->datapath, port_key, > is_vtep_port, > >> >> - ofpacts_p); > >> >> - ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport; > >> >> + struct tunnel *tun; > >> >> + LIST_FOR_EACH (tun, list_node, tuns) { > >> >> + put_encapsulation(mff_ovn_geneve, tun->tun, > >> >> + binding->datapath, port_key, > is_vtep_port, > >> >> + ofpacts_clone); > >> >> + ofpact_put_OUTPUT(ofpacts_clone)->port = > tun->tun->ofport; > >> >> + } > >> >> + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); > >> >> + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, > >> >> + binding->header_.uuid.parts[0], match, > >> >> + ofpacts_clone, &binding->header_.uuid); > >> >> } > >> >> - put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p); > >> >> - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, > >> >> - binding->header_.uuid.parts[0], match, > ofpacts_p, > >> >> - &binding->header_.uuid); > >> >> - } > >> >> - struct tunnel *tun_elem; > >> >> - LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { > >> >> - free(tun_elem); > >> >> + struct tunnel *tun_elem; > >> >> + LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { > >> >> + free(tun_elem); > >> >> + } > >> >> + free(tuns); > >> >> + > >> >> + ofpbuf_delete(ofpacts_clone); > >> >> } > >> >> - free(tuns); > >> >> } > >> >> > >> >> static void > >> >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap > *ct_zones, > >> >> if (tag) { > >> >> ofpact_put_STRIP_VLAN(ofpacts_p); > >> >> } > >> >> - load_logical_ingress_metadata(localnet_port, &zone_ids, > ofpacts_p); > >> >> + load_logical_ingress_metadata(localnet_port, &zone_ids, 0, > NULL, > >> >> + ofpacts_p); > >> >> replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p); > >> >> replace_mac->mac = 
router_port_mac; > >> >> > >> >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids > *zone_ids, struct ofpbuf *ofpacts_p) > >> >> { > >> >> if (zone_ids) { > >> >> if (zone_ids->ct) { > >> >> - put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, > ofpacts_p); > >> >> + put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, > ofpacts_p); > >> >> } > >> >> if (zone_ids->dnat) { > >> >> put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, > ofpacts_p); > >> >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key, > >> >> } > >> >> } > >> >> > >> >> +static size_t > >> >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips, > >> >> + const char *ip) > >> >> +{ > >> >> + for (size_t i = 0; i < n_encap_ips; i++) { > >> >> + if (!strcmp(ip, encap_ips[i])) { > >> >> + return i; > >> >> + } > >> >> + } > >> >> + return 0; > >> >> +} > >> >> + > >> >> static void > >> >> load_logical_ingress_metadata(const struct sbrec_port_binding > *binding, > >> >> const struct zone_ids *zone_ids, > >> >> + size_t n_encap_ips, > >> >> + const char **encap_ips, > >> >> struct ofpbuf *ofpacts_p) > >> >> { > >> >> put_zones_ofpacts(zone_ids, ofpacts_p); > >> >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct > sbrec_port_binding *binding, > >> >> uint32_t port_key = binding->tunnel_key; > >> >> put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p); > >> >> put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p); > >> >> + > >> >> + /* Default encap_id 0. 
*/ > >> >> + size_t encap_id = 0; > >> >> + if (encap_ips && binding->encap) { > >> >> + encap_id = encap_ip_to_id(n_encap_ips, encap_ips, > >> >> + binding->encap->ip); > >> >> + } > >> >> + put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p); > >> >> } > >> >> > >> >> static const struct sbrec_port_binding * > >> >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct > sbrec_port_binding *binding, > >> >> match_set_dl_type(&match, htons(ETH_TYPE_RARP)); > >> >> match_set_in_port(&match, ofport); > >> >> > >> >> - load_logical_ingress_metadata(binding, zone_ids, &ofpacts); > >> >> + load_logical_ingress_metadata(binding, zone_ids, 0, NULL, > &ofpacts); > >> >> > >> >> encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP, > >> >> NX_CTLR_NO_METER, &ofpacts); > >> >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports( > >> >> } > >> >> > >> >> struct ovs_list *tuns = get_remote_tunnels(binding, chassis, > >> >> - chassis_tunnels); > >> >> + chassis_tunnels, > NULL); > >> >> if (ovs_list_is_empty(tuns)) { > >> >> free(tuns); > >> >> return; > >> >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> >> const struct sbrec_chassis *chassis, > >> >> const struct physical_debug *debug, > >> >> const struct if_status_mgr *if_mgr, > >> >> + size_t n_encap_ips, > >> >> + const char **encap_ips, > >> >> struct ovn_desired_flow_table *flow_table, > >> >> struct ofpbuf *ofpacts_p) > >> >> { > >> >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> >> ofpact_put_CT_CLEAR(ofpacts_p); > >> >> put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); > >> >> put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p); > >> >> - put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); > >> >> + put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); > >> >> struct zone_ids peer_zones = get_zone_ids(peer, ct_zones); > >> >> - load_logical_ingress_metadata(peer, &peer_zones, 
ofpacts_p); > >> >> + load_logical_ingress_metadata(peer, &peer_zones, > n_encap_ips, > >> >> + encap_ips, ofpacts_p); > >> >> put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p); > >> >> put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p); > >> >> for (int i = 0; i < MFF_N_LOG_REGS; i++) { > >> >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> >> * as we're going to remove this with ofpbuf_pull() later. > */ > >> >> uint32_t ofpacts_orig_size = ofpacts_p->size; > >> >> > >> >> - load_logical_ingress_metadata(binding, &zone_ids, > ofpacts_p); > >> >> + load_logical_ingress_metadata(binding, &zone_ids, > n_encap_ips, > >> >> + encap_ips, ofpacts_p); > >> >> > >> >> if (!strcmp(binding->type, "localport")) { > >> >> /* mark the packet as incoming from a localport */ > >> >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> >> } else { > >> >> put_remote_port_redirect_overlay( > >> >> binding, mff_ovn_geneve, port_key, &match, ofpacts_p, > >> >> - chassis, chassis_tunnels, flow_table); > >> >> + chassis, chassis_tunnels, n_encap_ips, encap_ips, > flow_table); > >> >> } > >> >> out: > >> >> if (ha_ch_ordered) { > >> >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > >> >> > >> >> int zone_id = simap_get(ct_zones, port->logical_port); > >> >> if (zone_id) { > >> >> - put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts); > >> >> + put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts); > >> >> } > >> >> > >> >> const char *lport_name = (port->parent_port && > *port->parent_port) ? 
> >> >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct > physical_ctx *p_ctx, > >> >> p_ctx->patch_ofports, > >> >> p_ctx->chassis_tunnels, > >> >> pb, p_ctx->chassis, &p_ctx->debug, > >> >> - p_ctx->if_mgr, flow_table, &ofpacts); > >> >> + p_ctx->if_mgr, > >> >> + p_ctx->n_encap_ips, > >> >> + p_ctx->encap_ips, > >> >> + flow_table, &ofpacts); > >> >> ofpbuf_uninit(&ofpacts); > >> >> } > >> >> > >> >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx, > >> >> p_ctx->patch_ofports, > >> >> p_ctx->chassis_tunnels, binding, > >> >> p_ctx->chassis, &p_ctx->debug, > >> >> - p_ctx->if_mgr, flow_table, &ofpacts); > >> >> + p_ctx->if_mgr, > >> >> + p_ctx->n_encap_ips, > >> >> + p_ctx->encap_ips, > >> >> + flow_table, &ofpacts); > >> >> } > >> >> > >> >> /* Handle output to multicast groups, in tables 40 and 41. */ > >> >> diff --git a/controller/physical.h b/controller/physical.h > >> >> index 1f1ed55efa16..7fe8ee3c18ed 100644 > >> >> --- a/controller/physical.h > >> >> +++ b/controller/physical.h > >> >> @@ -66,6 +66,8 @@ struct physical_ctx { > >> >> struct shash *local_bindings; > >> >> struct simap *patch_ofports; > >> >> struct hmap *chassis_tunnels; > >> >> + size_t n_encap_ips; > >> >> + const char **encap_ips; > >> >> struct physical_debug debug; > >> >> }; > >> >> > >> >> diff --git a/include/ovn/logical-fields.h > b/include/ovn/logical-fields.h > >> >> index 272277ec4fa0..d3455a462a0d 100644 > >> >> --- a/include/ovn/logical-fields.h > >> >> +++ b/include/ovn/logical-fields.h > >> >> @@ -37,7 +37,9 @@ enum ovn_controller_event { > >> >> #define MFF_LOG_SNAT_ZONE MFF_REG12 /* conntrack snat zone for > gateway router > >> >> * (32 bits). */ > >> >> #define MFF_LOG_CT_ZONE MFF_REG13 /* Logical conntrack zone for > lports > >> >> - * (32 bits). */ > >> >> + * (0..15 of the 32 bits). */ > >> >> +#define MFF_LOG_ENCAP_ID MFF_REG13 /* Encap ID for lports > >> >> + * (16..31 of the 32 bits). 
*/ > >> >> #define MFF_LOG_INPORT MFF_REG14 /* Logical input port (32 > bits). */ > >> >> #define MFF_LOG_OUTPORT MFF_REG15 /* Logical output port (32 > bits). */ > >> >> > >> >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml > >> >> index 96294fe2b795..bfd8680cedfc 100644 > >> >> --- a/ovn-architecture.7.xml > >> >> +++ b/ovn-architecture.7.xml > >> >> @@ -1247,8 +1247,8 @@ > >> >> chassis. This is initialized to 0 at the beginning of the > logical > >> >> <!-- Keep the following in sync with MFF_LOG_CT_ZONE in > >> >> ovn/lib/logical-fields.h. --> > >> >> - ingress pipeline. OVN stores this in Open vSwitch extension > register > >> >> - number 13. > >> >> + ingress pipeline. OVN stores this in the lower 16 bits of > the Open > >> >> + vSwitch extension register number 13. > >> >> </dd> > >> >> > >> >> <dt>conntrack zone fields for routers</dt> > >> >> @@ -1263,6 +1263,20 @@ > >> >> traffic (for SNATing) in Open vSwitch extension register > number 12. > >> >> </dd> > >> >> > >> >> + <dt>Encap ID for logical ports</dt> > >> >> + <dd> > >> >> + A field that records an ID that indicates which encapsulation > IP should > >> >> + be used when sending packets to a remote chassis, according > to the > >> >> + original input logical port. This is useful when there are > multiple IPs > >> >> + available for encapsulation. The value only has local > significance and is > >> >> + not meaningful between chassis. This is initialized to 0 at > the beginning > >> >> + of the logical > >> >> + <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in > >> >> + ovn/lib/logical-fields.h. --> > >> >> + ingress pipeline. OVN stores this in the higher 16 bits of > the Open > >> >> + vSwitch extension register number 13. 
> >> >> + </dd> > >> >> + > >> >> <dt>logical flow flags</dt> > >> >> <dd> > >> >> The logical flags are intended to handle keeping context > between > >> >> diff --git a/tests/ovn.at b/tests/ovn.at > >> >> index 243fe0b8246c..e7f0c9681f60 100644 > >> >> --- a/tests/ovn.at > >> >> +++ b/tests/ovn.at > >> >> @@ -30335,19 +30335,33 @@ AT_CLEANUP > >> >> > >> >> > >> >> OVN_FOR_EACH_NORTHD([ > >> >> -AT_SETUP([multiple encap ips tunnel creation]) > >> >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip]) > >> >> +AT_SKIP_IF([test $HAVE_SCAPY = no]) > >> >> ovn_start > >> >> net_add n1 > >> >> > >> >> +ovn-nbctl ls-add ls1 > >> >> + > >> >> # 2 HVs, each with 2 encap-ips. > >> >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY. > >> >> for i in 1 2; do > >> >> sim_add hv$i > >> >> as hv$i > >> >> ovs-vsctl add-br br-phys-$j > >> >> ovn_attach n1 br-phys-$j 192.168.0.${i}1 > >> >> ovs-vsctl set open . > external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 > >> >> + > >> >> + for j in 1 2; do > >> >> + ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ > >> >> + external_ids:iface-id=lsp$i$j \ > >> >> + external_ids:encap-ip=192.168.0.$i$j \ > >> >> + options:tx_pcap=hv$i/vif$i$j-tx.pcap > options:rxq_pcap=hv$i/vif$i$j-rx.pcap > >> >> + ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j > "f0:00:00:00:00:$i$j 10.0.0.$i$j" > >> >> + > >> >> + done > >> >> done > >> >> > >> >> +wait_for_ports_up > >> >> check ovn-nbctl --wait=hv sync > >> >> > >> >> check_tunnel_port() { > >> >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int > hv1@192.168.0.12%192.168.0.21 > >> >> check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22 > >> >> check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22 > >> >> > >> >> +vif_to_hv() { > >> >> + case $1 in dnl ( > >> >> + vif[[1]]?) echo hv1 ;; dnl ( > >> >> + vif[[2]]?) 
echo hv2 ;; dnl ( > >> >> + *) AT_FAIL_IF([:]) ;; > >> >> + esac > >> >> +} > >> >> + > >> >> +# check_packet_tunnel SRC_VIF DST_VIF > >> >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the > corresponding > >> >> +# tunnel that matches the VIFs' encap_ip configurations. > >> >> +check_packet_tunnel() { > >> >> + local src=$1 dst=$2 > >> >> + local src_mac=f0:00:00:00:00:$src > >> >> + local dst_mac=f0:00:00:00:00:$dst > >> >> + local src_ip=10.0.0.$src > >> >> + local dst_ip=10.0.0.$dst > >> >> + local local_encap_ip=192.168.0.$src > >> >> + local remote_encap_ip=192.168.0.$dst > >> >> + local packet=$(fmt_pkt "Ether(dst='${dst_mac}', > src='${src_mac}')/ \ > >> >> + IP(dst='${dst_ip}', src='${src_ip}')/ \ > >> >> + ICMP(type=8)") > >> >> + hv=`vif_to_hv vif$src` > >> >> + as $hv > >> >> + echo "vif$src -> vif$dst should go through tunnel > $local_encap_ip -> $remote_encap_ip" > >> >> + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > >> >> + AT_CHECK([test $(ovs-appctl ofproto/trace br-int > in_port=vif$src $packet | grep "output\:" | awk -F ':' '{ print $2 }') == > $tunnel_ofport]) > >> >> +} > >> >> + > >> >> +for i in 1 2; do > >> >> + for j in 1 2; do > >> >> + check_packet_tunnel 1$i 2$j > >> >> + done > >> >> +done > >> >> + > >> >> OVN_CLEANUP([hv1],[hv2]) > >> >> AT_CLEANUP > >> >> ]) > >> >> -- > >> >> 2.38.1 > >> >> > >> >> _______________________________________________ > >> >> dev mailing list > >> >> dev@openvswitch.org > >> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev > >> >> > >> > > >> > Thanks, > >> > Ales > >> > > >> > -- > >> > > >> > Ales Musil > >> > > >> > Senior Software Engineer - OVN Core > >> > > >> > Red Hat EMEA > >> > > >> > amusil@redhat.com > > > > > > > > -- > > > > Ales Musil > > > > Senior Software Engineer - OVN Core > > > > Red Hat EMEA > > > > amusil@redhat.com >
On Mon, Jan 29, 2024 at 2:41 AM Ales Musil <amusil@redhat.com> wrote: > > > > On Fri, Jan 26, 2024 at 8:05 PM Han Zhou <hzhou@ovn.org> wrote: >> >> >> >> On Thu, Jan 25, 2024 at 10:54 PM Ales Musil <amusil@redhat.com> wrote: >> > >> > >> > >> > On Fri, Jan 26, 2024 at 4:07 AM Han Zhou <hzhou@ovn.org> wrote: >> >> >> >> >> >> >> >> On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote: >> >> > >> >> > >> >> > >> >> > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote: >> >> >> >> >> >> Commit dd527a283cd8 partially supported multiple encap IPs. It supported >> >> >> remote encap IP selection based on the destination VIF's encap_ip >> >> >> configuration. This patch adds the support for selecting local encap IP >> >> >> based on the source VIF's encap_ip configuration. >> >> >> >> >> >> Co-authored-by: Lei Huang <leih@nvidia.com> >> >> >> Signed-off-by: Lei Huang <leih@nvidia.com> >> >> >> Signed-off-by: Han Zhou <hzhou@ovn.org> >> >> >> --- >> >> > >> >> > >> >> > Hi Han and Lei, >> >> > >> >> > thank you for the patch, I have a couple of comments/questions down below. >> >> >> >> >> >> Thanks Ales. >> >> >> >> > >> >> > >> >> >> NEWS | 3 + >> >> >> controller/chassis.c | 2 +- >> >> >> controller/local_data.c | 2 +- >> >> >> controller/local_data.h | 2 +- >> >> >> controller/ovn-controller.8.xml | 30 ++++++- >> >> >> controller/ovn-controller.c | 49 ++++++++++++ >> >> >> controller/physical.c | 134 ++++++++++++++++++++++---------- >> >> >> controller/physical.h | 2 + >> >> >> include/ovn/logical-fields.h | 4 +- >> >> >> ovn-architecture.7.xml | 18 ++++- >> >> >> tests/ovn.at | 51 +++++++++++- >> >> >> 11 files changed, 243 insertions(+), 54 deletions(-) >> >> >> >> >> >> diff --git a/NEWS b/NEWS >> >> >> index 5f267b4c64cc..5a3eed608617 100644 >> >> >> --- a/NEWS >> >> >> +++ b/NEWS >> >> >> @@ -14,6 +14,9 @@ Post v23.09.0 >> >> >> - ovn-northd-ddlog has been removed. 
>> >> >> - A new LSP option "enable_router_port_acl" has been added to enable >> >> >> conntrack for the router port whose peer is l3dgw_port if set it true. >> >> >> + - Support selecting encapsulation IP based on the source/destination VIF's >> >> >> + settting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for more >> >> >> + details. >> >> >> >> >> >> OVN v23.09.0 - 15 Sep 2023 >> >> >> -------------------------- >> >> >> diff --git a/controller/chassis.c b/controller/chassis.c >> >> >> index a6f13ccc42d5..55f2beb37674 100644 >> >> >> --- a/controller/chassis.c >> >> >> +++ b/controller/chassis.c >> >> >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg { >> >> >> >> >> >> /* Set of encap types parsed from the 'ovn-encap-type' external-id. */ >> >> >> struct sset encap_type_set; >> >> >> - /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. */ >> >> >> + /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */ >> >> >> struct sset encap_ip_set; >> >> >> /* Interface type list formatted in the OVN-SB Chassis required format. */ >> >> >> struct ds iface_types; >> >> >> diff --git a/controller/local_data.c b/controller/local_data.c >> >> >> index a9092783958f..8606414f8728 100644 >> >> >> --- a/controller/local_data.c >> >> >> +++ b/controller/local_data.c >> >> >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap *chassis_tunnels) >> >> >> */ >> >> >> struct chassis_tunnel * >> >> >> chassis_tunnel_find(const struct hmap *chassis_tunnels, const char *chassis_id, >> >> >> - char *remote_encap_ip, char *local_encap_ip) >> >> >> + char *remote_encap_ip, const char *local_encap_ip) >> >> > >> >> > >> >> > nit: Unrelated change. >> >> >> >> >> >> Ack >> >> Hi Ales, sorry that I just realized this change is related, which is because of the const char * array introduced in this patch that stores the parsed encap_ips, and it makes sense to use const char * since we should never expect this string to be modified in the function. 
>> >> >> >> >> > >> >> > >> >> >> { >> >> >> /* >> >> >> * If the specific encap_ip is given, look for the chassisid_ip entry, >> >> >> diff --git a/controller/local_data.h b/controller/local_data.h >> >> >> index bab95bcc3824..ca3905bd20e6 100644 >> >> >> --- a/controller/local_data.h >> >> >> +++ b/controller/local_data.h >> >> >> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes( >> >> >> struct chassis_tunnel *chassis_tunnel_find(const struct hmap *chassis_tunnels, >> >> >> const char *chassis_id, >> >> >> char *remote_encap_ip, >> >> >> - char *local_encap_ip); >> >> >> + const char *local_encap_ip); >> >> > >> >> > >> >> > Same as above. >> >> >> >> >> >> Ack >> >> >> >> > >> >> > >> >> >> >> >> >> bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels, >> >> >> const char *chassis_name, >> >> >> diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml >> >> >> index efa65e3fd927..5ebef048d721 100644 >> >> >> --- a/controller/ovn-controller.8.xml >> >> >> +++ b/controller/ovn-controller.8.xml >> >> >> @@ -176,10 +176,32 @@ >> >> >> >> >> >> <dt><code>external_ids:ovn-encap-ip</code></dt> >> >> >> <dd> >> >> >> - The IP address that a chassis should use to connect to this node >> >> >> - using encapsulation types specified by >> >> >> - <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs >> >> >> - may be specified with a comma-separated list. >> >> >> + <p> >> >> >> + The IP address that a chassis should use to connect to this node >> >> >> + using encapsulation types specified by >> >> >> + <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs >> >> >> + may be specified with a comma-separated list. >> >> >> + </p> >> >> >> + <p> >> >> >> + In scenarios where multiple encapsulation IPs are present, distinct >> >> >> + tunnels are established for each remote chassis. 
These tunnels are >> >> >> + differentiated by setting unique <code>options:local_ip</code> and >> >> >> + <code>options:remote_ip</code> values in the tunnel interface. When >> >> >> + transmitting a packet to a remote chassis, the selection of local_ip >> >> >> + is guided by the <code>Interface:external_ids:encap-ip</code> from >> >> >> + the local OVSDB, corresponding to the VIF originating the packet, if >> >> >> + specified. The <code>Interface:external_ids:encap-ip</code> setting >> >> >> + of the VIF is also populated to the <code>Port_Binding</code> >> >> >> + table in the OVN SB database via the <code>encap</code> column. >> >> >> + Consequently, when a remote chassis needs to send a packet to a >> >> >> + port-binding associated with this VIF, it utilizes the tunnel with >> >> >> + the appropriate <code>options:remote_ip</code> that matches the >> >> >> + <code>ip</code> in <code>Port_Binding:encap</code>. This mechanism >> >> >> + is particularly beneficial for chassis with multiple physical >> >> >> + interfaces designated for tunneling, where each interface is >> >> >> + optimized for handling specific traffic associated with particular >> >> >> + VIFs. 
>> >> >> + </p> >> >> >> </dd> >> >> >> >> >> >> <dt><code>external_ids:ovn-encap-df_default</code></dt> >> >> >> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c >> >> >> index 856e5e270822..4ab57fe970fe 100644 >> >> >> --- a/controller/ovn-controller.c >> >> >> +++ b/controller/ovn-controller.c >> >> >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output { >> >> >> struct physical_debug debug; >> >> >> }; >> >> >> >> >> >> +static void >> >> >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, >> >> >> + size_t *n_encap_ips, const char * **encap_ips) >> >> >> +{ >> >> >> + const struct ovsrec_open_vswitch *cfg = >> >> >> + ovsrec_open_vswitch_table_first(ovs_table); >> >> >> + const char *chassis_id = get_ovs_chassis_id(ovs_table); >> >> >> + const char *encap_ips_str = >> >> >> + get_chassis_external_id_value(&cfg->external_ids, chassis_id, >> >> >> + "ovn-encap-ip", NULL); >> >> >> + struct sset encap_ip_set; >> >> >> + sset_init(&encap_ip_set); >> >> >> + sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); >> >> >> + >> >> >> + /* Sort the ips so that their index is deterministic. */ >> >> >> + *encap_ips = sset_sort(&encap_ip_set); >> >> >> + >> >> >> + /* Copy the strings so that we can destroy the sset. */ >> >> >> + for (size_t i = 0; (*encap_ips)[i]; i++) { >> >> >> + (*encap_ips)[i] = xstrdup((*encap_ips)[i]); >> >> >> + } >> >> >> + *n_encap_ips = sset_count(&encap_ip_set); >> >> >> + sset_destroy(&encap_ip_set); >> >> >> +} >> >> >> + >> >> > >> >> > >> >> > I wonder, could we store this array or maybe preferably svec in "en_non_vif_data" I-P node? This way it would be handled in the node and we could avoid the destroy calls after any pflow processing WDYT? >> >> >> >> >> >> Yes we could do the same in en_non_vif_data, but I think it is not really necessary and it seems more straightforward just parsing it here, because: >> >> 1. 
We don't need I-P for this ovn-encap-ip configuration change, so we don't have to persist this data. >> >> 2. It should be very rare to have more than 5 (or even 3) encap IPs per node, so the list should be very small and the cost of this parsing (and sset construct/destroy) is negligible. >> >> Using svec is also a valid option, but the sset (and array) is used here just to reuse the sset_from_delimited_string and sset_sort for convenience. I noticed that the sset_init() call can in fact be removed because sset_from_delimited_string already does that. I can remove it. >> >> Does this sound reasonable to you? >> > >> > >> > It makes sense, the main thing it would help with is the need to destroy the ctx in kinda unexpected places which might be slightly error prone. However, it being functionally the same it's fine either way. >> > >> >> Thanks Ales. So I will add the below small change. Would you give your Ack if it looks good to you? >> ----------------------- >> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c >> index 4ab57fe970fe..6873c02288c6 100644 >> --- a/controller/ovn-controller.c >> +++ b/controller/ovn-controller.c >> @@ -4532,7 +4532,6 @@ parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, >> get_chassis_external_id_value(&cfg->external_ids, chassis_id, >> "ovn-encap-ip", NULL); >> struct sset encap_ip_set; >> - sset_init(&encap_ip_set); >> sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); >> >> /* Sort the ips so that their index is deterministic. */ >> @@ -4615,8 +4614,6 @@ static void init_physical_ctx(struct engine_node *node, >> p_ctx->patch_ofports = &non_vif_data->patch_ofports; >> p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; >> >> - >> - >> struct controller_engine_ctx *ctrl_ctx = engine_get_context()->client_ctx; >> p_ctx->if_mgr = ctrl_ctx->if_mgr; >> ------------------------- >> >> Thanks, >> Han > > > > Yeah the diff looks ok. 
> > Acked-by: Ales Musil <amusil@redhat.com> > > Thanks, > Ales > Thanks Ales, I applied the series with the above diff. Han >> >> >> > Thanks, >> > Ales >> > >> >> >> >> >> >> Thanks, >> >> Han >> >> >> >> > >> >> >> >> >> >> static void init_physical_ctx(struct engine_node *node, >> >> >> struct ed_type_runtime_data *rt_data, >> >> >> struct ed_type_non_vif_data *non_vif_data, >> >> >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node *node, >> >> >> engine_get_input_data("ct_zones", node); >> >> >> struct simap *ct_zones = &ct_zones_data->current; >> >> >> >> >> >> + parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips); >> >> >> p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; >> >> >> p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath; >> >> >> p_ctx->port_binding_table = port_binding_table; >> >> >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct engine_node *node, >> >> >> p_ctx->patch_ofports = &non_vif_data->patch_ofports; >> >> >> p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; >> >> >> >> >> >> + >> >> >> + >> >> >> >> >> > >> >> > nit: Unrelated change. 
>> >> >> >> >> >> Ack >> >> > >> >> > >> >> >> >> >> >> struct controller_engine_ctx *ctrl_ctx = engine_get_context()->client_ctx; >> >> >> p_ctx->if_mgr = ctrl_ctx->if_mgr; >> >> >> >> >> >> pflow_output_get_debug(node, &p_ctx->debug); >> >> >> } >> >> >> >> >> >> +static void >> >> >> +destroy_physical_ctx(struct physical_ctx *p_ctx) >> >> >> +{ >> >> >> + for (size_t i = 0; i < p_ctx->n_encap_ips; i++) { >> >> >> + free((char *)(p_ctx->encap_ips[i])); >> >> >> + } >> >> >> + free(p_ctx->encap_ips); >> >> >> +} >> >> >> + >> >> >> static void * >> >> >> en_pflow_output_init(struct engine_node *node OVS_UNUSED, >> >> >> struct engine_arg *arg OVS_UNUSED) >> >> >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, void *data) >> >> >> struct physical_ctx p_ctx; >> >> >> init_physical_ctx(node, rt_data, non_vif_data, &p_ctx); >> >> >> physical_run(&p_ctx, pflow_table); >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> >> } >> >> >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct engine_node *node, >> >> >> bool removed = sbrec_port_binding_is_deleted(binding); >> >> >> if (!physical_handle_flows_for_lport(binding, removed, &p_ctx, >> >> >> &pfo->flow_table)) { >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return false; >> >> >> } >> >> >> } >> >> >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct engine_node *node, >> >> >> bool removed = sbrec_port_binding_is_deleted(pb); >> >> >> if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, >> >> >> &pfo->flow_table)) { >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return false; >> >> >> } >> >> >> } >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> >> } >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return true; >> >> >> } >> >> >> >> >> >> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct engine_node *node, >> >> >> bool removed = sbrec_port_binding_is_deleted(pb); >> >> >> if 
(!physical_handle_flows_for_lport(pb, removed, &p_ctx, >> >> >> &pfo->flow_table)) { >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return false; >> >> >> } >> >> >> } >> >> >> >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return true; >> >> >> } >> >> >> >> >> >> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct engine_node *node, void *data) >> >> >> physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table); >> >> >> >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return true; >> >> >> } >> >> >> >> >> >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct engine_node *node, void *data) >> >> >> if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) { >> >> >> /* Fall back to full recompute when a local datapath >> >> >> * is added or deleted. */ >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return false; >> >> >> } >> >> >> >> >> >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct engine_node *node, void *data) >> >> >> lport->tracked_type == TRACKED_RESOURCE_REMOVED ? 
true: false; >> >> >> if (!physical_handle_flows_for_lport(lport->pb, removed, &p_ctx, >> >> >> &pfo->flow_table)) { >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return false; >> >> >> } >> >> >> } >> >> >> } >> >> >> >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return true; >> >> >> } >> >> >> >> >> >> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct engine_node *node, void *data) >> >> >> if (pb) { >> >> >> if (!physical_handle_flows_for_lport(pb, false, &p_ctx, >> >> >> &pfo->flow_table)) { >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return false; >> >> >> } >> >> >> tag_port_as_activated_in_engine(pp); >> >> >> } >> >> >> } >> >> >> engine_set_node_state(node, EN_UPDATED); >> >> >> + destroy_physical_ctx(&p_ctx); >> >> >> return true; >> >> >> } >> >> >> >> >> >> diff --git a/controller/physical.c b/controller/physical.c >> >> >> index e93bfbd2cffb..c192aed751d5 100644 >> >> >> --- a/controller/physical.c >> >> >> +++ b/controller/physical.c >> >> >> @@ -71,6 +71,8 @@ struct tunnel { >> >> >> static void >> >> >> load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> >> >> const struct zone_ids *zone_ids, >> >> >> + size_t n_encap_ips, >> >> >> + const char **encap_ips, >> >> >> struct ofpbuf *ofpacts_p); >> >> >> static int64_t get_vxlan_port_key(int64_t port_key); >> >> >> >> >> >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf *ofpacts) >> >> >> */ >> >> >> static struct chassis_tunnel * >> >> >> get_port_binding_tun(const struct sbrec_encap *remote_encap, >> >> >> - const struct sbrec_encap *local_encap, >> >> >> const struct sbrec_chassis *chassis, >> >> >> - const struct hmap *chassis_tunnels) >> >> >> + const struct hmap *chassis_tunnels, >> >> >> + const char *local_encap_ip) >> >> >> { >> >> >> struct chassis_tunnel *tun = NULL; >> >> >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, >> >> >> remote_encap ? 
remote_encap->ip : NULL, >> >> >> - local_encap ? local_encap->ip : NULL); >> >> >> + local_encap_ip); >> >> >> >> >> >> if (!tun) { >> >> >> tun = chassis_tunnel_find(chassis_tunnels, chassis->name, NULL, NULL); >> >> >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct sbrec_port_binding *pb, >> >> >> static struct ovs_list * >> >> >> get_remote_tunnels(const struct sbrec_port_binding *binding, >> >> >> const struct sbrec_chassis *chassis, >> >> >> - const struct hmap *chassis_tunnels) >> >> >> + const struct hmap *chassis_tunnels, >> >> >> + const char *local_encap_ip) >> >> >> { >> >> >> const struct chassis_tunnel *tun; >> >> >> >> >> >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, >> >> >> ovs_list_init(tunnels); >> >> >> >> >> >> if (binding->chassis && binding->chassis != chassis) { >> >> >> - tun = get_port_binding_tun(binding->encap, NULL, binding->chassis, >> >> >> - chassis_tunnels); >> >> >> + tun = get_port_binding_tun(binding->encap, binding->chassis, >> >> >> + chassis_tunnels, local_encap_ip); >> >> >> if (!tun) { >> >> >> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); >> >> >> VLOG_WARN_RL( >> >> >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, >> >> >> } >> >> >> const struct sbrec_encap *additional_encap; >> >> >> additional_encap = find_additional_encap_for_chassis(binding, chassis); >> >> >> - tun = get_port_binding_tun(additional_encap, NULL, >> >> >> + tun = get_port_binding_tun(additional_encap, >> >> >> binding->additional_chassis[i], >> >> >> - chassis_tunnels); >> >> >> + chassis_tunnels, local_encap_ip); >> >> >> if (!tun) { >> >> >> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); >> >> >> VLOG_WARN_RL( >> >> >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct sbrec_port_binding *binding, >> >> >> struct ofpbuf *ofpacts_p, >> >> >> const struct sbrec_chassis *chassis, >> >> >> const struct hmap 
*chassis_tunnels, >> >> >> + size_t n_encap_ips, >> >> >> + const char **encap_ips, >> >> >> struct ovn_desired_flow_table *flow_table) >> >> >> { >> >> >> /* Setup encapsulation */ >> >> >> - struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> >> >> - chassis_tunnels); >> >> >> - if (!ovs_list_is_empty(tuns)) { >> >> >> - bool is_vtep_port = !strcmp(binding->type, "vtep"); >> >> >> - /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND >> >> >> - * responder in L3 networks. */ >> >> >> - if (is_vtep_port) { >> >> >> - put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_p); >> >> >> - } >> >> >> + for (size_t i = 0; i < n_encap_ips; i++) { >> >> >> + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); >> >> >> >> >> >> + >> >> >> + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16, >> >> >> + (uint32_t) 0xFFFF << 16); >> >> >> + struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> >> >> + chassis_tunnels, >> >> >> + encap_ips[i]); >> >> >> + if (!ovs_list_is_empty(tuns)) { >> >> >> + bool is_vtep_port = !strcmp(binding->type, "vtep"); >> >> >> + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND >> >> >> + * responder in L3 networks. 
*/ >> >> >> + if (is_vtep_port) { >> >> >> + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, >> >> >> + ofpacts_clone); >> >> >> + } >> >> >> >> >> >> - struct tunnel *tun; >> >> >> - LIST_FOR_EACH (tun, list_node, tuns) { >> >> >> - put_encapsulation(mff_ovn_geneve, tun->tun, >> >> >> - binding->datapath, port_key, is_vtep_port, >> >> >> - ofpacts_p); >> >> >> - ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport; >> >> >> + struct tunnel *tun; >> >> >> + LIST_FOR_EACH (tun, list_node, tuns) { >> >> >> + put_encapsulation(mff_ovn_geneve, tun->tun, >> >> >> + binding->datapath, port_key, is_vtep_port, >> >> >> + ofpacts_clone); >> >> >> + ofpact_put_OUTPUT(ofpacts_clone)->port = tun->tun->ofport; >> >> >> + } >> >> >> + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); >> >> >> + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, >> >> >> + binding->header_.uuid.parts[0], match, >> >> >> + ofpacts_clone, &binding->header_.uuid); >> >> >> } >> >> >> - put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p); >> >> >> - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, >> >> >> - binding->header_.uuid.parts[0], match, ofpacts_p, >> >> >> - &binding->header_.uuid); >> >> >> - } >> >> >> - struct tunnel *tun_elem; >> >> >> - LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { >> >> >> - free(tun_elem); >> >> >> + struct tunnel *tun_elem; >> >> >> + LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { >> >> >> + free(tun_elem); >> >> >> + } >> >> >> + free(tuns); >> >> >> + >> >> >> + ofpbuf_delete(ofpacts_clone); >> >> >> } >> >> >> - free(tuns); >> >> >> } >> >> >> >> >> >> static void >> >> >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap *ct_zones, >> >> >> if (tag) { >> >> >> ofpact_put_STRIP_VLAN(ofpacts_p); >> >> >> } >> >> >> - load_logical_ingress_metadata(localnet_port, &zone_ids, ofpacts_p); >> >> >> + load_logical_ingress_metadata(localnet_port, &zone_ids, 0, NULL, >> >> >> + ofpacts_p); >> >> >> replace_mac = 
ofpact_put_SET_ETH_SRC(ofpacts_p); >> >> >> replace_mac->mac = router_port_mac; >> >> >> >> >> >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids, struct ofpbuf *ofpacts_p) >> >> >> { >> >> >> if (zone_ids) { >> >> >> if (zone_ids->ct) { >> >> >> - put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); >> >> >> + put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); >> >> >> } >> >> >> if (zone_ids->dnat) { >> >> >> put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); >> >> >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key, >> >> >> } >> >> >> } >> >> >> >> >> >> +static size_t >> >> >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips, >> >> >> + const char *ip) >> >> >> +{ >> >> >> + for (size_t i = 0; i < n_encap_ips; i++) { >> >> >> + if (!strcmp(ip, encap_ips[i])) { >> >> >> + return i; >> >> >> + } >> >> >> + } >> >> >> + return 0; >> >> >> +} >> >> >> + >> >> >> static void >> >> >> load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> >> >> const struct zone_ids *zone_ids, >> >> >> + size_t n_encap_ips, >> >> >> + const char **encap_ips, >> >> >> struct ofpbuf *ofpacts_p) >> >> >> { >> >> >> put_zones_ofpacts(zone_ids, ofpacts_p); >> >> >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct sbrec_port_binding *binding, >> >> >> uint32_t port_key = binding->tunnel_key; >> >> >> put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p); >> >> >> put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p); >> >> >> + >> >> >> + /* Default encap_id 0. 
*/ >> >> >> + size_t encap_id = 0; >> >> >> + if (encap_ips && binding->encap) { >> >> >> + encap_id = encap_ip_to_id(n_encap_ips, encap_ips, >> >> >> + binding->encap->ip); >> >> >> + } >> >> >> + put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p); >> >> >> } >> >> >> >> >> >> static const struct sbrec_port_binding * >> >> >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct sbrec_port_binding *binding, >> >> >> match_set_dl_type(&match, htons(ETH_TYPE_RARP)); >> >> >> match_set_in_port(&match, ofport); >> >> >> >> >> >> - load_logical_ingress_metadata(binding, zone_ids, &ofpacts); >> >> >> + load_logical_ingress_metadata(binding, zone_ids, 0, NULL, &ofpacts); >> >> >> >> >> >> encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP, >> >> >> NX_CTLR_NO_METER, &ofpacts); >> >> >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports( >> >> >> } >> >> >> >> >> >> struct ovs_list *tuns = get_remote_tunnels(binding, chassis, >> >> >> - chassis_tunnels); >> >> >> + chassis_tunnels, NULL); >> >> >> if (ovs_list_is_empty(tuns)) { >> >> >> free(tuns); >> >> >> return; >> >> >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> >> const struct sbrec_chassis *chassis, >> >> >> const struct physical_debug *debug, >> >> >> const struct if_status_mgr *if_mgr, >> >> >> + size_t n_encap_ips, >> >> >> + const char **encap_ips, >> >> >> struct ovn_desired_flow_table *flow_table, >> >> >> struct ofpbuf *ofpacts_p) >> >> >> { >> >> >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> >> ofpact_put_CT_CLEAR(ofpacts_p); >> >> >> put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); >> >> >> put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p); >> >> >> - put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); >> >> >> + put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); >> >> >> struct zone_ids peer_zones = get_zone_ids(peer, ct_zones); >> >> >> - 
load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p); >> >> >> + load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips, >> >> >> + encap_ips, ofpacts_p); >> >> >> put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p); >> >> >> put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p); >> >> >> for (int i = 0; i < MFF_N_LOG_REGS; i++) { >> >> >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> >> * as we're going to remove this with ofpbuf_pull() later. */ >> >> >> uint32_t ofpacts_orig_size = ofpacts_p->size; >> >> >> >> >> >> - load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p); >> >> >> + load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips, >> >> >> + encap_ips, ofpacts_p); >> >> >> >> >> >> if (!strcmp(binding->type, "localport")) { >> >> >> /* mark the packet as incoming from a localport */ >> >> >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> >> } else { >> >> >> put_remote_port_redirect_overlay( >> >> >> binding, mff_ovn_geneve, port_key, &match, ofpacts_p, >> >> >> - chassis, chassis_tunnels, flow_table); >> >> >> + chassis, chassis_tunnels, n_encap_ips, encap_ips, flow_table); >> >> >> } >> >> >> out: >> >> >> if (ha_ch_ordered) { >> >> >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, >> >> >> >> >> >> int zone_id = simap_get(ct_zones, port->logical_port); >> >> >> if (zone_id) { >> >> >> - put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts); >> >> >> + put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts); >> >> >> } >> >> >> >> >> >> const char *lport_name = (port->parent_port && *port->parent_port) ? 
>> >> >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx *p_ctx, >> >> >> p_ctx->patch_ofports, >> >> >> p_ctx->chassis_tunnels, >> >> >> pb, p_ctx->chassis, &p_ctx->debug, >> >> >> - p_ctx->if_mgr, flow_table, &ofpacts); >> >> >> + p_ctx->if_mgr, >> >> >> + p_ctx->n_encap_ips, >> >> >> + p_ctx->encap_ips, >> >> >> + flow_table, &ofpacts); >> >> >> ofpbuf_uninit(&ofpacts); >> >> >> } >> >> >> >> >> >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx, >> >> >> p_ctx->patch_ofports, >> >> >> p_ctx->chassis_tunnels, binding, >> >> >> p_ctx->chassis, &p_ctx->debug, >> >> >> - p_ctx->if_mgr, flow_table, &ofpacts); >> >> >> + p_ctx->if_mgr, >> >> >> + p_ctx->n_encap_ips, >> >> >> + p_ctx->encap_ips, >> >> >> + flow_table, &ofpacts); >> >> >> } >> >> >> >> >> >> /* Handle output to multicast groups, in tables 40 and 41. */ >> >> >> diff --git a/controller/physical.h b/controller/physical.h >> >> >> index 1f1ed55efa16..7fe8ee3c18ed 100644 >> >> >> --- a/controller/physical.h >> >> >> +++ b/controller/physical.h >> >> >> @@ -66,6 +66,8 @@ struct physical_ctx { >> >> >> struct shash *local_bindings; >> >> >> struct simap *patch_ofports; >> >> >> struct hmap *chassis_tunnels; >> >> >> + size_t n_encap_ips; >> >> >> + const char **encap_ips; >> >> >> struct physical_debug debug; >> >> >> }; >> >> >> >> >> >> diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h >> >> >> index 272277ec4fa0..d3455a462a0d 100644 >> >> >> --- a/include/ovn/logical-fields.h >> >> >> +++ b/include/ovn/logical-fields.h >> >> >> @@ -37,7 +37,9 @@ enum ovn_controller_event { >> >> >> #define MFF_LOG_SNAT_ZONE MFF_REG12 /* conntrack snat zone for gateway router >> >> >> * (32 bits). */ >> >> >> #define MFF_LOG_CT_ZONE MFF_REG13 /* Logical conntrack zone for lports >> >> >> - * (32 bits). */ >> >> >> + * (0..15 of the 32 bits). */ >> >> >> +#define MFF_LOG_ENCAP_ID MFF_REG13 /* Encap ID for lports >> >> >> + * (16..31 of the 32 bits). 
*/ >> >> >> #define MFF_LOG_INPORT MFF_REG14 /* Logical input port (32 bits). */ >> >> >> #define MFF_LOG_OUTPORT MFF_REG15 /* Logical output port (32 bits). */ >> >> >> >> >> >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml >> >> >> index 96294fe2b795..bfd8680cedfc 100644 >> >> >> --- a/ovn-architecture.7.xml >> >> >> +++ b/ovn-architecture.7.xml >> >> >> @@ -1247,8 +1247,8 @@ >> >> >> chassis. This is initialized to 0 at the beginning of the logical >> >> >> <!-- Keep the following in sync with MFF_LOG_CT_ZONE in >> >> >> ovn/lib/logical-fields.h. --> >> >> >> - ingress pipeline. OVN stores this in Open vSwitch extension register >> >> >> - number 13. >> >> >> + ingress pipeline. OVN stores this in the lower 16 bits of the Open >> >> >> + vSwitch extension register number 13. >> >> >> </dd> >> >> >> >> >> >> <dt>conntrack zone fields for routers</dt> >> >> >> @@ -1263,6 +1263,20 @@ >> >> >> traffic (for SNATing) in Open vSwitch extension register number 12. >> >> >> </dd> >> >> >> >> >> >> + <dt>Encap ID for logical ports</dt> >> >> >> + <dd> >> >> >> + A field that records an ID that indicates which encapsulation IP should >> >> >> + be used when sending packets to a remote chassis, according to the >> >> >> + original input logical port. This is useful when there are multiple IPs >> >> >> + available for encapsulation. The value only has local significance and is >> >> >> + not meaningful between chassis. This is initialized to 0 at the beginning >> >> >> + of the logical >> >> >> + <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in >> >> >> + ovn/lib/logical-fields.h. --> >> >> >> + ingress pipeline. OVN stores this in the higher 16 bits of the Open >> >> >> + vSwitch extension register number 13. 
>> >> >> + </dd> >> >> >> + >> >> >> <dt>logical flow flags</dt> >> >> >> <dd> >> >> >> The logical flags are intended to handle keeping context between >> >> >> diff --git a/tests/ovn.at b/tests/ovn.at >> >> >> index 243fe0b8246c..e7f0c9681f60 100644 >> >> >> --- a/tests/ovn.at >> >> >> +++ b/tests/ovn.at >> >> >> @@ -30335,19 +30335,33 @@ AT_CLEANUP >> >> >> >> >> >> >> >> >> OVN_FOR_EACH_NORTHD([ >> >> >> -AT_SETUP([multiple encap ips tunnel creation]) >> >> >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip]) >> >> >> +AT_SKIP_IF([test $HAVE_SCAPY = no]) >> >> >> ovn_start >> >> >> net_add n1 >> >> >> >> >> >> +ovn-nbctl ls-add ls1 >> >> >> + >> >> >> # 2 HVs, each with 2 encap-ips. >> >> >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY. >> >> >> for i in 1 2; do >> >> >> sim_add hv$i >> >> >> as hv$i >> >> >> ovs-vsctl add-br br-phys-$j >> >> >> ovn_attach n1 br-phys-$j 192.168.0.${i}1 >> >> >> ovs-vsctl set open . external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 >> >> >> + >> >> >> + for j in 1 2; do >> >> >> + ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ >> >> >> + external_ids:iface-id=lsp$i$j \ >> >> >> + external_ids:encap-ip=192.168.0.$i$j \ >> >> >> + options:tx_pcap=hv$i/vif$i$j-tx.pcap options:rxq_pcap=hv$i/vif$i$j-rx.pcap >> >> >> + ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j "f0:00:00:00:00:$i$j 10.0.0.$i$j" >> >> >> + >> >> >> + done >> >> >> done >> >> >> >> >> >> +wait_for_ports_up >> >> >> check ovn-nbctl --wait=hv sync >> >> >> >> >> >> check_tunnel_port() { >> >> >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.21 >> >> >> check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22 >> >> >> check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22 >> >> >> >> >> >> +vif_to_hv() { >> >> >> + case $1 in dnl ( >> >> >> + vif[[1]]?) echo hv1 ;; dnl ( >> >> >> + vif[[2]]?) 
echo hv2 ;; dnl ( >> >> >> + *) AT_FAIL_IF([:]) ;; >> >> >> + esac >> >> >> +} >> >> >> + >> >> >> +# check_packet_tunnel SRC_VIF DST_VIF >> >> >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the corresponding >> >> >> +# tunnel that matches the VIFs' encap_ip configurations. >> >> >> +check_packet_tunnel() { >> >> >> + local src=$1 dst=$2 >> >> >> + local src_mac=f0:00:00:00:00:$src >> >> >> + local dst_mac=f0:00:00:00:00:$dst >> >> >> + local src_ip=10.0.0.$src >> >> >> + local dst_ip=10.0.0.$dst >> >> >> + local local_encap_ip=192.168.0.$src >> >> >> + local remote_encap_ip=192.168.0.$dst >> >> >> + local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ >> >> >> + IP(dst='${dst_ip}', src='${src_ip}')/ \ >> >> >> + ICMP(type=8)") >> >> >> + hv=`vif_to_hv vif$src` >> >> >> + as $hv >> >> >> + echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> $remote_encap_ip" >> >> >> + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) >> >> >> + AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) >> >> >> +} >> >> >> + >> >> >> +for i in 1 2; do >> >> >> + for j in 1 2; do >> >> >> + check_packet_tunnel 1$i 2$j >> >> >> + done >> >> >> +done >> >> >> + >> >> >> OVN_CLEANUP([hv1],[hv2]) >> >> >> AT_CLEANUP >> >> >> ]) >> >> >> -- >> >> >> 2.38.1 >> >> >> >> >> >> _______________________________________________ >> >> >> dev mailing list >> >> >> dev@openvswitch.org >> >> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >> >> >> >> >> > >> >> > Thanks, >> >> > Ales >> >> > >> >> > -- >> >> > >> >> > Ales Musil >> >> > >> >> > Senior Software Engineer - OVN Core >> >> > >> >> > Red Hat EMEA >> >> > >> >> > amusil@redhat.com >> > >> > >> > >> > -- >> > >> > Ales Musil >> > >> > Senior Software Engineer - OVN Core >> > >> > Red Hat EMEA >> > >> > 
amusil@redhat.com > > > > -- > > Ales Musil > > Senior Software Engineer - OVN Core > > Red Hat EMEA > > amusil@redhat.com
diff --git a/NEWS b/NEWS index 5f267b4c64cc..5a3eed608617 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,9 @@ Post v23.09.0 - ovn-northd-ddlog has been removed. - A new LSP option "enable_router_port_acl" has been added to enable conntrack for the router port whose peer is l3dgw_port if set it true. + - Support selecting encapsulation IP based on the source/destination VIF's + setting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for more + details. OVN v23.09.0 - 15 Sep 2023 -------------------------- diff --git a/controller/chassis.c b/controller/chassis.c index a6f13ccc42d5..55f2beb37674 100644 --- a/controller/chassis.c +++ b/controller/chassis.c @@ -61,7 +61,7 @@ struct ovs_chassis_cfg { /* Set of encap types parsed from the 'ovn-encap-type' external-id. */ struct sset encap_type_set; - /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. */ + /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */ struct sset encap_ip_set; /* Interface type list formatted in the OVN-SB Chassis required format. 
*/ struct ds iface_types; diff --git a/controller/local_data.c b/controller/local_data.c index a9092783958f..8606414f8728 100644 --- a/controller/local_data.c +++ b/controller/local_data.c @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap *chassis_tunnels) */ struct chassis_tunnel * chassis_tunnel_find(const struct hmap *chassis_tunnels, const char *chassis_id, - char *remote_encap_ip, char *local_encap_ip) + char *remote_encap_ip, const char *local_encap_ip) { /* * If the specific encap_ip is given, look for the chassisid_ip entry, diff --git a/controller/local_data.h b/controller/local_data.h index bab95bcc3824..ca3905bd20e6 100644 --- a/controller/local_data.h +++ b/controller/local_data.h @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes( struct chassis_tunnel *chassis_tunnel_find(const struct hmap *chassis_tunnels, const char *chassis_id, char *remote_encap_ip, - char *local_encap_ip); + const char *local_encap_ip); bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels, const char *chassis_name, diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml index efa65e3fd927..5ebef048d721 100644 --- a/controller/ovn-controller.8.xml +++ b/controller/ovn-controller.8.xml @@ -176,10 +176,32 @@ <dt><code>external_ids:ovn-encap-ip</code></dt> <dd> - The IP address that a chassis should use to connect to this node - using encapsulation types specified by - <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs - may be specified with a comma-separated list. + <p> + The IP address that a chassis should use to connect to this node + using encapsulation types specified by + <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs + may be specified with a comma-separated list. + </p> + <p> + In scenarios where multiple encapsulation IPs are present, distinct + tunnels are established for each remote chassis. 
These tunnels are + differentiated by setting unique <code>options:local_ip</code> and + <code>options:remote_ip</code> values in the tunnel interface. When + transmitting a packet to a remote chassis, the selection of local_ip + is guided by the <code>Interface:external_ids:encap-ip</code> from + the local OVSDB, corresponding to the VIF originating the packet, if + specified. The <code>Interface:external_ids:encap-ip</code> setting + of the VIF is also populated to the <code>Port_Binding</code> + table in the OVN SB database via the <code>encap</code> column. + Consequently, when a remote chassis needs to send a packet to a + port-binding associated with this VIF, it utilizes the tunnel with + the appropriate <code>options:remote_ip</code> that matches the + <code>ip</code> in <code>Port_Binding:encap</code>. This mechanism + is particularly beneficial for chassis with multiple physical + interfaces designated for tunneling, where each interface is + optimized for handling specific traffic associated with particular + VIFs. + </p> </dd> <dt><code>external_ids:ovn-encap-df_default</code></dt> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index 856e5e270822..4ab57fe970fe 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output { struct physical_debug debug; }; +static void +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table, + size_t *n_encap_ips, const char * **encap_ips) +{ + const struct ovsrec_open_vswitch *cfg = + ovsrec_open_vswitch_table_first(ovs_table); + const char *chassis_id = get_ovs_chassis_id(ovs_table); + const char *encap_ips_str = + get_chassis_external_id_value(&cfg->external_ids, chassis_id, + "ovn-encap-ip", NULL); + struct sset encap_ip_set; + sset_init(&encap_ip_set); + sset_from_delimited_string(&encap_ip_set, encap_ips_str, ","); + + /* Sort the ips so that their index is deterministic. 
*/ + *encap_ips = sset_sort(&encap_ip_set); + + /* Copy the strings so that we can destroy the sset. */ + for (size_t i = 0; (*encap_ips)[i]; i++) { + (*encap_ips)[i] = xstrdup((*encap_ips)[i]); + } + *n_encap_ips = sset_count(&encap_ip_set); + sset_destroy(&encap_ip_set); +} + static void init_physical_ctx(struct engine_node *node, struct ed_type_runtime_data *rt_data, struct ed_type_non_vif_data *non_vif_data, @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node *node, engine_get_input_data("ct_zones", node); struct simap *ct_zones = &ct_zones_data->current; + parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips); p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath; p_ctx->port_binding_table = port_binding_table; @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct engine_node *node, p_ctx->patch_ofports = &non_vif_data->patch_ofports; p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; + + struct controller_engine_ctx *ctrl_ctx = engine_get_context()->client_ctx; p_ctx->if_mgr = ctrl_ctx->if_mgr; pflow_output_get_debug(node, &p_ctx->debug); } +static void +destroy_physical_ctx(struct physical_ctx *p_ctx) +{ + for (size_t i = 0; i < p_ctx->n_encap_ips; i++) { + free((char *)(p_ctx->encap_ips[i])); + } + free(p_ctx->encap_ips); +} + static void * en_pflow_output_init(struct engine_node *node OVS_UNUSED, struct engine_arg *arg OVS_UNUSED) @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, void *data) struct physical_ctx p_ctx; init_physical_ctx(node, rt_data, non_vif_data, &p_ctx); physical_run(&p_ctx, pflow_table); + destroy_physical_ctx(&p_ctx); engine_set_node_state(node, EN_UPDATED); } @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct engine_node *node, bool removed = sbrec_port_binding_is_deleted(binding); if (!physical_handle_flows_for_lport(binding, removed, &p_ctx, &pfo->flow_table)) { + 
destroy_physical_ctx(&p_ctx); return false; } } @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct engine_node *node, bool removed = sbrec_port_binding_is_deleted(pb); if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, &pfo->flow_table)) { + destroy_physical_ctx(&p_ctx); return false; } } engine_set_node_state(node, EN_UPDATED); } + destroy_physical_ctx(&p_ctx); return true; } @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct engine_node *node, bool removed = sbrec_port_binding_is_deleted(pb); if (!physical_handle_flows_for_lport(pb, removed, &p_ctx, &pfo->flow_table)) { + destroy_physical_ctx(&p_ctx); return false; } } engine_set_node_state(node, EN_UPDATED); + destroy_physical_ctx(&p_ctx); return true; } @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct engine_node *node, void *data) physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table); engine_set_node_state(node, EN_UPDATED); + destroy_physical_ctx(&p_ctx); return true; } @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct engine_node *node, void *data) if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) { /* Fall back to full recompute when a local datapath * is added or deleted. */ + destroy_physical_ctx(&p_ctx); return false; } @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct engine_node *node, void *data) lport->tracked_type == TRACKED_RESOURCE_REMOVED ? 
true: false; if (!physical_handle_flows_for_lport(lport->pb, removed, &p_ctx, &pfo->flow_table)) { + destroy_physical_ctx(&p_ctx); return false; } } } engine_set_node_state(node, EN_UPDATED); + destroy_physical_ctx(&p_ctx); return true; } @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct engine_node *node, void *data) if (pb) { if (!physical_handle_flows_for_lport(pb, false, &p_ctx, &pfo->flow_table)) { + destroy_physical_ctx(&p_ctx); return false; } tag_port_as_activated_in_engine(pp); } } engine_set_node_state(node, EN_UPDATED); + destroy_physical_ctx(&p_ctx); return true; } diff --git a/controller/physical.c b/controller/physical.c index e93bfbd2cffb..c192aed751d5 100644 --- a/controller/physical.c +++ b/controller/physical.c @@ -71,6 +71,8 @@ struct tunnel { static void load_logical_ingress_metadata(const struct sbrec_port_binding *binding, const struct zone_ids *zone_ids, + size_t n_encap_ips, + const char **encap_ips, struct ofpbuf *ofpacts_p); static int64_t get_vxlan_port_key(int64_t port_key); @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf *ofpacts) */ static struct chassis_tunnel * get_port_binding_tun(const struct sbrec_encap *remote_encap, - const struct sbrec_encap *local_encap, const struct sbrec_chassis *chassis, - const struct hmap *chassis_tunnels) + const struct hmap *chassis_tunnels, + const char *local_encap_ip) { struct chassis_tunnel *tun = NULL; tun = chassis_tunnel_find(chassis_tunnels, chassis->name, remote_encap ? remote_encap->ip : NULL, - local_encap ? 
local_encap->ip : NULL); + local_encap_ip); if (!tun) { tun = chassis_tunnel_find(chassis_tunnels, chassis->name, NULL, NULL); @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct sbrec_port_binding *pb, static struct ovs_list * get_remote_tunnels(const struct sbrec_port_binding *binding, const struct sbrec_chassis *chassis, - const struct hmap *chassis_tunnels) + const struct hmap *chassis_tunnels, + const char *local_encap_ip) { const struct chassis_tunnel *tun; @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, ovs_list_init(tunnels); if (binding->chassis && binding->chassis != chassis) { - tun = get_port_binding_tun(binding->encap, NULL, binding->chassis, - chassis_tunnels); + tun = get_port_binding_tun(binding->encap, binding->chassis, + chassis_tunnels, local_encap_ip); if (!tun) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); VLOG_WARN_RL( @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, } const struct sbrec_encap *additional_encap; additional_encap = find_additional_encap_for_chassis(binding, chassis); - tun = get_port_binding_tun(additional_encap, NULL, + tun = get_port_binding_tun(additional_encap, binding->additional_chassis[i], - chassis_tunnels); + chassis_tunnels, local_encap_ip); if (!tun) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); VLOG_WARN_RL( @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct sbrec_port_binding *binding, struct ofpbuf *ofpacts_p, const struct sbrec_chassis *chassis, const struct hmap *chassis_tunnels, + size_t n_encap_ips, + const char **encap_ips, struct ovn_desired_flow_table *flow_table) { /* Setup encapsulation */ - struct ovs_list *tuns = get_remote_tunnels(binding, chassis, - chassis_tunnels); - if (!ovs_list_is_empty(tuns)) { - bool is_vtep_port = !strcmp(binding->type, "vtep"); - /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND - * responder in L3 networks. 
*/ - if (is_vtep_port) { - put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_p); - } + for (size_t i = 0; i < n_encap_ips; i++) { + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); + + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16, + (uint32_t) 0xFFFF << 16); + struct ovs_list *tuns = get_remote_tunnels(binding, chassis, + chassis_tunnels, + encap_ips[i]); + if (!ovs_list_is_empty(tuns)) { + bool is_vtep_port = !strcmp(binding->type, "vtep"); + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND + * responder in L3 networks. */ + if (is_vtep_port) { + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, + ofpacts_clone); + } - struct tunnel *tun; - LIST_FOR_EACH (tun, list_node, tuns) { - put_encapsulation(mff_ovn_geneve, tun->tun, - binding->datapath, port_key, is_vtep_port, - ofpacts_p); - ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport; + struct tunnel *tun; + LIST_FOR_EACH (tun, list_node, tuns) { + put_encapsulation(mff_ovn_geneve, tun->tun, + binding->datapath, port_key, is_vtep_port, + ofpacts_clone); + ofpact_put_OUTPUT(ofpacts_clone)->port = tun->tun->ofport; + } + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, + binding->header_.uuid.parts[0], match, + ofpacts_clone, &binding->header_.uuid); } - put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p); - ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, - binding->header_.uuid.parts[0], match, ofpacts_p, - &binding->header_.uuid); - } - struct tunnel *tun_elem; - LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { - free(tun_elem); + struct tunnel *tun_elem; + LIST_FOR_EACH_POP (tun_elem, list_node, tuns) { + free(tun_elem); + } + free(tuns); + + ofpbuf_delete(ofpacts_clone); } - free(tuns); } static void @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap *ct_zones, if (tag) { ofpact_put_STRIP_VLAN(ofpacts_p); } - load_logical_ingress_metadata(localnet_port, &zone_ids, 
ofpacts_p); + load_logical_ingress_metadata(localnet_port, &zone_ids, 0, NULL, + ofpacts_p); replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p); replace_mac->mac = router_port_mac; @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids, struct ofpbuf *ofpacts_p) { if (zone_ids) { if (zone_ids->ct) { - put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); + put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); } if (zone_ids->dnat) { put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key, } } +static size_t +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips, + const char *ip) +{ + for (size_t i = 0; i < n_encap_ips; i++) { + if (!strcmp(ip, encap_ips[i])) { + return i; + } + } + return 0; +} + static void load_logical_ingress_metadata(const struct sbrec_port_binding *binding, const struct zone_ids *zone_ids, + size_t n_encap_ips, + const char **encap_ips, struct ofpbuf *ofpacts_p) { put_zones_ofpacts(zone_ids, ofpacts_p); @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct sbrec_port_binding *binding, uint32_t port_key = binding->tunnel_key; put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p); put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p); + + /* Default encap_id 0. 
*/ + size_t encap_id = 0; + if (encap_ips && binding->encap) { + encap_id = encap_ip_to_id(n_encap_ips, encap_ips, + binding->encap->ip); + } + put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p); } static const struct sbrec_port_binding * @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct sbrec_port_binding *binding, match_set_dl_type(&match, htons(ETH_TYPE_RARP)); match_set_in_port(&match, ofport); - load_logical_ingress_metadata(binding, zone_ids, &ofpacts); + load_logical_ingress_metadata(binding, zone_ids, 0, NULL, &ofpacts); encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP, NX_CTLR_NO_METER, &ofpacts); @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports( } struct ovs_list *tuns = get_remote_tunnels(binding, chassis, - chassis_tunnels); + chassis_tunnels, NULL); if (ovs_list_is_empty(tuns)) { free(tuns); return; @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, const struct sbrec_chassis *chassis, const struct physical_debug *debug, const struct if_status_mgr *if_mgr, + size_t n_encap_ips, + const char **encap_ips, struct ovn_desired_flow_table *flow_table, struct ofpbuf *ofpacts_p) { @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, ofpact_put_CT_CLEAR(ofpacts_p); put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p); put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p); - put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p); + put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p); struct zone_ids peer_zones = get_zone_ids(peer, ct_zones); - load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p); + load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips, + encap_ips, ofpacts_p); put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p); put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p); for (int i = 0; i < MFF_N_LOG_REGS; i++) { @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, * as we're going to remove this 
with ofpbuf_pull() later. */ uint32_t ofpacts_orig_size = ofpacts_p->size; - load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p); + load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips, + encap_ips, ofpacts_p); if (!strcmp(binding->type, "localport")) { /* mark the packet as incoming from a localport */ @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name, } else { put_remote_port_redirect_overlay( binding, mff_ovn_geneve, port_key, &match, ofpacts_p, - chassis, chassis_tunnels, flow_table); + chassis, chassis_tunnels, n_encap_ips, encap_ips, flow_table); } out: if (ha_ch_ordered) { @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, int zone_id = simap_get(ct_zones, port->logical_port); if (zone_id) { - put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts); + put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts); } const char *lport_name = (port->parent_port && *port->parent_port) ? @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx *p_ctx, p_ctx->patch_ofports, p_ctx->chassis_tunnels, pb, p_ctx->chassis, &p_ctx->debug, - p_ctx->if_mgr, flow_table, &ofpacts); + p_ctx->if_mgr, + p_ctx->n_encap_ips, + p_ctx->encap_ips, + flow_table, &ofpacts); ofpbuf_uninit(&ofpacts); } @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx, p_ctx->patch_ofports, p_ctx->chassis_tunnels, binding, p_ctx->chassis, &p_ctx->debug, - p_ctx->if_mgr, flow_table, &ofpacts); + p_ctx->if_mgr, + p_ctx->n_encap_ips, + p_ctx->encap_ips, + flow_table, &ofpacts); } /* Handle output to multicast groups, in tables 40 and 41. 
*/ diff --git a/controller/physical.h b/controller/physical.h index 1f1ed55efa16..7fe8ee3c18ed 100644 --- a/controller/physical.h +++ b/controller/physical.h @@ -66,6 +66,8 @@ struct physical_ctx { struct shash *local_bindings; struct simap *patch_ofports; struct hmap *chassis_tunnels; + size_t n_encap_ips; + const char **encap_ips; struct physical_debug debug; }; diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h index 272277ec4fa0..d3455a462a0d 100644 --- a/include/ovn/logical-fields.h +++ b/include/ovn/logical-fields.h @@ -37,7 +37,9 @@ enum ovn_controller_event { #define MFF_LOG_SNAT_ZONE MFF_REG12 /* conntrack snat zone for gateway router * (32 bits). */ #define MFF_LOG_CT_ZONE MFF_REG13 /* Logical conntrack zone for lports - * (32 bits). */ + * (0..15 of the 32 bits). */ +#define MFF_LOG_ENCAP_ID MFF_REG13 /* Encap ID for lports + * (16..31 of the 32 bits). */ #define MFF_LOG_INPORT MFF_REG14 /* Logical input port (32 bits). */ #define MFF_LOG_OUTPORT MFF_REG15 /* Logical output port (32 bits). */ diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml index 96294fe2b795..bfd8680cedfc 100644 --- a/ovn-architecture.7.xml +++ b/ovn-architecture.7.xml @@ -1247,8 +1247,8 @@ chassis. This is initialized to 0 at the beginning of the logical <!-- Keep the following in sync with MFF_LOG_CT_ZONE in ovn/lib/logical-fields.h. --> - ingress pipeline. OVN stores this in Open vSwitch extension register - number 13. + ingress pipeline. OVN stores this in the lower 16 bits of the Open + vSwitch extension register number 13. </dd> <dt>conntrack zone fields for routers</dt> @@ -1263,6 +1263,20 @@ traffic (for SNATing) in Open vSwitch extension register number 12. </dd> + <dt>Encap ID for logical ports</dt> + <dd> + A field that records an ID that indicates which encapsulation IP should + be used when sending packets to a remote chassis, according to the + original input logical port. 
This is useful when there are multiple IPs + available for encapsulation. The value only has local significance and is + not meaningful between chassis. This is initialized to 0 at the beginning + of the logical + <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in + ovn/lib/logical-fields.h. --> + ingress pipeline. OVN stores this in the higher 16 bits of the Open + vSwitch extension register number 13. + </dd> + <dt>logical flow flags</dt> <dd> The logical flags are intended to handle keeping context between diff --git a/tests/ovn.at b/tests/ovn.at index 243fe0b8246c..e7f0c9681f60 100644 --- a/tests/ovn.at +++ b/tests/ovn.at @@ -30335,19 +30335,33 @@ AT_CLEANUP OVN_FOR_EACH_NORTHD([ -AT_SETUP([multiple encap ips tunnel creation]) +AT_SETUP([multiple encap ips selection based on VIF's encap_ip]) +AT_SKIP_IF([test $HAVE_SCAPY = no]) ovn_start net_add n1 +ovn-nbctl ls-add ls1 + # 2 HVs, each with 2 encap-ips. +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY. for i in 1 2; do sim_add hv$i as hv$i ovs-vsctl add-br br-phys-$j ovn_attach n1 br-phys-$j 192.168.0.${i}1 ovs-vsctl set open . external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 + + for j in 1 2; do + ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ + external_ids:iface-id=lsp$i$j \ + external_ids:encap-ip=192.168.0.$i$j \ + options:tx_pcap=hv$i/vif$i$j-tx.pcap options:rxq_pcap=hv$i/vif$i$j-rx.pcap + ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j "f0:00:00:00:00:$i$j 10.0.0.$i$j" + + done done +wait_for_ports_up check ovn-nbctl --wait=hv sync check_tunnel_port() { @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.21 check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22 check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22 +vif_to_hv() { + case $1 in dnl ( + vif[[1]]?) echo hv1 ;; dnl ( + vif[[2]]?) 
echo hv2 ;; dnl ( + *) AT_FAIL_IF([:]) ;; + esac +} + +# check_packet_tunnel SRC_VIF DST_VIF +# Verify that a packet from SRC_VIF to DST_VIF goes through the corresponding +# tunnel that matches the VIFs' encap_ip configurations. +check_packet_tunnel() { + local src=$1 dst=$2 + local src_mac=f0:00:00:00:00:$src + local dst_mac=f0:00:00:00:00:$dst + local src_ip=10.0.0.$src + local dst_ip=10.0.0.$dst + local local_encap_ip=192.168.0.$src + local remote_encap_ip=192.168.0.$dst + local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ + IP(dst='${dst_ip}', src='${src_ip}')/ \ + ICMP(type=8)") + hv=`vif_to_hv vif$src` + as $hv + echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> $remote_encap_ip" + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) + AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) +} + +for i in 1 2; do + for j in 1 2; do + check_packet_tunnel 1$i 2$j + done +done + OVN_CLEANUP([hv1],[hv2]) AT_CLEANUP ])