diff mbox series

[ovs-dev,3/3] ovn-controller: Support VIF-based local encap IPs selection.

Message ID 20240117054745.4027120-4-hzhou@ovn.org
State Accepted
Headers show
Series Support VIF-based local encap IPs selection. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Han Zhou Jan. 17, 2024, 5:47 a.m. UTC
Commit dd527a283cd8 partially supported multiple encap IPs. It supported
remote encap IP selection based on the destination VIF's encap_ip
configuration. This patch adds the support for selecting local encap IP
based on the source VIF's encap_ip configuration.

Co-authored-by: Lei Huang <leih@nvidia.com>
Signed-off-by: Lei Huang <leih@nvidia.com>
Signed-off-by: Han Zhou <hzhou@ovn.org>
---
 NEWS                            |   3 +
 controller/chassis.c            |   2 +-
 controller/local_data.c         |   2 +-
 controller/local_data.h         |   2 +-
 controller/ovn-controller.8.xml |  30 ++++++-
 controller/ovn-controller.c     |  49 ++++++++++++
 controller/physical.c           | 134 ++++++++++++++++++++++----------
 controller/physical.h           |   2 +
 include/ovn/logical-fields.h    |   4 +-
 ovn-architecture.7.xml          |  18 ++++-
 tests/ovn.at                    |  51 +++++++++++-
 11 files changed, 243 insertions(+), 54 deletions(-)

Comments

Ales Musil Jan. 23, 2024, 1:28 p.m. UTC | #1
On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote:

> Commit dd527a283cd8 partially supported multiple encap IPs. It supported
> remote encap IP selection based on the destination VIF's encap_ip
> configuration. This patch adds the support for selecting local encap IP
> based on the source VIF's encap_ip configuration.
>
> Co-authored-by: Lei Huang <leih@nvidia.com>
> Signed-off-by: Lei Huang <leih@nvidia.com>
> Signed-off-by: Han Zhou <hzhou@ovn.org>
> ---
>

Hi Han and Lei,

thank you for the patch, I have a couple of comments/questions down below.

 NEWS                            |   3 +
>  controller/chassis.c            |   2 +-
>  controller/local_data.c         |   2 +-
>  controller/local_data.h         |   2 +-
>  controller/ovn-controller.8.xml |  30 ++++++-
>  controller/ovn-controller.c     |  49 ++++++++++++
>  controller/physical.c           | 134 ++++++++++++++++++++++----------
>  controller/physical.h           |   2 +
>  include/ovn/logical-fields.h    |   4 +-
>  ovn-architecture.7.xml          |  18 ++++-
>  tests/ovn.at                    |  51 +++++++++++-
>  11 files changed, 243 insertions(+), 54 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index 5f267b4c64cc..5a3eed608617 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -14,6 +14,9 @@ Post v23.09.0
>    - ovn-northd-ddlog has been removed.
>    - A new LSP option "enable_router_port_acl" has been added to enable
>      conntrack for the router port whose peer is l3dgw_port if set it true.
> +  - Support selecting encapsulation IP based on the source/destination
> VIF's
> +    settting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for more
> +    details.
>
>  OVN v23.09.0 - 15 Sep 2023
>  --------------------------
> diff --git a/controller/chassis.c b/controller/chassis.c
> index a6f13ccc42d5..55f2beb37674 100644
> --- a/controller/chassis.c
> +++ b/controller/chassis.c
> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg {
>
>      /* Set of encap types parsed from the 'ovn-encap-type' external-id. */
>      struct sset encap_type_set;
> -    /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. */
> +    /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */
>      struct sset encap_ip_set;
>      /* Interface type list formatted in the OVN-SB Chassis required
> format. */
>      struct ds iface_types;
> diff --git a/controller/local_data.c b/controller/local_data.c
> index a9092783958f..8606414f8728 100644
> --- a/controller/local_data.c
> +++ b/controller/local_data.c
> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap *chassis_tunnels)
>   */
>  struct chassis_tunnel *
>  chassis_tunnel_find(const struct hmap *chassis_tunnels, const char
> *chassis_id,
> -                    char *remote_encap_ip, char *local_encap_ip)
> +                    char *remote_encap_ip, const char *local_encap_ip)
>

nit: Unrelated change.

 {
>      /*
>       * If the specific encap_ip is given, look for the chassisid_ip entry,
> diff --git a/controller/local_data.h b/controller/local_data.h
> index bab95bcc3824..ca3905bd20e6 100644
> --- a/controller/local_data.h
> +++ b/controller/local_data.h
> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes(
>  struct chassis_tunnel *chassis_tunnel_find(const struct hmap
> *chassis_tunnels,
>                                             const char *chassis_id,
>                                             char *remote_encap_ip,
> -                                           char *local_encap_ip);
> +                                           const char *local_encap_ip);
>

Same as above.


>  bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels,
>                                 const char *chassis_name,
> diff --git a/controller/ovn-controller.8.xml
> b/controller/ovn-controller.8.xml
> index efa65e3fd927..5ebef048d721 100644
> --- a/controller/ovn-controller.8.xml
> +++ b/controller/ovn-controller.8.xml
> @@ -176,10 +176,32 @@
>
>        <dt><code>external_ids:ovn-encap-ip</code></dt>
>        <dd>
> -        The IP address that a chassis should use to connect to this node
> -        using encapsulation types specified by
> -        <code>external_ids:ovn-encap-type</code>. Multiple encapsulation
> IPs
> -        may be specified with a comma-separated list.
> +        <p>
> +          The IP address that a chassis should use to connect to this node
> +          using encapsulation types specified by
> +          <code>external_ids:ovn-encap-type</code>. Multiple
> encapsulation IPs
> +          may be specified with a comma-separated list.
> +        </p>
> +        <p>
> +          In scenarios where multiple encapsulation IPs are present,
> distinct
> +          tunnels are established for each remote chassis. These tunnels
> are
> +          differentiated by setting unique <code>options:local_ip</code>
> and
> +          <code>options:remote_ip</code> values in the tunnel interface.
> When
> +          transmitting a packet to a remote chassis, the selection of
> local_ip
> +          is guided by the <code>Interface:external_ids:encap-ip</code>
> from
> +          the local OVSDB, corresponding to the VIF originating the
> packet, if
> +          specified. The <code>Interface:external_ids:encap-ip</code>
> setting
> +          of the VIF is also populated to the <code>Port_Binding</code>
> +          table in the OVN SB database via the <code>encap</code> column.
> +          Consequently, when a remote chassis needs to send a packet to a
> +          port-binding associated with this VIF, it utilizes the tunnel
> with
> +          the appropriate <code>options:remote_ip</code> that matches the
> +          <code>ip</code> in <code>Port_Binding:encap</code>. This
> mechanism
> +          is particularly beneficial for chassis with multiple physical
> +          interfaces designated for tunneling, where each interface is
> +          optimized for handling specific traffic associated with
> particular
> +          VIFs.
> +        </p>
>        </dd>
>
>        <dt><code>external_ids:ovn-encap-df_default</code></dt>
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index 856e5e270822..4ab57fe970fe 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c
> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output {
>      struct physical_debug debug;
>  };
>
> +static void
> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table,
> +                size_t *n_encap_ips, const char * **encap_ips)
> +{
> +    const struct ovsrec_open_vswitch *cfg =
> +        ovsrec_open_vswitch_table_first(ovs_table);
> +    const char *chassis_id = get_ovs_chassis_id(ovs_table);
> +    const char *encap_ips_str =
> +        get_chassis_external_id_value(&cfg->external_ids, chassis_id,
> +                                      "ovn-encap-ip", NULL);
> +    struct sset encap_ip_set;
> +    sset_init(&encap_ip_set);
> +    sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");
> +
> +    /* Sort the ips so that their index is deterministic. */
> +    *encap_ips = sset_sort(&encap_ip_set);
> +
> +    /* Copy the strings so that we can destroy the sset. */
> +    for (size_t i = 0; (*encap_ips)[i]; i++) {
> +        (*encap_ips)[i] = xstrdup((*encap_ips)[i]);
> +    }
> +    *n_encap_ips = sset_count(&encap_ip_set);
> +    sset_destroy(&encap_ip_set);
> +}
> +
>

I wonder, could we store this array (or maybe preferably an svec) in the
"en_non_vif_data" I-P node? This way it would be handled in the node, and we
could avoid the destroy calls after any pflow processing. WDYT?


>  static void init_physical_ctx(struct engine_node *node,
>                                struct ed_type_runtime_data *rt_data,
>                                struct ed_type_non_vif_data *non_vif_data,
> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node
> *node,
>          engine_get_input_data("ct_zones", node);
>      struct simap *ct_zones = &ct_zones_data->current;
>
> +    parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips);
>      p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name;
>      p_ctx->sbrec_port_binding_by_datapath =
> sbrec_port_binding_by_datapath;
>      p_ctx->port_binding_table = port_binding_table;
> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct engine_node
> *node,
>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
>
> +
> +
>
>
nit: Unrelated change.


>      struct controller_engine_ctx *ctrl_ctx =
> engine_get_context()->client_ctx;
>      p_ctx->if_mgr = ctrl_ctx->if_mgr;
>
>      pflow_output_get_debug(node, &p_ctx->debug);
>  }
>
> +static void
> +destroy_physical_ctx(struct physical_ctx *p_ctx)
> +{
> +    for (size_t i = 0; i < p_ctx->n_encap_ips; i++) {
> +        free((char *)(p_ctx->encap_ips[i]));
> +    }
> +    free(p_ctx->encap_ips);
> +}
> +
>  static void *
>  en_pflow_output_init(struct engine_node *node OVS_UNUSED,
>                               struct engine_arg *arg OVS_UNUSED)
> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, void
> *data)
>      struct physical_ctx p_ctx;
>      init_physical_ctx(node, rt_data, non_vif_data, &p_ctx);
>      physical_run(&p_ctx, pflow_table);
> +    destroy_physical_ctx(&p_ctx);
>
>      engine_set_node_state(node, EN_UPDATED);
>  }
> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct
> engine_node *node,
>                  bool removed = sbrec_port_binding_is_deleted(binding);
>                  if (!physical_handle_flows_for_lport(binding, removed,
> &p_ctx,
>                                                       &pfo->flow_table)) {
> +                    destroy_physical_ctx(&p_ctx);
>                      return false;
>                  }
>              }
> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct
> engine_node *node,
>              bool removed = sbrec_port_binding_is_deleted(pb);
>              if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
>                                                   &pfo->flow_table)) {
> +                destroy_physical_ctx(&p_ctx);
>                  return false;
>              }
>          }
>          engine_set_node_state(node, EN_UPDATED);
>      }
> +    destroy_physical_ctx(&p_ctx);
>      return true;
>  }
>
> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct
> engine_node *node,
>          bool removed = sbrec_port_binding_is_deleted(pb);
>          if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
>                                               &pfo->flow_table)) {
> +            destroy_physical_ctx(&p_ctx);
>              return false;
>          }
>      }
>
>      engine_set_node_state(node, EN_UPDATED);
> +    destroy_physical_ctx(&p_ctx);
>      return true;
>  }
>
> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct
> engine_node *node, void *data)
>      physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table);
>
>      engine_set_node_state(node, EN_UPDATED);
> +    destroy_physical_ctx(&p_ctx);
>      return true;
>  }
>
> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct engine_node
> *node, void *data)
>          if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) {
>              /* Fall back to full recompute when a local datapath
>               * is added or deleted. */
> +            destroy_physical_ctx(&p_ctx);
>              return false;
>          }
>
> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct
> engine_node *node, void *data)
>                  lport->tracked_type == TRACKED_RESOURCE_REMOVED ? true:
> false;
>              if (!physical_handle_flows_for_lport(lport->pb, removed,
> &p_ctx,
>                                                   &pfo->flow_table)) {
> +                destroy_physical_ctx(&p_ctx);
>                  return false;
>              }
>          }
>      }
>
>      engine_set_node_state(node, EN_UPDATED);
> +    destroy_physical_ctx(&p_ctx);
>      return true;
>  }
>
> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct
> engine_node *node, void *data)
>          if (pb) {
>              if (!physical_handle_flows_for_lport(pb, false, &p_ctx,
>                                                   &pfo->flow_table)) {
> +                destroy_physical_ctx(&p_ctx);
>                  return false;
>              }
>              tag_port_as_activated_in_engine(pp);
>          }
>      }
>      engine_set_node_state(node, EN_UPDATED);
> +    destroy_physical_ctx(&p_ctx);
>      return true;
>  }
>
> diff --git a/controller/physical.c b/controller/physical.c
> index e93bfbd2cffb..c192aed751d5 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -71,6 +71,8 @@ struct tunnel {
>  static void
>  load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
>                                const struct zone_ids *zone_ids,
> +                              size_t n_encap_ips,
> +                              const char **encap_ips,
>                                struct ofpbuf *ofpacts_p);
>  static int64_t get_vxlan_port_key(int64_t port_key);
>
> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf
> *ofpacts)
>   */
>  static struct chassis_tunnel *
>  get_port_binding_tun(const struct sbrec_encap *remote_encap,
> -                     const struct sbrec_encap *local_encap,
>                       const struct sbrec_chassis *chassis,
> -                     const struct hmap *chassis_tunnels)
> +                     const struct hmap *chassis_tunnels,
> +                     const char *local_encap_ip)
>  {
>      struct chassis_tunnel *tun = NULL;
>      tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
>                                remote_encap ? remote_encap->ip : NULL,
> -                              local_encap ? local_encap->ip : NULL);
> +                              local_encap_ip);
>
>      if (!tun) {
>          tun = chassis_tunnel_find(chassis_tunnels, chassis->name, NULL,
> NULL);
> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct
> sbrec_port_binding *pb,
>  static struct ovs_list *
>  get_remote_tunnels(const struct sbrec_port_binding *binding,
>                     const struct sbrec_chassis *chassis,
> -                   const struct hmap *chassis_tunnels)
> +                   const struct hmap *chassis_tunnels,
> +                   const char *local_encap_ip)
>  {
>      const struct chassis_tunnel *tun;
>
> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding
> *binding,
>      ovs_list_init(tunnels);
>
>      if (binding->chassis && binding->chassis != chassis) {
> -        tun = get_port_binding_tun(binding->encap, NULL, binding->chassis,
> -                                   chassis_tunnels);
> +        tun = get_port_binding_tun(binding->encap, binding->chassis,
> +                chassis_tunnels, local_encap_ip);
>          if (!tun) {
>              static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
>              VLOG_WARN_RL(
> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding
> *binding,
>          }
>          const struct sbrec_encap *additional_encap;
>          additional_encap = find_additional_encap_for_chassis(binding,
> chassis);
> -        tun = get_port_binding_tun(additional_encap, NULL,
> +        tun = get_port_binding_tun(additional_encap,
>                                     binding->additional_chassis[i],
> -                                   chassis_tunnels);
> +                                   chassis_tunnels, local_encap_ip);
>          if (!tun) {
>              static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
>              VLOG_WARN_RL(
> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct
> sbrec_port_binding *binding,
>                                   struct ofpbuf *ofpacts_p,
>                                   const struct sbrec_chassis *chassis,
>                                   const struct hmap *chassis_tunnels,
> +                                 size_t n_encap_ips,
> +                                 const char **encap_ips,
>                                   struct ovn_desired_flow_table
> *flow_table)
>  {
>      /* Setup encapsulation */
> -    struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> -                                               chassis_tunnels);
> -    if (!ovs_list_is_empty(tuns)) {
> -        bool is_vtep_port = !strcmp(binding->type, "vtep");
> -        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for
> ARP/ND
> -         * responder in L3 networks. */
> -        if (is_vtep_port) {
> -            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
> ofpacts_p);
> -        }
> +    for (size_t i = 0; i < n_encap_ips; i++) {
> +        struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p);
>
+
> +        match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16,
> +                             (uint32_t) 0xFFFF << 16);
> +        struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> +                                                   chassis_tunnels,
> +                                                   encap_ips[i]);
> +        if (!ovs_list_is_empty(tuns)) {
> +            bool is_vtep_port = !strcmp(binding->type, "vtep");
> +            /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for
> ARP/ND
> +             * responder in L3 networks. */
> +            if (is_vtep_port) {
> +                put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
> +                         ofpacts_clone);
> +            }
>
> -        struct tunnel *tun;
> -        LIST_FOR_EACH (tun, list_node, tuns) {
> -            put_encapsulation(mff_ovn_geneve, tun->tun,
> -                              binding->datapath, port_key, is_vtep_port,
> -                              ofpacts_p);
> -            ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport;
> +            struct tunnel *tun;
> +            LIST_FOR_EACH (tun, list_node, tuns) {
> +                put_encapsulation(mff_ovn_geneve, tun->tun,
> +                                  binding->datapath, port_key,
> is_vtep_port,
> +                                  ofpacts_clone);
> +                ofpact_put_OUTPUT(ofpacts_clone)->port = tun->tun->ofport;
> +            }
> +            put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone);
> +            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
> +                            binding->header_.uuid.parts[0], match,
> +                            ofpacts_clone, &binding->header_.uuid);
>          }
> -        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
> -        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
> -                        binding->header_.uuid.parts[0], match, ofpacts_p,
> -                        &binding->header_.uuid);
> -    }
> -    struct tunnel *tun_elem;
> -    LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
> -        free(tun_elem);
> +        struct tunnel *tun_elem;
> +        LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
> +            free(tun_elem);
> +        }
> +        free(tuns);
> +
> +        ofpbuf_delete(ofpacts_clone);
>      }
> -    free(tuns);
>  }
>
>  static void
> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap
> *ct_zones,
>          if (tag) {
>              ofpact_put_STRIP_VLAN(ofpacts_p);
>          }
> -        load_logical_ingress_metadata(localnet_port, &zone_ids,
> ofpacts_p);
> +        load_logical_ingress_metadata(localnet_port, &zone_ids, 0, NULL,
> +                                      ofpacts_p);
>          replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p);
>          replace_mac->mac = router_port_mac;
>
> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids,
> struct ofpbuf *ofpacts_p)
>  {
>      if (zone_ids) {
>          if (zone_ids->ct) {
> -            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
> +            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
>          }
>          if (zone_ids->dnat) {
>              put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key,
>      }
>  }
>
> +static size_t
> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips,
> +               const char *ip)
> +{
> +    for (size_t i = 0; i < n_encap_ips; i++) {
> +        if (!strcmp(ip, encap_ips[i])) {
> +            return i;
> +        }
> +    }
> +    return 0;
> +}
> +
>  static void
>  load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
>                                const struct zone_ids *zone_ids,
> +                              size_t n_encap_ips,
> +                              const char **encap_ips,
>                                struct ofpbuf *ofpacts_p)
>  {
>      put_zones_ofpacts(zone_ids, ofpacts_p);
> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct
> sbrec_port_binding *binding,
>      uint32_t port_key = binding->tunnel_key;
>      put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p);
>      put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p);
> +
> +    /* Default encap_id 0. */
> +    size_t encap_id = 0;
> +    if (encap_ips && binding->encap) {
> +        encap_id = encap_ip_to_id(n_encap_ips, encap_ips,
> +                                  binding->encap->ip);
> +    }
> +    put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p);
>  }
>
>  static const struct sbrec_port_binding *
> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct
> sbrec_port_binding *binding,
>      match_set_dl_type(&match, htons(ETH_TYPE_RARP));
>      match_set_in_port(&match, ofport);
>
> -    load_logical_ingress_metadata(binding, zone_ids, &ofpacts);
> +    load_logical_ingress_metadata(binding, zone_ids, 0, NULL, &ofpacts);
>
>      encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP,
>                           NX_CTLR_NO_METER, &ofpacts);
> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports(
>      }
>
>      struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> -                                               chassis_tunnels);
> +                                               chassis_tunnels, NULL);
>      if (ovs_list_is_empty(tuns)) {
>          free(tuns);
>          return;
> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
>                        const struct sbrec_chassis *chassis,
>                        const struct physical_debug *debug,
>                        const struct if_status_mgr *if_mgr,
> +                      size_t n_encap_ips,
> +                      const char **encap_ips,
>                        struct ovn_desired_flow_table *flow_table,
>                        struct ofpbuf *ofpacts_p)
>  {
> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
>          ofpact_put_CT_CLEAR(ofpacts_p);
>          put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
>          put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
> -        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
> +        put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
>          struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
> -        load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
> +        load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips,
> +                                      encap_ips, ofpacts_p);
>          put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
>          put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
>          for (int i = 0; i < MFF_N_LOG_REGS; i++) {
> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
>           * as we're going to remove this with ofpbuf_pull() later. */
>          uint32_t ofpacts_orig_size = ofpacts_p->size;
>
> -        load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p);
> +        load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips,
> +                                      encap_ips, ofpacts_p);
>
>          if (!strcmp(binding->type, "localport")) {
>              /* mark the packet as incoming from a localport */
> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
>      } else {
>          put_remote_port_redirect_overlay(
>              binding, mff_ovn_geneve, port_key, &match, ofpacts_p,
> -            chassis, chassis_tunnels, flow_table);
> +            chassis, chassis_tunnels, n_encap_ips, encap_ips, flow_table);
>      }
>  out:
>      if (ha_ch_ordered) {
> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
>
>          int zone_id = simap_get(ct_zones, port->logical_port);
>          if (zone_id) {
> -            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts);
> +            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts);
>          }
>
>          const char *lport_name = (port->parent_port &&
> *port->parent_port) ?
> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx
> *p_ctx,
>                            p_ctx->patch_ofports,
>                            p_ctx->chassis_tunnels,
>                            pb, p_ctx->chassis, &p_ctx->debug,
> -                          p_ctx->if_mgr, flow_table, &ofpacts);
> +                          p_ctx->if_mgr,
> +                          p_ctx->n_encap_ips,
> +                          p_ctx->encap_ips,
> +                          flow_table, &ofpacts);
>      ofpbuf_uninit(&ofpacts);
>  }
>
> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx,
>                                p_ctx->patch_ofports,
>                                p_ctx->chassis_tunnels, binding,
>                                p_ctx->chassis, &p_ctx->debug,
> -                              p_ctx->if_mgr, flow_table, &ofpacts);
> +                              p_ctx->if_mgr,
> +                              p_ctx->n_encap_ips,
> +                              p_ctx->encap_ips,
> +                              flow_table, &ofpacts);
>      }
>
>      /* Handle output to multicast groups, in tables 40 and 41. */
> diff --git a/controller/physical.h b/controller/physical.h
> index 1f1ed55efa16..7fe8ee3c18ed 100644
> --- a/controller/physical.h
> +++ b/controller/physical.h
> @@ -66,6 +66,8 @@ struct physical_ctx {
>      struct shash *local_bindings;
>      struct simap *patch_ofports;
>      struct hmap *chassis_tunnels;
> +    size_t n_encap_ips;
> +    const char **encap_ips;
>      struct physical_debug debug;
>  };
>
> diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h
> index 272277ec4fa0..d3455a462a0d 100644
> --- a/include/ovn/logical-fields.h
> +++ b/include/ovn/logical-fields.h
> @@ -37,7 +37,9 @@ enum ovn_controller_event {
>  #define MFF_LOG_SNAT_ZONE  MFF_REG12  /* conntrack snat zone for gateway
> router
>                                         * (32 bits). */
>  #define MFF_LOG_CT_ZONE    MFF_REG13  /* Logical conntrack zone for lports
> -                                       * (32 bits). */
> +                                       * (0..15 of the 32 bits). */
> +#define MFF_LOG_ENCAP_ID   MFF_REG13  /* Encap ID for lports
> +                                       * (16..31 of the 32 bits). */
>  #define MFF_LOG_INPORT     MFF_REG14  /* Logical input port (32 bits). */
>  #define MFF_LOG_OUTPORT    MFF_REG15  /* Logical output port (32 bits). */
>
> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
> index 96294fe2b795..bfd8680cedfc 100644
> --- a/ovn-architecture.7.xml
> +++ b/ovn-architecture.7.xml
> @@ -1247,8 +1247,8 @@
>        chassis.  This is initialized to 0 at the beginning of the logical
>          <!-- Keep the following in sync with MFF_LOG_CT_ZONE in
>               ovn/lib/logical-fields.h. -->
> -      ingress pipeline.  OVN stores this in Open vSwitch extension
> register
> -      number 13.
> +      ingress pipeline.  OVN stores this in the lower 16 bits of the Open
> +      vSwitch extension register number 13.
>      </dd>
>
>      <dt>conntrack zone fields for routers</dt>
> @@ -1263,6 +1263,20 @@
>        traffic (for SNATing) in Open vSwitch extension register number 12.
>      </dd>
>
> +    <dt>Encap ID for logical ports</dt>
> +    <dd>
> +      A field that records an ID that indicates which encapsulation IP
> should
> +      be used when sending packets to a remote chassis, according to the
> +      original input logical port. This is useful when there are multiple
> IPs
> +      available for encapsulation. The value only has local significance
> and is
> +      not meaningful between chassis. This is initialized to 0 at the
> beginning
> +      of the logical
> +        <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in
> +             ovn/lib/logical-fields.h. -->
> +      ingress pipeline.  OVN stores this in the higher 16 bits of the Open
> +      vSwitch extension register number 13.
> +    </dd>
> +
>      <dt>logical flow flags</dt>
>      <dd>
>        The logical flags are intended to handle keeping context between
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 243fe0b8246c..e7f0c9681f60 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -30335,19 +30335,33 @@ AT_CLEANUP
>
>
>  OVN_FOR_EACH_NORTHD([
> -AT_SETUP([multiple encap ips tunnel creation])
> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip])
> +AT_SKIP_IF([test $HAVE_SCAPY = no])
>  ovn_start
>  net_add n1
>
> +ovn-nbctl ls-add ls1
> +
>  # 2 HVs, each with 2 encap-ips.
> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY.
>  for i in 1 2; do
>      sim_add hv$i
>      as hv$i
>      ovs-vsctl add-br br-phys-$j
>      ovn_attach n1 br-phys-$j 192.168.0.${i}1
>      ovs-vsctl set open .
> external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2
> +
> +    for j in 1 2; do
> +        ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \
> +            external_ids:iface-id=lsp$i$j \
> +            external_ids:encap-ip=192.168.0.$i$j \
> +            options:tx_pcap=hv$i/vif$i$j-tx.pcap
> options:rxq_pcap=hv$i/vif$i$j-rx.pcap
> +        ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j
> "f0:00:00:00:00:$i$j 10.0.0.$i$j"
> +
> +    done
>  done
>
> +wait_for_ports_up
>  check ovn-nbctl --wait=hv sync
>
>  check_tunnel_port() {
> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12
> %192.168.0.21
>  check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22
>  check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22
>
> +vif_to_hv() {
> +    case $1 in dnl (
> +        vif[[1]]?) echo hv1 ;; dnl (
> +        vif[[2]]?) echo hv2 ;; dnl (
> +        *) AT_FAIL_IF([:]) ;;
> +    esac
> +}
> +
> +# check_packet_tunnel SRC_VIF DST_VIF
> +# Verify that a packet from SRC_VIF to DST_VIF goes through the
> corresponding
> +# tunnel that matches the VIFs' encap_ip configurations.
> +check_packet_tunnel() {
> +    local src=$1 dst=$2
> +    local src_mac=f0:00:00:00:00:$src
> +    local dst_mac=f0:00:00:00:00:$dst
> +    local src_ip=10.0.0.$src
> +    local dst_ip=10.0.0.$dst
> +    local local_encap_ip=192.168.0.$src
> +    local remote_encap_ip=192.168.0.$dst
> +    local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \
> +                            IP(dst='${dst_ip}', src='${src_ip}')/ \
> +                            ICMP(type=8)")
> +    hv=`vif_to_hv vif$src`
> +    as $hv
> +    echo "vif$src -> vif$dst should go through tunnel $local_encap_ip ->
> $remote_encap_ip"
> +    tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface
> options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip)
> +    AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src
> $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport])
> +}
> +
> +for i in 1 2; do
> +    for j in 1 2; do
> +        check_packet_tunnel 1$i 2$j
> +    done
> +done
> +
>  OVN_CLEANUP([hv1],[hv2])
>  AT_CLEANUP
>  ])
> --
> 2.38.1
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Thanks,
Ales
Han Zhou Jan. 26, 2024, 3:07 a.m. UTC | #2
On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote:
>
>
>
> On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote:
>>
>> Commit dd527a283cd8 partially supported multiple encap IPs. It supported
>> remote encap IP selection based on the destination VIF's encap_ip
>> configuration. This patch adds the support for selecting local encap IP
>> based on the source VIF's encap_ip configuration.
>>
>> Co-authored-by: Lei Huang <leih@nvidia.com>
>> Signed-off-by: Lei Huang <leih@nvidia.com>
>> Signed-off-by: Han Zhou <hzhou@ovn.org>
>> ---
>
>
> Hi Han and Lei,
>
> thank you for the patch, I have a couple of comments/questions down below.


Thanks Ales.

>
>
>>  NEWS                            |   3 +
>>  controller/chassis.c            |   2 +-
>>  controller/local_data.c         |   2 +-
>>  controller/local_data.h         |   2 +-
>>  controller/ovn-controller.8.xml |  30 ++++++-
>>  controller/ovn-controller.c     |  49 ++++++++++++
>>  controller/physical.c           | 134 ++++++++++++++++++++++----------
>>  controller/physical.h           |   2 +
>>  include/ovn/logical-fields.h    |   4 +-
>>  ovn-architecture.7.xml          |  18 ++++-
>>  tests/ovn.at                    |  51 +++++++++++-
>>  11 files changed, 243 insertions(+), 54 deletions(-)
>>
>> diff --git a/NEWS b/NEWS
>> index 5f267b4c64cc..5a3eed608617 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -14,6 +14,9 @@ Post v23.09.0
>>    - ovn-northd-ddlog has been removed.
>>    - A new LSP option "enable_router_port_acl" has been added to enable
>>      conntrack for the router port whose peer is l3dgw_port if set it
true.
>> +  - Support selecting encapsulation IP based on the source/destination
VIF's
>> +    setting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for more
>> +    details.
>>
>>  OVN v23.09.0 - 15 Sep 2023
>>  --------------------------
>> diff --git a/controller/chassis.c b/controller/chassis.c
>> index a6f13ccc42d5..55f2beb37674 100644
>> --- a/controller/chassis.c
>> +++ b/controller/chassis.c
>> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg {
>>
>>      /* Set of encap types parsed from the 'ovn-encap-type' external-id.
*/
>>      struct sset encap_type_set;
>> -    /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. */
>> +    /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */
>>      struct sset encap_ip_set;
>>      /* Interface type list formatted in the OVN-SB Chassis required
format. */
>>      struct ds iface_types;
>> diff --git a/controller/local_data.c b/controller/local_data.c
>> index a9092783958f..8606414f8728 100644
>> --- a/controller/local_data.c
>> +++ b/controller/local_data.c
>> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap *chassis_tunnels)
>>   */
>>  struct chassis_tunnel *
>>  chassis_tunnel_find(const struct hmap *chassis_tunnels, const char
*chassis_id,
>> -                    char *remote_encap_ip, char *local_encap_ip)
>> +                    char *remote_encap_ip, const char *local_encap_ip)
>
>
> nit: Unrelated change.


Ack

>
>
>>  {
>>      /*
>>       * If the specific encap_ip is given, look for the chassisid_ip
entry,
>> diff --git a/controller/local_data.h b/controller/local_data.h
>> index bab95bcc3824..ca3905bd20e6 100644
>> --- a/controller/local_data.h
>> +++ b/controller/local_data.h
>> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes(
>>  struct chassis_tunnel *chassis_tunnel_find(const struct hmap
*chassis_tunnels,
>>                                             const char *chassis_id,
>>                                             char *remote_encap_ip,
>> -                                           char *local_encap_ip);
>> +                                           const char *local_encap_ip);
>
>
> Same as above.


Ack

>
>
>>
>>  bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels,
>>                                 const char *chassis_name,
>> diff --git a/controller/ovn-controller.8.xml
b/controller/ovn-controller.8.xml
>> index efa65e3fd927..5ebef048d721 100644
>> --- a/controller/ovn-controller.8.xml
>> +++ b/controller/ovn-controller.8.xml
>> @@ -176,10 +176,32 @@
>>
>>        <dt><code>external_ids:ovn-encap-ip</code></dt>
>>        <dd>
>> -        The IP address that a chassis should use to connect to this node
>> -        using encapsulation types specified by
>> -        <code>external_ids:ovn-encap-type</code>. Multiple
encapsulation IPs
>> -        may be specified with a comma-separated list.
>> +        <p>
>> +          The IP address that a chassis should use to connect to this
node
>> +          using encapsulation types specified by
>> +          <code>external_ids:ovn-encap-type</code>. Multiple
encapsulation IPs
>> +          may be specified with a comma-separated list.
>> +        </p>
>> +        <p>
>> +          In scenarios where multiple encapsulation IPs are present,
distinct
>> +          tunnels are established for each remote chassis. These
tunnels are
>> +          differentiated by setting unique
<code>options:local_ip</code> and
>> +          <code>options:remote_ip</code> values in the tunnel
interface. When
>> +          transmitting a packet to a remote chassis, the selection of
local_ip
>> +          is guided by the <code>Interface:external_ids:encap-ip</code>
from
>> +          the local OVSDB, corresponding to the VIF originating the
packet, if
>> +          specified. The <code>Interface:external_ids:encap-ip</code>
setting
>> +          of the VIF is also populated to the <code>Port_Binding</code>
>> +          table in the OVN SB database via the <code>encap</code>
column.
>> +          Consequently, when a remote chassis needs to send a packet to
a
>> +          port-binding associated with this VIF, it utilizes the tunnel
with
>> +          the appropriate <code>options:remote_ip</code> that matches
the
>> +          <code>ip</code> in <code>Port_Binding:encap</code>. This
mechanism
>> +          is particularly beneficial for chassis with multiple physical
>> +          interfaces designated for tunneling, where each interface is
>> +          optimized for handling specific traffic associated with
particular
>> +          VIFs.
>> +        </p>
>>        </dd>
>>
>>        <dt><code>external_ids:ovn-encap-df_default</code></dt>
>> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
>> index 856e5e270822..4ab57fe970fe 100644
>> --- a/controller/ovn-controller.c
>> +++ b/controller/ovn-controller.c
>> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output {
>>      struct physical_debug debug;
>>  };
>>
>> +static void
>> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table,
>> +                size_t *n_encap_ips, const char * **encap_ips)
>> +{
>> +    const struct ovsrec_open_vswitch *cfg =
>> +        ovsrec_open_vswitch_table_first(ovs_table);
>> +    const char *chassis_id = get_ovs_chassis_id(ovs_table);
>> +    const char *encap_ips_str =
>> +        get_chassis_external_id_value(&cfg->external_ids, chassis_id,
>> +                                      "ovn-encap-ip", NULL);
>> +    struct sset encap_ip_set;
>> +    sset_init(&encap_ip_set);
>> +    sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");
>> +
>> +    /* Sort the ips so that their index is deterministic. */
>> +    *encap_ips = sset_sort(&encap_ip_set);
>> +
>> +    /* Copy the strings so that we can destroy the sset. */
>> +    for (size_t i = 0; (*encap_ips)[i]; i++) {
>> +        (*encap_ips)[i] = xstrdup((*encap_ips)[i]);
>> +    }
>> +    *n_encap_ips = sset_count(&encap_ip_set);
>> +    sset_destroy(&encap_ip_set);
>> +}
>> +
>
>
> I wonder, could we store this array (or maybe, preferably, an svec) in the
"en_non_vif_data" I-P node? This way it would be handled in the node, and we
could avoid the destroy calls after any pflow processing. WDYT?


Yes, we could do the same in en_non_vif_data, but I think it is not really
necessary, and it seems more straightforward to just parse it here, because:
1. We don't need I-P for this ovn-encap-ip configuration change, so we
don't have to persist this data.
2. It should be very rare to have more than 5 (or even 3) encap IPs per
node, so the list should be very small and the cost of this parsing (and
sset construct/destroy) is negligible.
Using svec is also a valid option, but the sset (and array) is used here
just to reuse the sset_from_delimited_string and sset_sort for convenience.
I noticed that the sset_init() call can in fact be removed because
sset_from_delimited_string already does that. I can remove it.
Does this sound reasonable to you?

Thanks,
Han

>
>>
>>  static void init_physical_ctx(struct engine_node *node,
>>                                struct ed_type_runtime_data *rt_data,
>>                                struct ed_type_non_vif_data *non_vif_data,
>> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node
*node,
>>          engine_get_input_data("ct_zones", node);
>>      struct simap *ct_zones = &ct_zones_data->current;
>>
>> +    parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips);
>>      p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name;
>>      p_ctx->sbrec_port_binding_by_datapath =
sbrec_port_binding_by_datapath;
>>      p_ctx->port_binding_table = port_binding_table;
>> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct engine_node
*node,
>>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
>>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
>>
>> +
>> +
>>
>
> nit: Unrelated change.


Ack
>
>
>>
>>      struct controller_engine_ctx *ctrl_ctx =
engine_get_context()->client_ctx;
>>      p_ctx->if_mgr = ctrl_ctx->if_mgr;
>>
>>      pflow_output_get_debug(node, &p_ctx->debug);
>>  }
>>
>> +static void
>> +destroy_physical_ctx(struct physical_ctx *p_ctx)
>> +{
>> +    for (size_t i = 0; i < p_ctx->n_encap_ips; i++) {
>> +        free((char *)(p_ctx->encap_ips[i]));
>> +    }
>> +    free(p_ctx->encap_ips);
>> +}
>> +
>>  static void *
>>  en_pflow_output_init(struct engine_node *node OVS_UNUSED,
>>                               struct engine_arg *arg OVS_UNUSED)
>> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node, void
*data)
>>      struct physical_ctx p_ctx;
>>      init_physical_ctx(node, rt_data, non_vif_data, &p_ctx);
>>      physical_run(&p_ctx, pflow_table);
>> +    destroy_physical_ctx(&p_ctx);
>>
>>      engine_set_node_state(node, EN_UPDATED);
>>  }
>> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct
engine_node *node,
>>                  bool removed = sbrec_port_binding_is_deleted(binding);
>>                  if (!physical_handle_flows_for_lport(binding, removed,
&p_ctx,
>>                                                       &pfo->flow_table))
{
>> +                    destroy_physical_ctx(&p_ctx);
>>                      return false;
>>                  }
>>              }
>> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct
engine_node *node,
>>              bool removed = sbrec_port_binding_is_deleted(pb);
>>              if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
>>                                                   &pfo->flow_table)) {
>> +                destroy_physical_ctx(&p_ctx);
>>                  return false;
>>              }
>>          }
>>          engine_set_node_state(node, EN_UPDATED);
>>      }
>> +    destroy_physical_ctx(&p_ctx);
>>      return true;
>>  }
>>
>> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct
engine_node *node,
>>          bool removed = sbrec_port_binding_is_deleted(pb);
>>          if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
>>                                               &pfo->flow_table)) {
>> +            destroy_physical_ctx(&p_ctx);
>>              return false;
>>          }
>>      }
>>
>>      engine_set_node_state(node, EN_UPDATED);
>> +    destroy_physical_ctx(&p_ctx);
>>      return true;
>>  }
>>
>> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct
engine_node *node, void *data)
>>      physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table);
>>
>>      engine_set_node_state(node, EN_UPDATED);
>> +    destroy_physical_ctx(&p_ctx);
>>      return true;
>>  }
>>
>> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct
engine_node *node, void *data)
>>          if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) {
>>              /* Fall back to full recompute when a local datapath
>>               * is added or deleted. */
>> +            destroy_physical_ctx(&p_ctx);
>>              return false;
>>          }
>>
>> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct
engine_node *node, void *data)
>>                  lport->tracked_type == TRACKED_RESOURCE_REMOVED ? true:
false;
>>              if (!physical_handle_flows_for_lport(lport->pb, removed,
&p_ctx,
>>                                                   &pfo->flow_table)) {
>> +                destroy_physical_ctx(&p_ctx);
>>                  return false;
>>              }
>>          }
>>      }
>>
>>      engine_set_node_state(node, EN_UPDATED);
>> +    destroy_physical_ctx(&p_ctx);
>>      return true;
>>  }
>>
>> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct
engine_node *node, void *data)
>>          if (pb) {
>>              if (!physical_handle_flows_for_lport(pb, false, &p_ctx,
>>                                                   &pfo->flow_table)) {
>> +                destroy_physical_ctx(&p_ctx);
>>                  return false;
>>              }
>>              tag_port_as_activated_in_engine(pp);
>>          }
>>      }
>>      engine_set_node_state(node, EN_UPDATED);
>> +    destroy_physical_ctx(&p_ctx);
>>      return true;
>>  }
>>
>> diff --git a/controller/physical.c b/controller/physical.c
>> index e93bfbd2cffb..c192aed751d5 100644
>> --- a/controller/physical.c
>> +++ b/controller/physical.c
>> @@ -71,6 +71,8 @@ struct tunnel {
>>  static void
>>  load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
>>                                const struct zone_ids *zone_ids,
>> +                              size_t n_encap_ips,
>> +                              const char **encap_ips,
>>                                struct ofpbuf *ofpacts_p);
>>  static int64_t get_vxlan_port_key(int64_t port_key);
>>
>> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf
*ofpacts)
>>   */
>>  static struct chassis_tunnel *
>>  get_port_binding_tun(const struct sbrec_encap *remote_encap,
>> -                     const struct sbrec_encap *local_encap,
>>                       const struct sbrec_chassis *chassis,
>> -                     const struct hmap *chassis_tunnels)
>> +                     const struct hmap *chassis_tunnels,
>> +                     const char *local_encap_ip)
>>  {
>>      struct chassis_tunnel *tun = NULL;
>>      tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
>>                                remote_encap ? remote_encap->ip : NULL,
>> -                              local_encap ? local_encap->ip : NULL);
>> +                              local_encap_ip);
>>
>>      if (!tun) {
>>          tun = chassis_tunnel_find(chassis_tunnels, chassis->name, NULL,
NULL);
>> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct
sbrec_port_binding *pb,
>>  static struct ovs_list *
>>  get_remote_tunnels(const struct sbrec_port_binding *binding,
>>                     const struct sbrec_chassis *chassis,
>> -                   const struct hmap *chassis_tunnels)
>> +                   const struct hmap *chassis_tunnels,
>> +                   const char *local_encap_ip)
>>  {
>>      const struct chassis_tunnel *tun;
>>
>> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding
*binding,
>>      ovs_list_init(tunnels);
>>
>>      if (binding->chassis && binding->chassis != chassis) {
>> -        tun = get_port_binding_tun(binding->encap, NULL,
binding->chassis,
>> -                                   chassis_tunnels);
>> +        tun = get_port_binding_tun(binding->encap, binding->chassis,
>> +                chassis_tunnels, local_encap_ip);
>>          if (!tun) {
>>              static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5,
1);
>>              VLOG_WARN_RL(
>> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding
*binding,
>>          }
>>          const struct sbrec_encap *additional_encap;
>>          additional_encap = find_additional_encap_for_chassis(binding,
chassis);
>> -        tun = get_port_binding_tun(additional_encap, NULL,
>> +        tun = get_port_binding_tun(additional_encap,
>>                                     binding->additional_chassis[i],
>> -                                   chassis_tunnels);
>> +                                   chassis_tunnels, local_encap_ip);
>>          if (!tun) {
>>              static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5,
1);
>>              VLOG_WARN_RL(
>> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct
sbrec_port_binding *binding,
>>                                   struct ofpbuf *ofpacts_p,
>>                                   const struct sbrec_chassis *chassis,
>>                                   const struct hmap *chassis_tunnels,
>> +                                 size_t n_encap_ips,
>> +                                 const char **encap_ips,
>>                                   struct ovn_desired_flow_table
*flow_table)
>>  {
>>      /* Setup encapsulation */
>> -    struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
>> -                                               chassis_tunnels);
>> -    if (!ovs_list_is_empty(tuns)) {
>> -        bool is_vtep_port = !strcmp(binding->type, "vtep");
>> -        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for
ARP/ND
>> -         * responder in L3 networks. */
>> -        if (is_vtep_port) {
>> -            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
ofpacts_p);
>> -        }
>> +    for (size_t i = 0; i < n_encap_ips; i++) {
>> +        struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p);
>>
>> +
>> +        match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i <<
16,
>> +                             (uint32_t) 0xFFFF << 16);
>> +        struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
>> +                                                   chassis_tunnels,
>> +                                                   encap_ips[i]);
>> +        if (!ovs_list_is_empty(tuns)) {
>> +            bool is_vtep_port = !strcmp(binding->type, "vtep");
>> +            /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check
for ARP/ND
>> +             * responder in L3 networks. */
>> +            if (is_vtep_port) {
>> +                put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
>> +                         ofpacts_clone);
>> +            }
>>
>> -        struct tunnel *tun;
>> -        LIST_FOR_EACH (tun, list_node, tuns) {
>> -            put_encapsulation(mff_ovn_geneve, tun->tun,
>> -                              binding->datapath, port_key, is_vtep_port,
>> -                              ofpacts_p);
>> -            ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport;
>> +            struct tunnel *tun;
>> +            LIST_FOR_EACH (tun, list_node, tuns) {
>> +                put_encapsulation(mff_ovn_geneve, tun->tun,
>> +                                  binding->datapath, port_key,
is_vtep_port,
>> +                                  ofpacts_clone);
>> +                ofpact_put_OUTPUT(ofpacts_clone)->port =
tun->tun->ofport;
>> +            }
>> +            put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone);
>> +            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
>> +                            binding->header_.uuid.parts[0], match,
>> +                            ofpacts_clone, &binding->header_.uuid);
>>          }
>> -        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
>> -        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
>> -                        binding->header_.uuid.parts[0], match,
ofpacts_p,
>> -                        &binding->header_.uuid);
>> -    }
>> -    struct tunnel *tun_elem;
>> -    LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
>> -        free(tun_elem);
>> +        struct tunnel *tun_elem;
>> +        LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
>> +            free(tun_elem);
>> +        }
>> +        free(tuns);
>> +
>> +        ofpbuf_delete(ofpacts_clone);
>>      }
>> -    free(tuns);
>>  }
>>
>>  static void
>> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap
*ct_zones,
>>          if (tag) {
>>              ofpact_put_STRIP_VLAN(ofpacts_p);
>>          }
>> -        load_logical_ingress_metadata(localnet_port, &zone_ids,
ofpacts_p);
>> +        load_logical_ingress_metadata(localnet_port, &zone_ids, 0, NULL,
>> +                                      ofpacts_p);
>>          replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p);
>>          replace_mac->mac = router_port_mac;
>>
>> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids,
struct ofpbuf *ofpacts_p)
>>  {
>>      if (zone_ids) {
>>          if (zone_ids->ct) {
>> -            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
>> +            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
>>          }
>>          if (zone_ids->dnat) {
>>              put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32,
ofpacts_p);
>> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key,
>>      }
>>  }
>>
>> +static size_t
>> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips,
>> +               const char *ip)
>> +{
>> +    for (size_t i = 0; i < n_encap_ips; i++) {
>> +        if (!strcmp(ip, encap_ips[i])) {
>> +            return i;
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>>  static void
>>  load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
>>                                const struct zone_ids *zone_ids,
>> +                              size_t n_encap_ips,
>> +                              const char **encap_ips,
>>                                struct ofpbuf *ofpacts_p)
>>  {
>>      put_zones_ofpacts(zone_ids, ofpacts_p);
>> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct
sbrec_port_binding *binding,
>>      uint32_t port_key = binding->tunnel_key;
>>      put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p);
>>      put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p);
>> +
>> +    /* Default encap_id 0. */
>> +    size_t encap_id = 0;
>> +    if (encap_ips && binding->encap) {
>> +        encap_id = encap_ip_to_id(n_encap_ips, encap_ips,
>> +                                  binding->encap->ip);
>> +    }
>> +    put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p);
>>  }
>>
>>  static const struct sbrec_port_binding *
>> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct
sbrec_port_binding *binding,
>>      match_set_dl_type(&match, htons(ETH_TYPE_RARP));
>>      match_set_in_port(&match, ofport);
>>
>> -    load_logical_ingress_metadata(binding, zone_ids, &ofpacts);
>> +    load_logical_ingress_metadata(binding, zone_ids, 0, NULL, &ofpacts);
>>
>>      encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP,
>>                           NX_CTLR_NO_METER, &ofpacts);
>> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports(
>>      }
>>
>>      struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
>> -                                               chassis_tunnels);
>> +                                               chassis_tunnels, NULL);
>>      if (ovs_list_is_empty(tuns)) {
>>          free(tuns);
>>          return;
>> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>>                        const struct sbrec_chassis *chassis,
>>                        const struct physical_debug *debug,
>>                        const struct if_status_mgr *if_mgr,
>> +                      size_t n_encap_ips,
>> +                      const char **encap_ips,
>>                        struct ovn_desired_flow_table *flow_table,
>>                        struct ofpbuf *ofpacts_p)
>>  {
>> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>>          ofpact_put_CT_CLEAR(ofpacts_p);
>>          put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
>>          put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
>> -        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
>> +        put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
>>          struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
>> -        load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
>> +        load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips,
>> +                                      encap_ips, ofpacts_p);
>>          put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
>>          put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
>>          for (int i = 0; i < MFF_N_LOG_REGS; i++) {
>> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>>           * as we're going to remove this with ofpbuf_pull() later. */
>>          uint32_t ofpacts_orig_size = ofpacts_p->size;
>>
>> -        load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p);
>> +        load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips,
>> +                                      encap_ips, ofpacts_p);
>>
>>          if (!strcmp(binding->type, "localport")) {
>>              /* mark the packet as incoming from a localport */
>> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>>      } else {
>>          put_remote_port_redirect_overlay(
>>              binding, mff_ovn_geneve, port_key, &match, ofpacts_p,
>> -            chassis, chassis_tunnels, flow_table);
>> +            chassis, chassis_tunnels, n_encap_ips, encap_ips,
flow_table);
>>      }
>>  out:
>>      if (ha_ch_ordered) {
>> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>>
>>          int zone_id = simap_get(ct_zones, port->logical_port);
>>          if (zone_id) {
>> -            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts);
>> +            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts);
>>          }
>>
>>          const char *lport_name = (port->parent_port &&
*port->parent_port) ?
>> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx
*p_ctx,
>>                            p_ctx->patch_ofports,
>>                            p_ctx->chassis_tunnels,
>>                            pb, p_ctx->chassis, &p_ctx->debug,
>> -                          p_ctx->if_mgr, flow_table, &ofpacts);
>> +                          p_ctx->if_mgr,
>> +                          p_ctx->n_encap_ips,
>> +                          p_ctx->encap_ips,
>> +                          flow_table, &ofpacts);
>>      ofpbuf_uninit(&ofpacts);
>>  }
>>
>> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx,
>>                                p_ctx->patch_ofports,
>>                                p_ctx->chassis_tunnels, binding,
>>                                p_ctx->chassis, &p_ctx->debug,
>> -                              p_ctx->if_mgr, flow_table, &ofpacts);
>> +                              p_ctx->if_mgr,
>> +                              p_ctx->n_encap_ips,
>> +                              p_ctx->encap_ips,
>> +                              flow_table, &ofpacts);
>>      }
>>
>>      /* Handle output to multicast groups, in tables 40 and 41. */
>> diff --git a/controller/physical.h b/controller/physical.h
>> index 1f1ed55efa16..7fe8ee3c18ed 100644
>> --- a/controller/physical.h
>> +++ b/controller/physical.h
>> @@ -66,6 +66,8 @@ struct physical_ctx {
>>      struct shash *local_bindings;
>>      struct simap *patch_ofports;
>>      struct hmap *chassis_tunnels;
>> +    size_t n_encap_ips;
>> +    const char **encap_ips;
>>      struct physical_debug debug;
>>  };
>>
>> diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h
>> index 272277ec4fa0..d3455a462a0d 100644
>> --- a/include/ovn/logical-fields.h
>> +++ b/include/ovn/logical-fields.h
>> @@ -37,7 +37,9 @@ enum ovn_controller_event {
>>  #define MFF_LOG_SNAT_ZONE  MFF_REG12  /* conntrack snat zone for
gateway router
>>                                         * (32 bits). */
>>  #define MFF_LOG_CT_ZONE    MFF_REG13  /* Logical conntrack zone for
lports
>> -                                       * (32 bits). */
>> +                                       * (0..15 of the 32 bits). */
>> +#define MFF_LOG_ENCAP_ID   MFF_REG13  /* Encap ID for lports
>> +                                       * (16..31 of the 32 bits). */
>>  #define MFF_LOG_INPORT     MFF_REG14  /* Logical input port (32 bits).
*/
>>  #define MFF_LOG_OUTPORT    MFF_REG15  /* Logical output port (32 bits).
*/
>>
>> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
>> index 96294fe2b795..bfd8680cedfc 100644
>> --- a/ovn-architecture.7.xml
>> +++ b/ovn-architecture.7.xml
>> @@ -1247,8 +1247,8 @@
>>        chassis.  This is initialized to 0 at the beginning of the logical
>>          <!-- Keep the following in sync with MFF_LOG_CT_ZONE in
>>               ovn/lib/logical-fields.h. -->
>> -      ingress pipeline.  OVN stores this in Open vSwitch extension
register
>> -      number 13.
>> +      ingress pipeline.  OVN stores this in the lower 16 bits of the
Open
>> +      vSwitch extension register number 13.
>>      </dd>
>>
>>      <dt>conntrack zone fields for routers</dt>
>> @@ -1263,6 +1263,20 @@
>>        traffic (for SNATing) in Open vSwitch extension register number
12.
>>      </dd>
>>
>> +    <dt>Encap ID for logical ports</dt>
>> +    <dd>
>> +      A field that records an ID that indicates which encapsulation IP
should
>> +      be used when sending packets to a remote chassis, according to the
>> +      original input logical port. This is useful when there are
multiple IPs
>> +      available for encapsulation. The value only has local
significance and is
>> +      not meaningful between chassis. This is initialized to 0 at the
beginning
>> +      of the logical
>> +        <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in
>> +             ovn/lib/logical-fields.h. -->
>> +      ingress pipeline.  OVN stores this in the higher 16 bits of the
Open
>> +      vSwitch extension register number 13.
>> +    </dd>
>> +
>>      <dt>logical flow flags</dt>
>>      <dd>
>>        The logical flags are intended to handle keeping context between
>> diff --git a/tests/ovn.at b/tests/ovn.at
>> index 243fe0b8246c..e7f0c9681f60 100644
>> --- a/tests/ovn.at
>> +++ b/tests/ovn.at
>> @@ -30335,19 +30335,33 @@ AT_CLEANUP
>>
>>
>>  OVN_FOR_EACH_NORTHD([
>> -AT_SETUP([multiple encap ips tunnel creation])
>> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip])
>> +AT_SKIP_IF([test $HAVE_SCAPY = no])
>>  ovn_start
>>  net_add n1
>>
>> +ovn-nbctl ls-add ls1
>> +
>>  # 2 HVs, each with 2 encap-ips.
>> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY.
>>  for i in 1 2; do
>>      sim_add hv$i
>>      as hv$i
>>      ovs-vsctl add-br br-phys-$j
>>      ovn_attach n1 br-phys-$j 192.168.0.${i}1
>>      ovs-vsctl set open .
external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2
>> +
>> +    for j in 1 2; do
>> +        ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \
>> +            external_ids:iface-id=lsp$i$j \
>> +            external_ids:encap-ip=192.168.0.$i$j \
>> +            options:tx_pcap=hv$i/vif$i$j-tx.pcap
options:rxq_pcap=hv$i/vif$i$j-rx.pcap
>> +        ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j
"f0:00:00:00:00:$i$j 10.0.0.$i$j"
>> +
>> +    done
>>  done
>>
>> +wait_for_ports_up
>>  check ovn-nbctl --wait=hv sync
>>
>>  check_tunnel_port() {
>> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12
%192.168.0.21
>>  check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22
>>  check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22
>>
>> +vif_to_hv() {
>> +    case $1 in dnl (
>> +        vif[[1]]?) echo hv1 ;; dnl (
>> +        vif[[2]]?) echo hv2 ;; dnl (
>> +        *) AT_FAIL_IF([:]) ;;
>> +    esac
>> +}
>> +
>> +# check_packet_tunnel SRC_VIF DST_VIF
>> +# Verify that a packet from SRC_VIF to DST_VIF goes through the
corresponding
>> +# tunnel that matches the VIFs' encap_ip configurations.
>> +check_packet_tunnel() {
>> +    local src=$1 dst=$2
>> +    local src_mac=f0:00:00:00:00:$src
>> +    local dst_mac=f0:00:00:00:00:$dst
>> +    local src_ip=10.0.0.$src
>> +    local dst_ip=10.0.0.$dst
>> +    local local_encap_ip=192.168.0.$src
>> +    local remote_encap_ip=192.168.0.$dst
>> +    local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \
>> +                            IP(dst='${dst_ip}', src='${src_ip}')/ \
>> +                            ICMP(type=8)")
>> +    hv=`vif_to_hv vif$src`
>> +    as $hv
>> +    echo "vif$src -> vif$dst should go through tunnel $local_encap_ip
-> $remote_encap_ip"
>> +    tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface
options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip)
>> +    AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src
$packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport])
>> +}
>> +
>> +for i in 1 2; do
>> +    for j in 1 2; do
>> +        check_packet_tunnel 1$i 2$j
>> +    done
>> +done
>> +
>>  OVN_CLEANUP([hv1],[hv2])
>>  AT_CLEANUP
>>  ])
>> --
>> 2.38.1
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>
> Thanks,
> Ales
>
> --
>
> Ales Musil
>
> Senior Software Engineer - OVN Core
>
> Red Hat EMEA
>
> amusil@redhat.com
Ales Musil Jan. 26, 2024, 6:54 a.m. UTC | #3
On Fri, Jan 26, 2024 at 4:07 AM Han Zhou <hzhou@ovn.org> wrote:

>
>
> On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote:
> >
> >
> >
> > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote:
> >>
> >> Commit dd527a283cd8 partially supported multiple encap IPs. It supported
> >> remote encap IP selection based on the destination VIF's encap_ip
> >> configuration. This patch adds the support for selecting local encap IP
> >> based on the source VIF's encap_ip configuration.
> >>
> >> Co-authored-by: Lei Huang <leih@nvidia.com>
> >> Signed-off-by: Lei Huang <leih@nvidia.com>
> >> Signed-off-by: Han Zhou <hzhou@ovn.org>
> >> ---
> >
> >
> > Hi Han and Lei,
> >
> > thank you for the patch, I have a couple of comments/questions down
> below.
>
>
> Thanks Ales.
>
> >
> >
> >>  NEWS                            |   3 +
> >>  controller/chassis.c            |   2 +-
> >>  controller/local_data.c         |   2 +-
> >>  controller/local_data.h         |   2 +-
> >>  controller/ovn-controller.8.xml |  30 ++++++-
> >>  controller/ovn-controller.c     |  49 ++++++++++++
> >>  controller/physical.c           | 134 ++++++++++++++++++++++----------
> >>  controller/physical.h           |   2 +
> >>  include/ovn/logical-fields.h    |   4 +-
> >>  ovn-architecture.7.xml          |  18 ++++-
> >>  tests/ovn.at                    |  51 +++++++++++-
> >>  11 files changed, 243 insertions(+), 54 deletions(-)
> >>
> >> diff --git a/NEWS b/NEWS
> >> index 5f267b4c64cc..5a3eed608617 100644
> >> --- a/NEWS
> >> +++ b/NEWS
> >> @@ -14,6 +14,9 @@ Post v23.09.0
> >>    - ovn-northd-ddlog has been removed.
> >>    - A new LSP option "enable_router_port_acl" has been added to enable
> >>      conntrack for the router port whose peer is l3dgw_port if set it
> true.
> >> +  - Support selecting encapsulation IP based on the source/destination
> VIF's
> >> +    settting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for
> more
> >> +    details.
> >>
> >>  OVN v23.09.0 - 15 Sep 2023
> >>  --------------------------
> >> diff --git a/controller/chassis.c b/controller/chassis.c
> >> index a6f13ccc42d5..55f2beb37674 100644
> >> --- a/controller/chassis.c
> >> +++ b/controller/chassis.c
> >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg {
> >>
> >>      /* Set of encap types parsed from the 'ovn-encap-type'
> external-id. */
> >>      struct sset encap_type_set;
> >> -    /* Set of encap IPs parsed from the 'ovn-encap-type' external-id.
> */
> >> +    /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */
> >>      struct sset encap_ip_set;
> >>      /* Interface type list formatted in the OVN-SB Chassis required
> format. */
> >>      struct ds iface_types;
> >> diff --git a/controller/local_data.c b/controller/local_data.c
> >> index a9092783958f..8606414f8728 100644
> >> --- a/controller/local_data.c
> >> +++ b/controller/local_data.c
> >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap
> *chassis_tunnels)
> >>   */
> >>  struct chassis_tunnel *
> >>  chassis_tunnel_find(const struct hmap *chassis_tunnels, const char
> *chassis_id,
> >> -                    char *remote_encap_ip, char *local_encap_ip)
> >> +                    char *remote_encap_ip, const char *local_encap_ip)
> >
> >
> > nit: Unrelated change.
>
>
> Ack
>
> >
> >
> >>  {
> >>      /*
> >>       * If the specific encap_ip is given, look for the chassisid_ip
> entry,
> >> diff --git a/controller/local_data.h b/controller/local_data.h
> >> index bab95bcc3824..ca3905bd20e6 100644
> >> --- a/controller/local_data.h
> >> +++ b/controller/local_data.h
> >> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes(
> >>  struct chassis_tunnel *chassis_tunnel_find(const struct hmap
> *chassis_tunnels,
> >>                                             const char *chassis_id,
> >>                                             char *remote_encap_ip,
> >> -                                           char *local_encap_ip);
> >> +                                           const char *local_encap_ip);
> >
> >
> > Same as above.
>
>
> Ack
>
> >
> >
> >>
> >>  bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels,
> >>                                 const char *chassis_name,
> >> diff --git a/controller/ovn-controller.8.xml
> b/controller/ovn-controller.8.xml
> >> index efa65e3fd927..5ebef048d721 100644
> >> --- a/controller/ovn-controller.8.xml
> >> +++ b/controller/ovn-controller.8.xml
> >> @@ -176,10 +176,32 @@
> >>
> >>        <dt><code>external_ids:ovn-encap-ip</code></dt>
> >>        <dd>
> >> -        The IP address that a chassis should use to connect to this
> node
> >> -        using encapsulation types specified by
> >> -        <code>external_ids:ovn-encap-type</code>. Multiple
> encapsulation IPs
> >> -        may be specified with a comma-separated list.
> >> +        <p>
> >> +          The IP address that a chassis should use to connect to this
> node
> >> +          using encapsulation types specified by
> >> +          <code>external_ids:ovn-encap-type</code>. Multiple
> encapsulation IPs
> >> +          may be specified with a comma-separated list.
> >> +        </p>
> >> +        <p>
> >> +          In scenarios where multiple encapsulation IPs are present,
> distinct
> >> +          tunnels are established for each remote chassis. These
> tunnels are
> >> +          differentiated by setting unique
> <code>options:local_ip</code> and
> >> +          <code>options:remote_ip</code> values in the tunnel
> interface. When
> >> +          transmitting a packet to a remote chassis, the selection of
> local_ip
> >> +          is guided by the
> <code>Interface:external_ids:encap-ip</code> from
> >> +          the local OVSDB, corresponding to the VIF originating the
> packet, if
> >> +          specified. The <code>Interface:external_ids:encap-ip</code>
> setting
> >> +          of the VIF is also populated to the <code>Port_Binding</code>
> >> +          table in the OVN SB database via the <code>encap</code>
> column.
> >> +          Consequently, when a remote chassis needs to send a packet
> to a
> >> +          port-binding associated with this VIF, it utilizes the
> tunnel with
> >> +          the appropriate <code>options:remote_ip</code> that matches
> the
> >> +          <code>ip</code> in <code>Port_Binding:encap</code>. This
> mechanism
> >> +          is particularly beneficial for chassis with multiple physical
> >> +          interfaces designated for tunneling, where each interface is
> >> +          optimized for handling specific traffic associated with
> particular
> >> +          VIFs.
> >> +        </p>
> >>        </dd>
> >>
> >>        <dt><code>external_ids:ovn-encap-df_default</code></dt>
> >> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> >> index 856e5e270822..4ab57fe970fe 100644
> >> --- a/controller/ovn-controller.c
> >> +++ b/controller/ovn-controller.c
> >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output {
> >>      struct physical_debug debug;
> >>  };
> >>
> >> +static void
> >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table,
> >> +                size_t *n_encap_ips, const char * **encap_ips)
> >> +{
> >> +    const struct ovsrec_open_vswitch *cfg =
> >> +        ovsrec_open_vswitch_table_first(ovs_table);
> >> +    const char *chassis_id = get_ovs_chassis_id(ovs_table);
> >> +    const char *encap_ips_str =
> >> +        get_chassis_external_id_value(&cfg->external_ids, chassis_id,
> >> +                                      "ovn-encap-ip", NULL);
> >> +    struct sset encap_ip_set;
> >> +    sset_init(&encap_ip_set);
> >> +    sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");
> >> +
> >> +    /* Sort the ips so that their index is deterministic. */
> >> +    *encap_ips = sset_sort(&encap_ip_set);
> >> +
> >> +    /* Copy the strings so that we can destroy the sset. */
> >> +    for (size_t i = 0; (*encap_ips)[i]; i++) {
> >> +        (*encap_ips)[i] = xstrdup((*encap_ips)[i]);
> >> +    }
> >> +    *n_encap_ips = sset_count(&encap_ip_set);
> >> +    sset_destroy(&encap_ip_set);
> >> +}
> >> +
> >
> >
> > I wonder, could we store this array or maybe preferably svec in
> "en_non_vif_data" I-P node? This way it would be handled in the node and we
> could avoid the destroy calls after any pflow processing WDYT?
>
>
> Yes we could do the same in en_non_vif_data, but I think it is not really
> necessary and it seems more straightforward just parsing it here, because:
> 1. We don't need I-P for this ovn-encap-ip configuration change, so we
> don't have to persist this data.
> 2. It should be very rare to have more than 5 (or even 3) encap IPs per
> node, so the list should be very small and the cost of this parsing (and
> sset construct/destroy) is negligible.
> Using svec is also a valid option, but the sset (and array) is used here
> just to reuse the sset_from_delimited_string and sset_sort for convenience.
> I noticed that the sset_init() call can in fact be removed because
> sset_from_delimited_string already does that. I can remove it.
> Does this sound reasonable to you?
>

It makes sense; the main thing it would help with is the need to destroy
the ctx in somewhat unexpected places, which might be slightly error-prone.
However, since it is functionally the same, it's fine either way.

Thanks,
Ales


>
> Thanks,
> Han
>
> >
> >>
> >>  static void init_physical_ctx(struct engine_node *node,
> >>                                struct ed_type_runtime_data *rt_data,
> >>                                struct ed_type_non_vif_data
> *non_vif_data,
> >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct engine_node
> *node,
> >>          engine_get_input_data("ct_zones", node);
> >>      struct simap *ct_zones = &ct_zones_data->current;
> >>
> >> +    parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips);
> >>      p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name;
> >>      p_ctx->sbrec_port_binding_by_datapath =
> sbrec_port_binding_by_datapath;
> >>      p_ctx->port_binding_table = port_binding_table;
> >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct
> engine_node *node,
> >>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
> >>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
> >>
> >> +
> >> +
> >>
> >
> > nit: Unrelated change.
>
>
> Ack
> >
> >
> >>
> >>      struct controller_engine_ctx *ctrl_ctx =
> engine_get_context()->client_ctx;
> >>      p_ctx->if_mgr = ctrl_ctx->if_mgr;
> >>
> >>      pflow_output_get_debug(node, &p_ctx->debug);
> >>  }
> >>
> >> +static void
> >> +destroy_physical_ctx(struct physical_ctx *p_ctx)
> >> +{
> >> +    for (size_t i = 0; i < p_ctx->n_encap_ips; i++) {
> >> +        free((char *)(p_ctx->encap_ips[i]));
> >> +    }
> >> +    free(p_ctx->encap_ips);
> >> +}
> >> +
> >>  static void *
> >>  en_pflow_output_init(struct engine_node *node OVS_UNUSED,
> >>                               struct engine_arg *arg OVS_UNUSED)
> >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node,
> void *data)
> >>      struct physical_ctx p_ctx;
> >>      init_physical_ctx(node, rt_data, non_vif_data, &p_ctx);
> >>      physical_run(&p_ctx, pflow_table);
> >> +    destroy_physical_ctx(&p_ctx);
> >>
> >>      engine_set_node_state(node, EN_UPDATED);
> >>  }
> >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct
> engine_node *node,
> >>                  bool removed = sbrec_port_binding_is_deleted(binding);
> >>                  if (!physical_handle_flows_for_lport(binding, removed,
> &p_ctx,
> >>
> &pfo->flow_table)) {
> >> +                    destroy_physical_ctx(&p_ctx);
> >>                      return false;
> >>                  }
> >>              }
> >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct
> engine_node *node,
> >>              bool removed = sbrec_port_binding_is_deleted(pb);
> >>              if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
> >>                                                   &pfo->flow_table)) {
> >> +                destroy_physical_ctx(&p_ctx);
> >>                  return false;
> >>              }
> >>          }
> >>          engine_set_node_state(node, EN_UPDATED);
> >>      }
> >> +    destroy_physical_ctx(&p_ctx);
> >>      return true;
> >>  }
> >>
> >> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct
> engine_node *node,
> >>          bool removed = sbrec_port_binding_is_deleted(pb);
> >>          if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
> >>                                               &pfo->flow_table)) {
> >> +            destroy_physical_ctx(&p_ctx);
> >>              return false;
> >>          }
> >>      }
> >>
> >>      engine_set_node_state(node, EN_UPDATED);
> >> +    destroy_physical_ctx(&p_ctx);
> >>      return true;
> >>  }
> >>
> >> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct
> engine_node *node, void *data)
> >>      physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table);
> >>
> >>      engine_set_node_state(node, EN_UPDATED);
> >> +    destroy_physical_ctx(&p_ctx);
> >>      return true;
> >>  }
> >>
> >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct
> engine_node *node, void *data)
> >>          if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) {
> >>              /* Fall back to full recompute when a local datapath
> >>               * is added or deleted. */
> >> +            destroy_physical_ctx(&p_ctx);
> >>              return false;
> >>          }
> >>
> >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct
> engine_node *node, void *data)
> >>                  lport->tracked_type == TRACKED_RESOURCE_REMOVED ?
> true: false;
> >>              if (!physical_handle_flows_for_lport(lport->pb, removed,
> &p_ctx,
> >>                                                   &pfo->flow_table)) {
> >> +                destroy_physical_ctx(&p_ctx);
> >>                  return false;
> >>              }
> >>          }
> >>      }
> >>
> >>      engine_set_node_state(node, EN_UPDATED);
> >> +    destroy_physical_ctx(&p_ctx);
> >>      return true;
> >>  }
> >>
> >> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct
> engine_node *node, void *data)
> >>          if (pb) {
> >>              if (!physical_handle_flows_for_lport(pb, false, &p_ctx,
> >>                                                   &pfo->flow_table)) {
> >> +                destroy_physical_ctx(&p_ctx);
> >>                  return false;
> >>              }
> >>              tag_port_as_activated_in_engine(pp);
> >>          }
> >>      }
> >>      engine_set_node_state(node, EN_UPDATED);
> >> +    destroy_physical_ctx(&p_ctx);
> >>      return true;
> >>  }
> >>
> >> diff --git a/controller/physical.c b/controller/physical.c
> >> index e93bfbd2cffb..c192aed751d5 100644
> >> --- a/controller/physical.c
> >> +++ b/controller/physical.c
> >> @@ -71,6 +71,8 @@ struct tunnel {
> >>  static void
> >>  load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
> >>                                const struct zone_ids *zone_ids,
> >> +                              size_t n_encap_ips,
> >> +                              const char **encap_ips,
> >>                                struct ofpbuf *ofpacts_p);
> >>  static int64_t get_vxlan_port_key(int64_t port_key);
> >>
> >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf
> *ofpacts)
> >>   */
> >>  static struct chassis_tunnel *
> >>  get_port_binding_tun(const struct sbrec_encap *remote_encap,
> >> -                     const struct sbrec_encap *local_encap,
> >>                       const struct sbrec_chassis *chassis,
> >> -                     const struct hmap *chassis_tunnels)
> >> +                     const struct hmap *chassis_tunnels,
> >> +                     const char *local_encap_ip)
> >>  {
> >>      struct chassis_tunnel *tun = NULL;
> >>      tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
> >>                                remote_encap ? remote_encap->ip : NULL,
> >> -                              local_encap ? local_encap->ip : NULL);
> >> +                              local_encap_ip);
> >>
> >>      if (!tun) {
> >>          tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
> NULL, NULL);
> >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct
> sbrec_port_binding *pb,
> >>  static struct ovs_list *
> >>  get_remote_tunnels(const struct sbrec_port_binding *binding,
> >>                     const struct sbrec_chassis *chassis,
> >> -                   const struct hmap *chassis_tunnels)
> >> +                   const struct hmap *chassis_tunnels,
> >> +                   const char *local_encap_ip)
> >>  {
> >>      const struct chassis_tunnel *tun;
> >>
> >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct sbrec_port_binding
> *binding,
> >>      ovs_list_init(tunnels);
> >>
> >>      if (binding->chassis && binding->chassis != chassis) {
> >> -        tun = get_port_binding_tun(binding->encap, NULL,
> binding->chassis,
> >> -                                   chassis_tunnels);
> >> +        tun = get_port_binding_tun(binding->encap, binding->chassis,
> >> +                chassis_tunnels, local_encap_ip);
> >>          if (!tun) {
> >>              static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5,
> 1);
> >>              VLOG_WARN_RL(
> >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct sbrec_port_binding
> *binding,
> >>          }
> >>          const struct sbrec_encap *additional_encap;
> >>          additional_encap = find_additional_encap_for_chassis(binding,
> chassis);
> >> -        tun = get_port_binding_tun(additional_encap, NULL,
> >> +        tun = get_port_binding_tun(additional_encap,
> >>                                     binding->additional_chassis[i],
> >> -                                   chassis_tunnels);
> >> +                                   chassis_tunnels, local_encap_ip);
> >>          if (!tun) {
> >>              static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5,
> 1);
> >>              VLOG_WARN_RL(
> >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct
> sbrec_port_binding *binding,
> >>                                   struct ofpbuf *ofpacts_p,
> >>                                   const struct sbrec_chassis *chassis,
> >>                                   const struct hmap *chassis_tunnels,
> >> +                                 size_t n_encap_ips,
> >> +                                 const char **encap_ips,
> >>                                   struct ovn_desired_flow_table
> *flow_table)
> >>  {
> >>      /* Setup encapsulation */
> >> -    struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> >> -                                               chassis_tunnels);
> >> -    if (!ovs_list_is_empty(tuns)) {
> >> -        bool is_vtep_port = !strcmp(binding->type, "vtep");
> >> -        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for
> ARP/ND
> >> -         * responder in L3 networks. */
> >> -        if (is_vtep_port) {
> >> -            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
> ofpacts_p);
> >> -        }
> >> +    for (size_t i = 0; i < n_encap_ips; i++) {
> >> +        struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p);
> >>
> >> +
> >> +        match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i <<
> 16,
> >> +                             (uint32_t) 0xFFFF << 16);
> >> +        struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> >> +                                                   chassis_tunnels,
> >> +                                                   encap_ips[i]);
> >> +        if (!ovs_list_is_empty(tuns)) {
> >> +            bool is_vtep_port = !strcmp(binding->type, "vtep");
> >> +            /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check
> for ARP/ND
> >> +             * responder in L3 networks. */
> >> +            if (is_vtep_port) {
> >> +                put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
> >> +                         ofpacts_clone);
> >> +            }
> >>
> >> -        struct tunnel *tun;
> >> -        LIST_FOR_EACH (tun, list_node, tuns) {
> >> -            put_encapsulation(mff_ovn_geneve, tun->tun,
> >> -                              binding->datapath, port_key,
> is_vtep_port,
> >> -                              ofpacts_p);
> >> -            ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport;
> >> +            struct tunnel *tun;
> >> +            LIST_FOR_EACH (tun, list_node, tuns) {
> >> +                put_encapsulation(mff_ovn_geneve, tun->tun,
> >> +                                  binding->datapath, port_key,
> is_vtep_port,
> >> +                                  ofpacts_clone);
> >> +                ofpact_put_OUTPUT(ofpacts_clone)->port =
> tun->tun->ofport;
> >> +            }
> >> +            put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone);
> >> +            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
> >> +                            binding->header_.uuid.parts[0], match,
> >> +                            ofpacts_clone, &binding->header_.uuid);
> >>          }
> >> -        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
> >> -        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
> >> -                        binding->header_.uuid.parts[0], match,
> ofpacts_p,
> >> -                        &binding->header_.uuid);
> >> -    }
> >> -    struct tunnel *tun_elem;
> >> -    LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
> >> -        free(tun_elem);
> >> +        struct tunnel *tun_elem;
> >> +        LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
> >> +            free(tun_elem);
> >> +        }
> >> +        free(tuns);
> >> +
> >> +        ofpbuf_delete(ofpacts_clone);
> >>      }
> >> -    free(tuns);
> >>  }
> >>
> >>  static void
> >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap
> *ct_zones,
> >>          if (tag) {
> >>              ofpact_put_STRIP_VLAN(ofpacts_p);
> >>          }
> >> -        load_logical_ingress_metadata(localnet_port, &zone_ids,
> ofpacts_p);
> >> +        load_logical_ingress_metadata(localnet_port, &zone_ids, 0,
> NULL,
> >> +                                      ofpacts_p);
> >>          replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p);
> >>          replace_mac->mac = router_port_mac;
> >>
> >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids *zone_ids,
> struct ofpbuf *ofpacts_p)
> >>  {
> >>      if (zone_ids) {
> >>          if (zone_ids->ct) {
> >> -            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
> >> +            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
> >>          }
> >>          if (zone_ids->dnat) {
> >>              put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32,
> ofpacts_p);
> >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key,
> >>      }
> >>  }
> >>
> >> +static size_t
> >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips,
> >> +               const char *ip)
> >> +{
> >> +    for (size_t i = 0; i < n_encap_ips; i++) {
> >> +        if (!strcmp(ip, encap_ips[i])) {
> >> +            return i;
> >> +        }
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >>  static void
> >>  load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
> >>                                const struct zone_ids *zone_ids,
> >> +                              size_t n_encap_ips,
> >> +                              const char **encap_ips,
> >>                                struct ofpbuf *ofpacts_p)
> >>  {
> >>      put_zones_ofpacts(zone_ids, ofpacts_p);
> >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct
> sbrec_port_binding *binding,
> >>      uint32_t port_key = binding->tunnel_key;
> >>      put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p);
> >>      put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p);
> >> +
> >> +    /* Default encap_id 0. */
> >> +    size_t encap_id = 0;
> >> +    if (encap_ips && binding->encap) {
> >> +        encap_id = encap_ip_to_id(n_encap_ips, encap_ips,
> >> +                                  binding->encap->ip);
> >> +    }
> >> +    put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p);
> >>  }
> >>
> >>  static const struct sbrec_port_binding *
> >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct
> sbrec_port_binding *binding,
> >>      match_set_dl_type(&match, htons(ETH_TYPE_RARP));
> >>      match_set_in_port(&match, ofport);
> >>
> >> -    load_logical_ingress_metadata(binding, zone_ids, &ofpacts);
> >> +    load_logical_ingress_metadata(binding, zone_ids, 0, NULL,
> &ofpacts);
> >>
> >>      encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP,
> >>                           NX_CTLR_NO_METER, &ofpacts);
> >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports(
> >>      }
> >>
> >>      struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> >> -                                               chassis_tunnels);
> >> +                                               chassis_tunnels, NULL);
> >>      if (ovs_list_is_empty(tuns)) {
> >>          free(tuns);
> >>          return;
> >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >>                        const struct sbrec_chassis *chassis,
> >>                        const struct physical_debug *debug,
> >>                        const struct if_status_mgr *if_mgr,
> >> +                      size_t n_encap_ips,
> >> +                      const char **encap_ips,
> >>                        struct ovn_desired_flow_table *flow_table,
> >>                        struct ofpbuf *ofpacts_p)
> >>  {
> >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >>          ofpact_put_CT_CLEAR(ofpacts_p);
> >>          put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
> >>          put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
> >> -        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
> >> +        put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
> >>          struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
> >> -        load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
> >> +        load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips,
> >> +                                      encap_ips, ofpacts_p);
> >>          put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
> >>          put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
> >>          for (int i = 0; i < MFF_N_LOG_REGS; i++) {
> >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >>           * as we're going to remove this with ofpbuf_pull() later. */
> >>          uint32_t ofpacts_orig_size = ofpacts_p->size;
> >>
> >> -        load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p);
> >> +        load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips,
> >> +                                      encap_ips, ofpacts_p);
> >>
> >>          if (!strcmp(binding->type, "localport")) {
> >>              /* mark the packet as incoming from a localport */
> >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >>      } else {
> >>          put_remote_port_redirect_overlay(
> >>              binding, mff_ovn_geneve, port_key, &match, ofpacts_p,
> >> -            chassis, chassis_tunnels, flow_table);
> >> +            chassis, chassis_tunnels, n_encap_ips, encap_ips,
> flow_table);
> >>      }
> >>  out:
> >>      if (ha_ch_ordered) {
> >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >>
> >>          int zone_id = simap_get(ct_zones, port->logical_port);
> >>          if (zone_id) {
> >> -            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts);
> >> +            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts);
> >>          }
> >>
> >>          const char *lport_name = (port->parent_port &&
> *port->parent_port) ?
> >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx
> *p_ctx,
> >>                            p_ctx->patch_ofports,
> >>                            p_ctx->chassis_tunnels,
> >>                            pb, p_ctx->chassis, &p_ctx->debug,
> >> -                          p_ctx->if_mgr, flow_table, &ofpacts);
> >> +                          p_ctx->if_mgr,
> >> +                          p_ctx->n_encap_ips,
> >> +                          p_ctx->encap_ips,
> >> +                          flow_table, &ofpacts);
> >>      ofpbuf_uninit(&ofpacts);
> >>  }
> >>
> >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx,
> >>                                p_ctx->patch_ofports,
> >>                                p_ctx->chassis_tunnels, binding,
> >>                                p_ctx->chassis, &p_ctx->debug,
> >> -                              p_ctx->if_mgr, flow_table, &ofpacts);
> >> +                              p_ctx->if_mgr,
> >> +                              p_ctx->n_encap_ips,
> >> +                              p_ctx->encap_ips,
> >> +                              flow_table, &ofpacts);
> >>      }
> >>
> >>      /* Handle output to multicast groups, in tables 40 and 41. */
> >> diff --git a/controller/physical.h b/controller/physical.h
> >> index 1f1ed55efa16..7fe8ee3c18ed 100644
> >> --- a/controller/physical.h
> >> +++ b/controller/physical.h
> >> @@ -66,6 +66,8 @@ struct physical_ctx {
> >>      struct shash *local_bindings;
> >>      struct simap *patch_ofports;
> >>      struct hmap *chassis_tunnels;
> >> +    size_t n_encap_ips;
> >> +    const char **encap_ips;
> >>      struct physical_debug debug;
> >>  };
> >>
> >> diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h
> >> index 272277ec4fa0..d3455a462a0d 100644
> >> --- a/include/ovn/logical-fields.h
> >> +++ b/include/ovn/logical-fields.h
> >> @@ -37,7 +37,9 @@ enum ovn_controller_event {
> >>  #define MFF_LOG_SNAT_ZONE  MFF_REG12  /* conntrack snat zone for
> gateway router
> >>                                         * (32 bits). */
> >>  #define MFF_LOG_CT_ZONE    MFF_REG13  /* Logical conntrack zone for
> lports
> >> -                                       * (32 bits). */
> >> +                                       * (0..15 of the 32 bits). */
> >> +#define MFF_LOG_ENCAP_ID   MFF_REG13  /* Encap ID for lports
> >> +                                       * (16..31 of the 32 bits). */
> >>  #define MFF_LOG_INPORT     MFF_REG14  /* Logical input port (32 bits).
> */
> >>  #define MFF_LOG_OUTPORT    MFF_REG15  /* Logical output port (32
> bits). */
> >>
> >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
> >> index 96294fe2b795..bfd8680cedfc 100644
> >> --- a/ovn-architecture.7.xml
> >> +++ b/ovn-architecture.7.xml
> >> @@ -1247,8 +1247,8 @@
> >>        chassis.  This is initialized to 0 at the beginning of the
> logical
> >>          <!-- Keep the following in sync with MFF_LOG_CT_ZONE in
> >>               ovn/lib/logical-fields.h. -->
> >> -      ingress pipeline.  OVN stores this in Open vSwitch extension
> register
> >> -      number 13.
> >> +      ingress pipeline.  OVN stores this in the lower 16 bits of the
> Open
> >> +      vSwitch extension register number 13.
> >>      </dd>
> >>
> >>      <dt>conntrack zone fields for routers</dt>
> >> @@ -1263,6 +1263,20 @@
> >>        traffic (for SNATing) in Open vSwitch extension register number
> 12.
> >>      </dd>
> >>
> >> +    <dt>Encap ID for logical ports</dt>
> >> +    <dd>
> >> +      A field that records an ID that indicates which encapsulation IP
> should
> >> +      be used when sending packets to a remote chassis, according to
> the
> >> +      original input logical port. This is useful when there are
> multiple IPs
> >> +      available for encapsulation. The value only has local
> significance and is
> >> +      not meaningful between chassis. This is initialized to 0 at the
> beginning
> >> +      of the logical
> >> +        <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in
> >> +             ovn/lib/logical-fields.h. -->
> >> +      ingress pipeline.  OVN stores this in the higher 16 bits of the
> Open
> >> +      vSwitch extension register number 13.
> >> +    </dd>
> >> +
> >>      <dt>logical flow flags</dt>
> >>      <dd>
> >>        The logical flags are intended to handle keeping context between
> >> diff --git a/tests/ovn.at b/tests/ovn.at
> >> index 243fe0b8246c..e7f0c9681f60 100644
> >> --- a/tests/ovn.at
> >> +++ b/tests/ovn.at
> >> @@ -30335,19 +30335,33 @@ AT_CLEANUP
> >>
> >>
> >>  OVN_FOR_EACH_NORTHD([
> >> -AT_SETUP([multiple encap ips tunnel creation])
> >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip])
> >> +AT_SKIP_IF([test $HAVE_SCAPY = no])
> >>  ovn_start
> >>  net_add n1
> >>
> >> +ovn-nbctl ls-add ls1
> >> +
> >>  # 2 HVs, each with 2 encap-ips.
> >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY.
> >>  for i in 1 2; do
> >>      sim_add hv$i
> >>      as hv$i
> >>      ovs-vsctl add-br br-phys-$j
> >>      ovn_attach n1 br-phys-$j 192.168.0.${i}1
> >>      ovs-vsctl set open .
> external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2
> >> +
> >> +    for j in 1 2; do
> >> +        ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \
> >> +            external_ids:iface-id=lsp$i$j \
> >> +            external_ids:encap-ip=192.168.0.$i$j \
> >> +            options:tx_pcap=hv$i/vif$i$j-tx.pcap
> options:rxq_pcap=hv$i/vif$i$j-rx.pcap
> >> +        ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j
> "f0:00:00:00:00:$i$j 10.0.0.$i$j"
> >> +
> >> +    done
> >>  done
> >>
> >> +wait_for_ports_up
> >>  check ovn-nbctl --wait=hv sync
> >>
> >>  check_tunnel_port() {
> >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12
> %192.168.0.21
> >>  check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22
> >>  check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22
> >>
> >> +vif_to_hv() {
> >> +    case $1 in dnl (
> >> +        vif[[1]]?) echo hv1 ;; dnl (
> >> +        vif[[2]]?) echo hv2 ;; dnl (
> >> +        *) AT_FAIL_IF([:]) ;;
> >> +    esac
> >> +}
> >> +
> >> +# check_packet_tunnel SRC_VIF DST_VIF
> >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the
> corresponding
> >> +# tunnel that matches the VIFs' encap_ip configurations.
> >> +check_packet_tunnel() {
> >> +    local src=$1 dst=$2
> >> +    local src_mac=f0:00:00:00:00:$src
> >> +    local dst_mac=f0:00:00:00:00:$dst
> >> +    local src_ip=10.0.0.$src
> >> +    local dst_ip=10.0.0.$dst
> >> +    local local_encap_ip=192.168.0.$src
> >> +    local remote_encap_ip=192.168.0.$dst
> >> +    local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/
> \
> >> +                            IP(dst='${dst_ip}', src='${src_ip}')/ \
> >> +                            ICMP(type=8)")
> >> +    hv=`vif_to_hv vif$src`
> >> +    as $hv
> >> +    echo "vif$src -> vif$dst should go through tunnel $local_encap_ip
> -> $remote_encap_ip"
> >> +    tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface
> options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip)
> >> +    AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src
> $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport])
> >> +}
> >> +
> >> +for i in 1 2; do
> >> +    for j in 1 2; do
> >> +        check_packet_tunnel 1$i 2$j
> >> +    done
> >> +done
> >> +
> >>  OVN_CLEANUP([hv1],[hv2])
> >>  AT_CLEANUP
> >>  ])
> >> --
> >> 2.38.1
> >>
> >> _______________________________________________
> >> dev mailing list
> >> dev@openvswitch.org
> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>
> >
> > Thanks,
> > Ales
> >
> > --
> >
> > Ales Musil
> >
> > Senior Software Engineer - OVN Core
> >
> > Red Hat EMEA
> >
> > amusil@redhat.com
>
Han Zhou Jan. 26, 2024, 7:05 p.m. UTC | #4
On Thu, Jan 25, 2024 at 10:54 PM Ales Musil <amusil@redhat.com> wrote:
>
>
>
> On Fri, Jan 26, 2024 at 4:07 AM Han Zhou <hzhou@ovn.org> wrote:
>>
>>
>>
>> On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote:
>> >
>> >
>> >
>> > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote:
>> >>
>> >> Commit dd527a283cd8 partially supported multiple encap IPs. It
supported
>> >> remote encap IP selection based on the destination VIF's encap_ip
>> >> configuration. This patch adds the support for selecting local encap
IP
>> >> based on the source VIF's encap_ip configuration.
>> >>
>> >> Co-authored-by: Lei Huang <leih@nvidia.com>
>> >> Signed-off-by: Lei Huang <leih@nvidia.com>
>> >> Signed-off-by: Han Zhou <hzhou@ovn.org>
>> >> ---
>> >
>> >
>> > Hi Han and Lei,
>> >
>> > thank you for the patch, I have a couple of comments/questions down
below.
>>
>>
>> Thanks Ales.
>>
>> >
>> >
>> >>  NEWS                            |   3 +
>> >>  controller/chassis.c            |   2 +-
>> >>  controller/local_data.c         |   2 +-
>> >>  controller/local_data.h         |   2 +-
>> >>  controller/ovn-controller.8.xml |  30 ++++++-
>> >>  controller/ovn-controller.c     |  49 ++++++++++++
>> >>  controller/physical.c           | 134
++++++++++++++++++++++----------
>> >>  controller/physical.h           |   2 +
>> >>  include/ovn/logical-fields.h    |   4 +-
>> >>  ovn-architecture.7.xml          |  18 ++++-
>> >>  tests/ovn.at                    |  51 +++++++++++-
>> >>  11 files changed, 243 insertions(+), 54 deletions(-)
>> >>
>> >> diff --git a/NEWS b/NEWS
>> >> index 5f267b4c64cc..5a3eed608617 100644
>> >> --- a/NEWS
>> >> +++ b/NEWS
>> >> @@ -14,6 +14,9 @@ Post v23.09.0
>> >>    - ovn-northd-ddlog has been removed.
>> >>    - A new LSP option "enable_router_port_acl" has been added to
enable
>> >>      conntrack for the router port whose peer is l3dgw_port if set it
true.
>> >> +  - Support selecting encapsulation IP based on the
source/destination VIF's
>> >> +    setting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for
more
>> >> +    details.
>> >>
>> >>  OVN v23.09.0 - 15 Sep 2023
>> >>  --------------------------
>> >> diff --git a/controller/chassis.c b/controller/chassis.c
>> >> index a6f13ccc42d5..55f2beb37674 100644
>> >> --- a/controller/chassis.c
>> >> +++ b/controller/chassis.c
>> >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg {
>> >>
>> >>      /* Set of encap types parsed from the 'ovn-encap-type'
external-id. */
>> >>      struct sset encap_type_set;
>> >> -    /* Set of encap IPs parsed from the 'ovn-encap-type'
external-id. */
>> >> +    /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id.
*/
>> >>      struct sset encap_ip_set;
>> >>      /* Interface type list formatted in the OVN-SB Chassis required
format. */
>> >>      struct ds iface_types;
>> >> diff --git a/controller/local_data.c b/controller/local_data.c
>> >> index a9092783958f..8606414f8728 100644
>> >> --- a/controller/local_data.c
>> >> +++ b/controller/local_data.c
>> >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap
*chassis_tunnels)
>> >>   */
>> >>  struct chassis_tunnel *
>> >>  chassis_tunnel_find(const struct hmap *chassis_tunnels, const char
*chassis_id,
>> >> -                    char *remote_encap_ip, char *local_encap_ip)
>> >> +                    char *remote_encap_ip, const char
*local_encap_ip)
>> >
>> >
>> > nit: Unrelated change.
>>
>>
>> Ack

Hi Ales, sorry that I just realized this change is related, which is
because of the const char * array introduced in this patch that stores the
parsed encap_ips, and it makes sense to use const char * since we should
never expect this string to be modified in the function.

>>
>> >
>> >
>> >>  {
>> >>      /*
>> >>       * If the specific encap_ip is given, look for the chassisid_ip
entry,
>> >> diff --git a/controller/local_data.h b/controller/local_data.h
>> >> index bab95bcc3824..ca3905bd20e6 100644
>> >> --- a/controller/local_data.h
>> >> +++ b/controller/local_data.h
>> >> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes(
>> >>  struct chassis_tunnel *chassis_tunnel_find(const struct hmap
*chassis_tunnels,
>> >>                                             const char *chassis_id,
>> >>                                             char *remote_encap_ip,
>> >> -                                           char *local_encap_ip);
>> >> +                                           const char
*local_encap_ip);
>> >
>> >
>> > Same as above.
>>
>>
>> Ack
>>
>> >
>> >
>> >>
>> >>  bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels,
>> >>                                 const char *chassis_name,
>> >> diff --git a/controller/ovn-controller.8.xml
b/controller/ovn-controller.8.xml
>> >> index efa65e3fd927..5ebef048d721 100644
>> >> --- a/controller/ovn-controller.8.xml
>> >> +++ b/controller/ovn-controller.8.xml
>> >> @@ -176,10 +176,32 @@
>> >>
>> >>        <dt><code>external_ids:ovn-encap-ip</code></dt>
>> >>        <dd>
>> >> -        The IP address that a chassis should use to connect to this
node
>> >> -        using encapsulation types specified by
>> >> -        <code>external_ids:ovn-encap-type</code>. Multiple
encapsulation IPs
>> >> -        may be specified with a comma-separated list.
>> >> +        <p>
>> >> +          The IP address that a chassis should use to connect to
this node
>> >> +          using encapsulation types specified by
>> >> +          <code>external_ids:ovn-encap-type</code>. Multiple
encapsulation IPs
>> >> +          may be specified with a comma-separated list.
>> >> +        </p>
>> >> +        <p>
>> >> +          In scenarios where multiple encapsulation IPs are present,
distinct
>> >> +          tunnels are established for each remote chassis. These
tunnels are
>> >> +          differentiated by setting unique
<code>options:local_ip</code> and
>> >> +          <code>options:remote_ip</code> values in the tunnel
interface. When
>> >> +          transmitting a packet to a remote chassis, the selection
of local_ip
>> >> +          is guided by the
<code>Interface:external_ids:encap-ip</code> from
>> >> +          the local OVSDB, corresponding to the VIF originating the
packet, if
>> >> +          specified. The
<code>Interface:external_ids:encap-ip</code> setting
>> >> +          of the VIF is also populated to the
<code>Port_Binding</code>
>> >> +          table in the OVN SB database via the <code>encap</code>
column.
>> >> +          Consequently, when a remote chassis needs to send a packet
to a
>> >> +          port-binding associated with this VIF, it utilizes the
tunnel with
>> >> +          the appropriate <code>options:remote_ip</code> that
matches the
>> >> +          <code>ip</code> in <code>Port_Binding:encap</code>. This
mechanism
>> >> +          is particularly beneficial for chassis with multiple
physical
>> >> +          interfaces designated for tunneling, where each interface
is
>> >> +          optimized for handling specific traffic associated with
particular
>> >> +          VIFs.
>> >> +        </p>
>> >>        </dd>
>> >>
>> >>        <dt><code>external_ids:ovn-encap-df_default</code></dt>
>> >> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
>> >> index 856e5e270822..4ab57fe970fe 100644
>> >> --- a/controller/ovn-controller.c
>> >> +++ b/controller/ovn-controller.c
>> >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output {
>> >>      struct physical_debug debug;
>> >>  };
>> >>
>> >> +static void
>> >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table,
>> >> +                size_t *n_encap_ips, const char * **encap_ips)
>> >> +{
>> >> +    const struct ovsrec_open_vswitch *cfg =
>> >> +        ovsrec_open_vswitch_table_first(ovs_table);
>> >> +    const char *chassis_id = get_ovs_chassis_id(ovs_table);
>> >> +    const char *encap_ips_str =
>> >> +        get_chassis_external_id_value(&cfg->external_ids, chassis_id,
>> >> +                                      "ovn-encap-ip", NULL);
>> >> +    struct sset encap_ip_set;
>> >> +    sset_init(&encap_ip_set);
>> >> +    sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");
>> >> +
>> >> +    /* Sort the ips so that their index is deterministic. */
>> >> +    *encap_ips = sset_sort(&encap_ip_set);
>> >> +
>> >> +    /* Copy the strings so that we can destroy the sset. */
>> >> +    for (size_t i = 0; (*encap_ips)[i]; i++) {
>> >> +        (*encap_ips)[i] = xstrdup((*encap_ips)[i]);
>> >> +    }
>> >> +    *n_encap_ips = sset_count(&encap_ip_set);
>> >> +    sset_destroy(&encap_ip_set);
>> >> +}
>> >> +
>> >
>> >
>> > I wonder, could we store this array or maybe preferably svec in
"en_non_vif_data" I-P node? This way it would be handled in the node and we
could avoid the destroy calls after any pflow processing WDYT?
>>
>>
>> Yes we could do the same in en_non_vif_data, but I think it is not
really necessary and it seems more straightforward just parsing it here,
because:
>> 1. We don't need I-P for this ovn-encap-ip configuration change, so we
don't have to persist this data.
>> 2. It should be very rare to have more than 5 (or even 3) encap IPs per
node, so the list should be very small and the cost of this parsing (and
sset construct/destroy) is negligible.
>> Using svec is also a valid option, but the sset (and array) is used here
just to reuse the sset_from_delimited_string and sset_sort for convenience.
I noticed that the sset_init() call can in fact be removed because
sset_from_delimited_string already does that. I can remove it.
>> Does this sound reasonable to you?
>
>
> It makes sense, the main thing it would help with is the need to destroy
the ctx in kinda unexpected places which might be slightly error prone.
However, it being functionally the same it's fine either way.
>

Thanks Ales. So I will add the below small change. Would you give your Ack
if it looks good to you?
-----------------------
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 4ab57fe970fe..6873c02288c6 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -4532,7 +4532,6 @@ parse_encap_ips(const struct
ovsrec_open_vswitch_table *ovs_table,
         get_chassis_external_id_value(&cfg->external_ids, chassis_id,
                                       "ovn-encap-ip", NULL);
     struct sset encap_ip_set;
-    sset_init(&encap_ip_set);
     sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");

     /* Sort the ips so that their index is deterministic. */
@@ -4615,8 +4614,6 @@ static void init_physical_ctx(struct engine_node
*node,
     p_ctx->patch_ofports = &non_vif_data->patch_ofports;
     p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;

-
-
     struct controller_engine_ctx *ctrl_ctx =
engine_get_context()->client_ctx;
     p_ctx->if_mgr = ctrl_ctx->if_mgr;
-------------------------

Thanks,
Han


> Thanks,
> Ales
>
>>
>>
>> Thanks,
>> Han
>>
>> >
>> >>
>> >>  static void init_physical_ctx(struct engine_node *node,
>> >>                                struct ed_type_runtime_data *rt_data,
>> >>                                struct ed_type_non_vif_data
*non_vif_data,
>> >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct
engine_node *node,
>> >>          engine_get_input_data("ct_zones", node);
>> >>      struct simap *ct_zones = &ct_zones_data->current;
>> >>
>> >> +    parse_encap_ips(ovs_table, &p_ctx->n_encap_ips,
&p_ctx->encap_ips);
>> >>      p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name;
>> >>      p_ctx->sbrec_port_binding_by_datapath =
sbrec_port_binding_by_datapath;
>> >>      p_ctx->port_binding_table = port_binding_table;
>> >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct
engine_node *node,
>> >>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
>> >>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
>> >>
>> >> +
>> >> +
>> >>
>> >
>> > nit: Unrelated change.
>>
>>
>> Ack
>> >
>> >
>> >>
>> >>      struct controller_engine_ctx *ctrl_ctx =
engine_get_context()->client_ctx;
>> >>      p_ctx->if_mgr = ctrl_ctx->if_mgr;
>> >>
>> >>      pflow_output_get_debug(node, &p_ctx->debug);
>> >>  }
>> >>
>> >> +static void
>> >> +destroy_physical_ctx(struct physical_ctx *p_ctx)
>> >> +{
>> >> +    for (size_t i = 0; i < p_ctx->n_encap_ips; i++) {
>> >> +        free((char *)(p_ctx->encap_ips[i]));
>> >> +    }
>> >> +    free(p_ctx->encap_ips);
>> >> +}
>> >> +
>> >>  static void *
>> >>  en_pflow_output_init(struct engine_node *node OVS_UNUSED,
>> >>                               struct engine_arg *arg OVS_UNUSED)
>> >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node,
void *data)
>> >>      struct physical_ctx p_ctx;
>> >>      init_physical_ctx(node, rt_data, non_vif_data, &p_ctx);
>> >>      physical_run(&p_ctx, pflow_table);
>> >> +    destroy_physical_ctx(&p_ctx);
>> >>
>> >>      engine_set_node_state(node, EN_UPDATED);
>> >>  }
>> >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct
engine_node *node,
>> >>                  bool removed =
sbrec_port_binding_is_deleted(binding);
>> >>                  if (!physical_handle_flows_for_lport(binding,
removed, &p_ctx,
>> >>
&pfo->flow_table)) {
>> >> +                    destroy_physical_ctx(&p_ctx);
>> >>                      return false;
>> >>                  }
>> >>              }
>> >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct
engine_node *node,
>> >>              bool removed = sbrec_port_binding_is_deleted(pb);
>> >>              if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
>> >>                                                   &pfo->flow_table)) {
>> >> +                destroy_physical_ctx(&p_ctx);
>> >>                  return false;
>> >>              }
>> >>          }
>> >>          engine_set_node_state(node, EN_UPDATED);
>> >>      }
>> >> +    destroy_physical_ctx(&p_ctx);
>> >>      return true;
>> >>  }
>> >>
>> >> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct
engine_node *node,
>> >>          bool removed = sbrec_port_binding_is_deleted(pb);
>> >>          if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
>> >>                                               &pfo->flow_table)) {
>> >> +            destroy_physical_ctx(&p_ctx);
>> >>              return false;
>> >>          }
>> >>      }
>> >>
>> >>      engine_set_node_state(node, EN_UPDATED);
>> >> +    destroy_physical_ctx(&p_ctx);
>> >>      return true;
>> >>  }
>> >>
>> >> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct
engine_node *node, void *data)
>> >>      physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table);
>> >>
>> >>      engine_set_node_state(node, EN_UPDATED);
>> >> +    destroy_physical_ctx(&p_ctx);
>> >>      return true;
>> >>  }
>> >>
>> >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct
engine_node *node, void *data)
>> >>          if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) {
>> >>              /* Fall back to full recompute when a local datapath
>> >>               * is added or deleted. */
>> >> +            destroy_physical_ctx(&p_ctx);
>> >>              return false;
>> >>          }
>> >>
>> >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct
engine_node *node, void *data)
>> >>                  lport->tracked_type == TRACKED_RESOURCE_REMOVED ?
true: false;
>> >>              if (!physical_handle_flows_for_lport(lport->pb, removed,
&p_ctx,
>> >>                                                   &pfo->flow_table)) {
>> >> +                destroy_physical_ctx(&p_ctx);
>> >>                  return false;
>> >>              }
>> >>          }
>> >>      }
>> >>
>> >>      engine_set_node_state(node, EN_UPDATED);
>> >> +    destroy_physical_ctx(&p_ctx);
>> >>      return true;
>> >>  }
>> >>
>> >> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct
engine_node *node, void *data)
>> >>          if (pb) {
>> >>              if (!physical_handle_flows_for_lport(pb, false, &p_ctx,
>> >>                                                   &pfo->flow_table)) {
>> >> +                destroy_physical_ctx(&p_ctx);
>> >>                  return false;
>> >>              }
>> >>              tag_port_as_activated_in_engine(pp);
>> >>          }
>> >>      }
>> >>      engine_set_node_state(node, EN_UPDATED);
>> >> +    destroy_physical_ctx(&p_ctx);
>> >>      return true;
>> >>  }
>> >>
>> >> diff --git a/controller/physical.c b/controller/physical.c
>> >> index e93bfbd2cffb..c192aed751d5 100644
>> >> --- a/controller/physical.c
>> >> +++ b/controller/physical.c
>> >> @@ -71,6 +71,8 @@ struct tunnel {
>> >>  static void
>> >>  load_logical_ingress_metadata(const struct sbrec_port_binding
*binding,
>> >>                                const struct zone_ids *zone_ids,
>> >> +                              size_t n_encap_ips,
>> >> +                              const char **encap_ips,
>> >>                                struct ofpbuf *ofpacts_p);
>> >>  static int64_t get_vxlan_port_key(int64_t port_key);
>> >>
>> >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf
*ofpacts)
>> >>   */
>> >>  static struct chassis_tunnel *
>> >>  get_port_binding_tun(const struct sbrec_encap *remote_encap,
>> >> -                     const struct sbrec_encap *local_encap,
>> >>                       const struct sbrec_chassis *chassis,
>> >> -                     const struct hmap *chassis_tunnels)
>> >> +                     const struct hmap *chassis_tunnels,
>> >> +                     const char *local_encap_ip)
>> >>  {
>> >>      struct chassis_tunnel *tun = NULL;
>> >>      tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
>> >>                                remote_encap ? remote_encap->ip : NULL,
>> >> -                              local_encap ? local_encap->ip : NULL);
>> >> +                              local_encap_ip);
>> >>
>> >>      if (!tun) {
>> >>          tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
NULL, NULL);
>> >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct
sbrec_port_binding *pb,
>> >>  static struct ovs_list *
>> >>  get_remote_tunnels(const struct sbrec_port_binding *binding,
>> >>                     const struct sbrec_chassis *chassis,
>> >> -                   const struct hmap *chassis_tunnels)
>> >> +                   const struct hmap *chassis_tunnels,
>> >> +                   const char *local_encap_ip)
>> >>  {
>> >>      const struct chassis_tunnel *tun;
>> >>
>> >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct
sbrec_port_binding *binding,
>> >>      ovs_list_init(tunnels);
>> >>
>> >>      if (binding->chassis && binding->chassis != chassis) {
>> >> -        tun = get_port_binding_tun(binding->encap, NULL,
binding->chassis,
>> >> -                                   chassis_tunnels);
>> >> +        tun = get_port_binding_tun(binding->encap, binding->chassis,
>> >> +                chassis_tunnels, local_encap_ip);
>> >>          if (!tun) {
>> >>              static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
>> >>              VLOG_WARN_RL(
>> >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct
sbrec_port_binding *binding,
>> >>          }
>> >>          const struct sbrec_encap *additional_encap;
>> >>          additional_encap =
find_additional_encap_for_chassis(binding, chassis);
>> >> -        tun = get_port_binding_tun(additional_encap, NULL,
>> >> +        tun = get_port_binding_tun(additional_encap,
>> >>                                     binding->additional_chassis[i],
>> >> -                                   chassis_tunnels);
>> >> +                                   chassis_tunnels, local_encap_ip);
>> >>          if (!tun) {
>> >>              static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
>> >>              VLOG_WARN_RL(
>> >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct
sbrec_port_binding *binding,
>> >>                                   struct ofpbuf *ofpacts_p,
>> >>                                   const struct sbrec_chassis *chassis,
>> >>                                   const struct hmap *chassis_tunnels,
>> >> +                                 size_t n_encap_ips,
>> >> +                                 const char **encap_ips,
>> >>                                   struct ovn_desired_flow_table
*flow_table)
>> >>  {
>> >>      /* Setup encapsulation */
>> >> -    struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
>> >> -                                               chassis_tunnels);
>> >> -    if (!ovs_list_is_empty(tuns)) {
>> >> -        bool is_vtep_port = !strcmp(binding->type, "vtep");
>> >> -        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for
ARP/ND
>> >> -         * responder in L3 networks. */
>> >> -        if (is_vtep_port) {
>> >> -            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
ofpacts_p);
>> >> -        }
>> >> +    for (size_t i = 0; i < n_encap_ips; i++) {
>> >> +        struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p);
>> >>
>> >> +
>> >> +        match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i
<< 16,
>> >> +                             (uint32_t) 0xFFFF << 16);
>> >> +        struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
>> >> +                                                   chassis_tunnels,
>> >> +                                                   encap_ips[i]);
>> >> +        if (!ovs_list_is_empty(tuns)) {
>> >> +            bool is_vtep_port = !strcmp(binding->type, "vtep");
>> >> +            /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check
for ARP/ND
>> >> +             * responder in L3 networks. */
>> >> +            if (is_vtep_port) {
>> >> +                put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
>> >> +                         ofpacts_clone);
>> >> +            }
>> >>
>> >> -        struct tunnel *tun;
>> >> -        LIST_FOR_EACH (tun, list_node, tuns) {
>> >> -            put_encapsulation(mff_ovn_geneve, tun->tun,
>> >> -                              binding->datapath, port_key,
is_vtep_port,
>> >> -                              ofpacts_p);
>> >> -            ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport;
>> >> +            struct tunnel *tun;
>> >> +            LIST_FOR_EACH (tun, list_node, tuns) {
>> >> +                put_encapsulation(mff_ovn_geneve, tun->tun,
>> >> +                                  binding->datapath, port_key,
is_vtep_port,
>> >> +                                  ofpacts_clone);
>> >> +                ofpact_put_OUTPUT(ofpacts_clone)->port =
tun->tun->ofport;
>> >> +            }
>> >> +            put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone);
>> >> +            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
>> >> +                            binding->header_.uuid.parts[0], match,
>> >> +                            ofpacts_clone, &binding->header_.uuid);
>> >>          }
>> >> -        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
>> >> -        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
>> >> -                        binding->header_.uuid.parts[0], match,
ofpacts_p,
>> >> -                        &binding->header_.uuid);
>> >> -    }
>> >> -    struct tunnel *tun_elem;
>> >> -    LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
>> >> -        free(tun_elem);
>> >> +        struct tunnel *tun_elem;
>> >> +        LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
>> >> +            free(tun_elem);
>> >> +        }
>> >> +        free(tuns);
>> >> +
>> >> +        ofpbuf_delete(ofpacts_clone);
>> >>      }
>> >> -    free(tuns);
>> >>  }
>> >>
>> >>  static void
>> >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap
*ct_zones,
>> >>          if (tag) {
>> >>              ofpact_put_STRIP_VLAN(ofpacts_p);
>> >>          }
>> >> -        load_logical_ingress_metadata(localnet_port, &zone_ids,
ofpacts_p);
>> >> +        load_logical_ingress_metadata(localnet_port, &zone_ids, 0,
NULL,
>> >> +                                      ofpacts_p);
>> >>          replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p);
>> >>          replace_mac->mac = router_port_mac;
>> >>
>> >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids
*zone_ids, struct ofpbuf *ofpacts_p)
>> >>  {
>> >>      if (zone_ids) {
>> >>          if (zone_ids->ct) {
>> >> -            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32,
ofpacts_p);
>> >> +            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16,
ofpacts_p);
>> >>          }
>> >>          if (zone_ids->dnat) {
>> >>              put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32,
ofpacts_p);
>> >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key,
>> >>      }
>> >>  }
>> >>
>> >> +static size_t
>> >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips,
>> >> +               const char *ip)
>> >> +{
>> >> +    for (size_t i = 0; i < n_encap_ips; i++) {
>> >> +        if (!strcmp(ip, encap_ips[i])) {
>> >> +            return i;
>> >> +        }
>> >> +    }
>> >> +    return 0;
>> >> +}
>> >> +
>> >>  static void
>> >>  load_logical_ingress_metadata(const struct sbrec_port_binding
*binding,
>> >>                                const struct zone_ids *zone_ids,
>> >> +                              size_t n_encap_ips,
>> >> +                              const char **encap_ips,
>> >>                                struct ofpbuf *ofpacts_p)
>> >>  {
>> >>      put_zones_ofpacts(zone_ids, ofpacts_p);
>> >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct
sbrec_port_binding *binding,
>> >>      uint32_t port_key = binding->tunnel_key;
>> >>      put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p);
>> >>      put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p);
>> >> +
>> >> +    /* Default encap_id 0. */
>> >> +    size_t encap_id = 0;
>> >> +    if (encap_ips && binding->encap) {
>> >> +        encap_id = encap_ip_to_id(n_encap_ips, encap_ips,
>> >> +                                  binding->encap->ip);
>> >> +    }
>> >> +    put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p);
>> >>  }
>> >>
>> >>  static const struct sbrec_port_binding *
>> >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct
sbrec_port_binding *binding,
>> >>      match_set_dl_type(&match, htons(ETH_TYPE_RARP));
>> >>      match_set_in_port(&match, ofport);
>> >>
>> >> -    load_logical_ingress_metadata(binding, zone_ids, &ofpacts);
>> >> +    load_logical_ingress_metadata(binding, zone_ids, 0, NULL,
&ofpacts);
>> >>
>> >>      encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP,
>> >>                           NX_CTLR_NO_METER, &ofpacts);
>> >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports(
>> >>      }
>> >>
>> >>      struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
>> >> -                                               chassis_tunnels);
>> >> +                                               chassis_tunnels,
NULL);
>> >>      if (ovs_list_is_empty(tuns)) {
>> >>          free(tuns);
>> >>          return;
>> >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >>                        const struct sbrec_chassis *chassis,
>> >>                        const struct physical_debug *debug,
>> >>                        const struct if_status_mgr *if_mgr,
>> >> +                      size_t n_encap_ips,
>> >> +                      const char **encap_ips,
>> >>                        struct ovn_desired_flow_table *flow_table,
>> >>                        struct ofpbuf *ofpacts_p)
>> >>  {
>> >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >>          ofpact_put_CT_CLEAR(ofpacts_p);
>> >>          put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
>> >>          put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
>> >> -        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
>> >> +        put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
>> >>          struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
>> >> -        load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
>> >> +        load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips,
>> >> +                                      encap_ips, ofpacts_p);
>> >>          put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
>> >>          put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
>> >>          for (int i = 0; i < MFF_N_LOG_REGS; i++) {
>> >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >>           * as we're going to remove this with ofpbuf_pull() later. */
>> >>          uint32_t ofpacts_orig_size = ofpacts_p->size;
>> >>
>> >> -        load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p);
>> >> +        load_logical_ingress_metadata(binding, &zone_ids,
n_encap_ips,
>> >> +                                      encap_ips, ofpacts_p);
>> >>
>> >>          if (!strcmp(binding->type, "localport")) {
>> >>              /* mark the packet as incoming from a localport */
>> >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >>      } else {
>> >>          put_remote_port_redirect_overlay(
>> >>              binding, mff_ovn_geneve, port_key, &match, ofpacts_p,
>> >> -            chassis, chassis_tunnels, flow_table);
>> >> +            chassis, chassis_tunnels, n_encap_ips, encap_ips,
flow_table);
>> >>      }
>> >>  out:
>> >>      if (ha_ch_ordered) {
>> >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >>
>> >>          int zone_id = simap_get(ct_zones, port->logical_port);
>> >>          if (zone_id) {
>> >> -            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts);
>> >> +            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts);
>> >>          }
>> >>
>> >>          const char *lport_name = (port->parent_port &&
*port->parent_port) ?
>> >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct physical_ctx
*p_ctx,
>> >>                            p_ctx->patch_ofports,
>> >>                            p_ctx->chassis_tunnels,
>> >>                            pb, p_ctx->chassis, &p_ctx->debug,
>> >> -                          p_ctx->if_mgr, flow_table, &ofpacts);
>> >> +                          p_ctx->if_mgr,
>> >> +                          p_ctx->n_encap_ips,
>> >> +                          p_ctx->encap_ips,
>> >> +                          flow_table, &ofpacts);
>> >>      ofpbuf_uninit(&ofpacts);
>> >>  }
>> >>
>> >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx,
>> >>                                p_ctx->patch_ofports,
>> >>                                p_ctx->chassis_tunnels, binding,
>> >>                                p_ctx->chassis, &p_ctx->debug,
>> >> -                              p_ctx->if_mgr, flow_table, &ofpacts);
>> >> +                              p_ctx->if_mgr,
>> >> +                              p_ctx->n_encap_ips,
>> >> +                              p_ctx->encap_ips,
>> >> +                              flow_table, &ofpacts);
>> >>      }
>> >>
>> >>      /* Handle output to multicast groups, in tables 40 and 41. */
>> >> diff --git a/controller/physical.h b/controller/physical.h
>> >> index 1f1ed55efa16..7fe8ee3c18ed 100644
>> >> --- a/controller/physical.h
>> >> +++ b/controller/physical.h
>> >> @@ -66,6 +66,8 @@ struct physical_ctx {
>> >>      struct shash *local_bindings;
>> >>      struct simap *patch_ofports;
>> >>      struct hmap *chassis_tunnels;
>> >> +    size_t n_encap_ips;
>> >> +    const char **encap_ips;
>> >>      struct physical_debug debug;
>> >>  };
>> >>
>> >> diff --git a/include/ovn/logical-fields.h
b/include/ovn/logical-fields.h
>> >> index 272277ec4fa0..d3455a462a0d 100644
>> >> --- a/include/ovn/logical-fields.h
>> >> +++ b/include/ovn/logical-fields.h
>> >> @@ -37,7 +37,9 @@ enum ovn_controller_event {
>> >>  #define MFF_LOG_SNAT_ZONE  MFF_REG12  /* conntrack snat zone for
gateway router
>> >>                                         * (32 bits). */
>> >>  #define MFF_LOG_CT_ZONE    MFF_REG13  /* Logical conntrack zone for
lports
>> >> -                                       * (32 bits). */
>> >> +                                       * (0..15 of the 32 bits). */
>> >> +#define MFF_LOG_ENCAP_ID   MFF_REG13  /* Encap ID for lports
>> >> +                                       * (16..31 of the 32 bits). */
>> >>  #define MFF_LOG_INPORT     MFF_REG14  /* Logical input port (32
bits). */
>> >>  #define MFF_LOG_OUTPORT    MFF_REG15  /* Logical output port (32
bits). */
>> >>
>> >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
>> >> index 96294fe2b795..bfd8680cedfc 100644
>> >> --- a/ovn-architecture.7.xml
>> >> +++ b/ovn-architecture.7.xml
>> >> @@ -1247,8 +1247,8 @@
>> >>        chassis.  This is initialized to 0 at the beginning of the
logical
>> >>          <!-- Keep the following in sync with MFF_LOG_CT_ZONE in
>> >>               ovn/lib/logical-fields.h. -->
>> >> -      ingress pipeline.  OVN stores this in Open vSwitch extension
register
>> >> -      number 13.
>> >> +      ingress pipeline.  OVN stores this in the lower 16 bits of the
Open
>> >> +      vSwitch extension register number 13.
>> >>      </dd>
>> >>
>> >>      <dt>conntrack zone fields for routers</dt>
>> >> @@ -1263,6 +1263,20 @@
>> >>        traffic (for SNATing) in Open vSwitch extension register
number 12.
>> >>      </dd>
>> >>
>> >> +    <dt>Encap ID for logical ports</dt>
>> >> +    <dd>
>> >> +      A field that records an ID that indicates which encapsulation
IP should
>> >> +      be used when sending packets to a remote chassis, according to
the
>> >> +      original input logical port. This is useful when there are
multiple IPs
>> >> +      available for encapsulation. The value only has local
significance and is
>> >> +      not meaningful between chassis. This is initialized to 0 at
the beginning
>> >> +      of the logical
>> >> +        <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in
>> >> +             ovn/lib/logical-fields.h. -->
>> >> +      ingress pipeline.  OVN stores this in the higher 16 bits of
the Open
>> >> +      vSwitch extension register number 13.
>> >> +    </dd>
>> >> +
>> >>      <dt>logical flow flags</dt>
>> >>      <dd>
>> >>        The logical flags are intended to handle keeping context
between
>> >> diff --git a/tests/ovn.at b/tests/ovn.at
>> >> index 243fe0b8246c..e7f0c9681f60 100644
>> >> --- a/tests/ovn.at
>> >> +++ b/tests/ovn.at
>> >> @@ -30335,19 +30335,33 @@ AT_CLEANUP
>> >>
>> >>
>> >>  OVN_FOR_EACH_NORTHD([
>> >> -AT_SETUP([multiple encap ips tunnel creation])
>> >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip])
>> >> +AT_SKIP_IF([test $HAVE_SCAPY = no])
>> >>  ovn_start
>> >>  net_add n1
>> >>
>> >> +ovn-nbctl ls-add ls1
>> >> +
>> >>  # 2 HVs, each with 2 encap-ips.
>> >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY.
>> >>  for i in 1 2; do
>> >>      sim_add hv$i
>> >>      as hv$i
>> >>      ovs-vsctl add-br br-phys-$j
>> >>      ovn_attach n1 br-phys-$j 192.168.0.${i}1
>> >>      ovs-vsctl set open .
external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2
>> >> +
>> >> +    for j in 1 2; do
>> >> +        ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \
>> >> +            external_ids:iface-id=lsp$i$j \
>> >> +            external_ids:encap-ip=192.168.0.$i$j \
>> >> +            options:tx_pcap=hv$i/vif$i$j-tx.pcap
options:rxq_pcap=hv$i/vif$i$j-rx.pcap
>> >> +        ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j
"f0:00:00:00:00:$i$j 10.0.0.$i$j"
>> >> +
>> >> +    done
>> >>  done
>> >>
>> >> +wait_for_ports_up
>> >>  check ovn-nbctl --wait=hv sync
>> >>
>> >>  check_tunnel_port() {
>> >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int hv1@192.168.0.12
%192.168.0.21
>> >>  check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22
>> >>  check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22
>> >>
>> >> +vif_to_hv() {
>> >> +    case $1 in dnl (
>> >> +        vif[[1]]?) echo hv1 ;; dnl (
>> >> +        vif[[2]]?) echo hv2 ;; dnl (
>> >> +        *) AT_FAIL_IF([:]) ;;
>> >> +    esac
>> >> +}
>> >> +
>> >> +# check_packet_tunnel SRC_VIF DST_VIF
>> >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the
corresponding
>> >> +# tunnel that matches the VIFs' encap_ip configurations.
>> >> +check_packet_tunnel() {
>> >> +    local src=$1 dst=$2
>> >> +    local src_mac=f0:00:00:00:00:$src
>> >> +    local dst_mac=f0:00:00:00:00:$dst
>> >> +    local src_ip=10.0.0.$src
>> >> +    local dst_ip=10.0.0.$dst
>> >> +    local local_encap_ip=192.168.0.$src
>> >> +    local remote_encap_ip=192.168.0.$dst
>> >> +    local packet=$(fmt_pkt "Ether(dst='${dst_mac}',
src='${src_mac}')/ \
>> >> +                            IP(dst='${dst_ip}', src='${src_ip}')/ \
>> >> +                            ICMP(type=8)")
>> >> +    hv=`vif_to_hv vif$src`
>> >> +    as $hv
>> >> +    echo "vif$src -> vif$dst should go through tunnel
$local_encap_ip -> $remote_encap_ip"
>> >> +    tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface
options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip)
>> >> +    AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src
$packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport])
>> >> +}
>> >> +
>> >> +for i in 1 2; do
>> >> +    for j in 1 2; do
>> >> +        check_packet_tunnel 1$i 2$j
>> >> +    done
>> >> +done
>> >> +
>> >>  OVN_CLEANUP([hv1],[hv2])
>> >>  AT_CLEANUP
>> >>  ])
>> >> --
>> >> 2.38.1
>> >>
>> >> _______________________________________________
>> >> dev mailing list
>> >> dev@openvswitch.org
>> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>> >>
>> >
>> > Thanks,
>> > Ales
>> >
>> > --
>> >
>> > Ales Musil
>> >
>> > Senior Software Engineer - OVN Core
>> >
>> > Red Hat EMEA
>> >
>> > amusil@redhat.com
>
>
>
> --
>
> Ales Musil
>
> Senior Software Engineer - OVN Core
>
> Red Hat EMEA
>
> amusil@redhat.com
Ales Musil Jan. 29, 2024, 10:40 a.m. UTC | #5
On Fri, Jan 26, 2024 at 8:05 PM Han Zhou <hzhou@ovn.org> wrote:

>
>
> On Thu, Jan 25, 2024 at 10:54 PM Ales Musil <amusil@redhat.com> wrote:
> >
> >
> >
> > On Fri, Jan 26, 2024 at 4:07 AM Han Zhou <hzhou@ovn.org> wrote:
> >>
> >>
> >>
> >> On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote:
> >> >
> >> >
> >> >
> >> > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote:
> >> >>
> >> >> Commit dd527a283cd8 partially supported multiple encap IPs. It
> supported
> >> >> remote encap IP selection based on the destination VIF's encap_ip
> >> >> configuration. This patch adds the support for selecting local encap
> IP
> >> >> based on the source VIF's encap_ip configuration.
> >> >>
> >> >> Co-authored-by: Lei Huang <leih@nvidia.com>
> >> >> Signed-off-by: Lei Huang <leih@nvidia.com>
> >> >> Signed-off-by: Han Zhou <hzhou@ovn.org>
> >> >> ---
> >> >
> >> >
> >> > Hi Han and Lei,
> >> >
> >> > thank you for the patch, I have a couple of comments/questions down
> below.
> >>
> >>
> >> Thanks Ales.
> >>
> >> >
> >> >
> >> >>  NEWS                            |   3 +
> >> >>  controller/chassis.c            |   2 +-
> >> >>  controller/local_data.c         |   2 +-
> >> >>  controller/local_data.h         |   2 +-
> >> >>  controller/ovn-controller.8.xml |  30 ++++++-
> >> >>  controller/ovn-controller.c     |  49 ++++++++++++
> >> >>  controller/physical.c           | 134
> ++++++++++++++++++++++----------
> >> >>  controller/physical.h           |   2 +
> >> >>  include/ovn/logical-fields.h    |   4 +-
> >> >>  ovn-architecture.7.xml          |  18 ++++-
> >> >>  tests/ovn.at                    |  51 +++++++++++-
> >> >>  11 files changed, 243 insertions(+), 54 deletions(-)
> >> >>
> >> >> diff --git a/NEWS b/NEWS
> >> >> index 5f267b4c64cc..5a3eed608617 100644
> >> >> --- a/NEWS
> >> >> +++ b/NEWS
> >> >> @@ -14,6 +14,9 @@ Post v23.09.0
> >> >>    - ovn-northd-ddlog has been removed.
> >> >>    - A new LSP option "enable_router_port_acl" has been added to
> enable
> >> >>      conntrack for the router port whose peer is l3dgw_port if set
> it true.
> >> >> +  - Support selecting encapsulation IP based on the
> source/destination VIF's
> >> >> +    setting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for
> more
> >> >> +    details.
> >> >>
> >> >>  OVN v23.09.0 - 15 Sep 2023
> >> >>  --------------------------
> >> >> diff --git a/controller/chassis.c b/controller/chassis.c
> >> >> index a6f13ccc42d5..55f2beb37674 100644
> >> >> --- a/controller/chassis.c
> >> >> +++ b/controller/chassis.c
> >> >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg {
> >> >>
> >> >>      /* Set of encap types parsed from the 'ovn-encap-type'
> external-id. */
> >> >>      struct sset encap_type_set;
> >> >> -    /* Set of encap IPs parsed from the 'ovn-encap-type'
> external-id. */
> >> >> +    /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id.
> */
> >> >>      struct sset encap_ip_set;
> >> >>      /* Interface type list formatted in the OVN-SB Chassis required
> format. */
> >> >>      struct ds iface_types;
> >> >> diff --git a/controller/local_data.c b/controller/local_data.c
> >> >> index a9092783958f..8606414f8728 100644
> >> >> --- a/controller/local_data.c
> >> >> +++ b/controller/local_data.c
> >> >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap
> *chassis_tunnels)
> >> >>   */
> >> >>  struct chassis_tunnel *
> >> >>  chassis_tunnel_find(const struct hmap *chassis_tunnels, const char
> *chassis_id,
> >> >> -                    char *remote_encap_ip, char *local_encap_ip)
> >> >> +                    char *remote_encap_ip, const char
> *local_encap_ip)
> >> >
> >> >
> >> > nit: Unrelated change.
> >>
> >>
> >> Ack
>
> Hi Ales, sorry that I just realized this change is related, which is
> because of the const char * array introduced in this patch that stores the
> parsed encap_ips, and it makes sense to use const char * since we should
> never expect this string to be modified in the function.
>
> >>
> >> >
> >> >
> >> >>  {
> >> >>      /*
> >> >>       * If the specific encap_ip is given, look for the chassisid_ip
> entry,
> >> >> diff --git a/controller/local_data.h b/controller/local_data.h
> >> >> index bab95bcc3824..ca3905bd20e6 100644
> >> >> --- a/controller/local_data.h
> >> >> +++ b/controller/local_data.h
> >> >> @@ -150,7 +150,7 @@ bool local_nonvif_data_handle_ovs_iface_changes(
> >> >>  struct chassis_tunnel *chassis_tunnel_find(const struct hmap
> *chassis_tunnels,
> >> >>                                             const char *chassis_id,
> >> >>                                             char *remote_encap_ip,
> >> >> -                                           char *local_encap_ip);
> >> >> +                                           const char
> *local_encap_ip);
> >> >
> >> >
> >> > Same as above.
> >>
> >>
> >> Ack
> >>
> >> >
> >> >
> >> >>
> >> >>  bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels,
> >> >>                                 const char *chassis_name,
> >> >> diff --git a/controller/ovn-controller.8.xml
> b/controller/ovn-controller.8.xml
> >> >> index efa65e3fd927..5ebef048d721 100644
> >> >> --- a/controller/ovn-controller.8.xml
> >> >> +++ b/controller/ovn-controller.8.xml
> >> >> @@ -176,10 +176,32 @@
> >> >>
> >> >>        <dt><code>external_ids:ovn-encap-ip</code></dt>
> >> >>        <dd>
> >> >> -        The IP address that a chassis should use to connect to this
> node
> >> >> -        using encapsulation types specified by
> >> >> -        <code>external_ids:ovn-encap-type</code>. Multiple
> encapsulation IPs
> >> >> -        may be specified with a comma-separated list.
> >> >> +        <p>
> >> >> +          The IP address that a chassis should use to connect to
> this node
> >> >> +          using encapsulation types specified by
> >> >> +          <code>external_ids:ovn-encap-type</code>. Multiple
> encapsulation IPs
> >> >> +          may be specified with a comma-separated list.
> >> >> +        </p>
> >> >> +        <p>
> >> >> +          In scenarios where multiple encapsulation IPs are
> present, distinct
> >> >> +          tunnels are established for each remote chassis. These
> tunnels are
> >> >> +          differentiated by setting unique
> <code>options:local_ip</code> and
> >> >> +          <code>options:remote_ip</code> values in the tunnel
> interface. When
> >> >> +          transmitting a packet to a remote chassis, the selection
> of local_ip
> >> >> +          is guided by the
> <code>Interface:external_ids:encap-ip</code> from
> >> >> +          the local OVSDB, corresponding to the VIF originating the
> packet, if
> >> >> +          specified. The
> <code>Interface:external_ids:encap-ip</code> setting
> >> >> +          of the VIF is also populated to the
> <code>Port_Binding</code>
> >> >> +          table in the OVN SB database via the <code>encap</code>
> column.
> >> >> +          Consequently, when a remote chassis needs to send a
> packet to a
> >> >> +          port-binding associated with this VIF, it utilizes the
> tunnel with
> >> >> +          the appropriate <code>options:remote_ip</code> that
> matches the
> >> >> +          <code>ip</code> in <code>Port_Binding:encap</code>. This
> mechanism
> >> >> +          is particularly beneficial for chassis with multiple
> physical
> >> >> +          interfaces designated for tunneling, where each interface
> is
> >> >> +          optimized for handling specific traffic associated with
> particular
> >> >> +          VIFs.
> >> >> +        </p>
> >> >>        </dd>
> >> >>
> >> >>        <dt><code>external_ids:ovn-encap-df_default</code></dt>
> >> >> diff --git a/controller/ovn-controller.c
> b/controller/ovn-controller.c
> >> >> index 856e5e270822..4ab57fe970fe 100644
> >> >> --- a/controller/ovn-controller.c
> >> >> +++ b/controller/ovn-controller.c
> >> >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output {
> >> >>      struct physical_debug debug;
> >> >>  };
> >> >>
> >> >> +static void
> >> >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table,
> >> >> +                size_t *n_encap_ips, const char * **encap_ips)
> >> >> +{
> >> >> +    const struct ovsrec_open_vswitch *cfg =
> >> >> +        ovsrec_open_vswitch_table_first(ovs_table);
> >> >> +    const char *chassis_id = get_ovs_chassis_id(ovs_table);
> >> >> +    const char *encap_ips_str =
> >> >> +        get_chassis_external_id_value(&cfg->external_ids,
> chassis_id,
> >> >> +                                      "ovn-encap-ip", NULL);
> >> >> +    struct sset encap_ip_set;
> >> >> +    sset_init(&encap_ip_set);
> >> >> +    sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");
> >> >> +
> >> >> +    /* Sort the ips so that their index is deterministic. */
> >> >> +    *encap_ips = sset_sort(&encap_ip_set);
> >> >> +
> >> >> +    /* Copy the strings so that we can destroy the sset. */
> >> >> +    for (size_t i = 0; (*encap_ips)[i]; i++) {
> >> >> +        (*encap_ips)[i] = xstrdup((*encap_ips)[i]);
> >> >> +    }
> >> >> +    *n_encap_ips = sset_count(&encap_ip_set);
> >> >> +    sset_destroy(&encap_ip_set);
> >> >> +}
> >> >> +
> >> >
> >> >
> >> > I wonder, could we store this array or maybe preferably svec in
> "en_non_vif_data" I-P node? This way it would be handled in the node and we
> could avoid the destroy calls after any pflow processing WDYT?
> >>
> >>
> >> Yes we could do the same in en_non_vif_data, but I think it is not
> really necessary and it seems more straightforward just parsing it here,
> because:
> >> 1. We don't need I-P for this ovn-encap-ip configuration change, so we
> don't have to persist this data.
> >> 2. It should be very rare to have more than 5 (or even 3) encap IPs per
> node, so the list should be very small and the cost of this parsing (and
> sset construct/destroy) is negligible.
> >> Using svec is also a valid option, but the sset (and array) is used
> here just to reuse the sset_from_delimited_string and sset_sort for
> convenience. I noticed that the sset_init() call can in fact be removed
> because sset_from_delimited_string already does that. I can remove it.
> >> Does this sound reasonable to you?
> >
> >
> > It makes sense, the main thing it would help with is the need to destroy
> the ctx in kinda unexpected places which might be slightly error prone.
> However, it being functionally the same it's fine either way.
> >
>
> Thanks Ales. So I will add the below small change. Would you give your Ack
> if it looks good to you?
> -----------------------
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index 4ab57fe970fe..6873c02288c6 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c
> @@ -4532,7 +4532,6 @@ parse_encap_ips(const struct
> ovsrec_open_vswitch_table *ovs_table,
>          get_chassis_external_id_value(&cfg->external_ids, chassis_id,
>                                        "ovn-encap-ip", NULL);
>      struct sset encap_ip_set;
> -    sset_init(&encap_ip_set);
>      sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");
>
>      /* Sort the ips so that their index is deterministic. */
> @@ -4615,8 +4614,6 @@ static void init_physical_ctx(struct engine_node
> *node,
>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
>
> -
> -
>      struct controller_engine_ctx *ctrl_ctx =
> engine_get_context()->client_ctx;
>      p_ctx->if_mgr = ctrl_ctx->if_mgr;
> -------------------------
>
> Thanks,
> Han
>


Yeah the diff looks ok.

Acked-by: Ales Musil <amusil@redhat.com>

Thanks,
Ales


>
> > Thanks,
> > Ales
> >
> >>
> >>
> >> Thanks,
> >> Han
> >>
> >> >
> >> >>
> >> >>  static void init_physical_ctx(struct engine_node *node,
> >> >>                                struct ed_type_runtime_data *rt_data,
> >> >>                                struct ed_type_non_vif_data
> *non_vif_data,
> >> >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct
> engine_node *node,
> >> >>          engine_get_input_data("ct_zones", node);
> >> >>      struct simap *ct_zones = &ct_zones_data->current;
> >> >>
> >> >> +    parse_encap_ips(ovs_table, &p_ctx->n_encap_ips,
> &p_ctx->encap_ips);
> >> >>      p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name;
> >> >>      p_ctx->sbrec_port_binding_by_datapath =
> sbrec_port_binding_by_datapath;
> >> >>      p_ctx->port_binding_table = port_binding_table;
> >> >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct
> engine_node *node,
> >> >>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
> >> >>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
> >> >>
> >> >> +
> >> >> +
> >> >>
> >> >
> >> > nit: Unrelated change.
> >>
> >>
> >> Ack
> >> >
> >> >
> >> >>
> >> >>      struct controller_engine_ctx *ctrl_ctx =
> engine_get_context()->client_ctx;
> >> >>      p_ctx->if_mgr = ctrl_ctx->if_mgr;
> >> >>
> >> >>      pflow_output_get_debug(node, &p_ctx->debug);
> >> >>  }
> >> >>
> >> >> +static void
> >> >> +destroy_physical_ctx(struct physical_ctx *p_ctx)
> >> >> +{
> >> >> +    for (size_t i = 0; i < p_ctx->n_encap_ips; i++) {
> >> >> +        free((char *)(p_ctx->encap_ips[i]));
> >> >> +    }
> >> >> +    free(p_ctx->encap_ips);
> >> >> +}
> >> >> +
> >> >>  static void *
> >> >>  en_pflow_output_init(struct engine_node *node OVS_UNUSED,
> >> >>                               struct engine_arg *arg OVS_UNUSED)
> >> >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node *node,
> void *data)
> >> >>      struct physical_ctx p_ctx;
> >> >>      init_physical_ctx(node, rt_data, non_vif_data, &p_ctx);
> >> >>      physical_run(&p_ctx, pflow_table);
> >> >> +    destroy_physical_ctx(&p_ctx);
> >> >>
> >> >>      engine_set_node_state(node, EN_UPDATED);
> >> >>  }
> >> >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct
> engine_node *node,
> >> >>                  bool removed =
> sbrec_port_binding_is_deleted(binding);
> >> >>                  if (!physical_handle_flows_for_lport(binding,
> removed, &p_ctx,
> >> >>
> &pfo->flow_table)) {
> >> >> +                    destroy_physical_ctx(&p_ctx);
> >> >>                      return false;
> >> >>                  }
> >> >>              }
> >> >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct
> engine_node *node,
> >> >>              bool removed = sbrec_port_binding_is_deleted(pb);
> >> >>              if (!physical_handle_flows_for_lport(pb, removed,
> &p_ctx,
> >> >>                                                   &pfo->flow_table))
> {
> >> >> +                destroy_physical_ctx(&p_ctx);
> >> >>                  return false;
> >> >>              }
> >> >>          }
> >> >>          engine_set_node_state(node, EN_UPDATED);
> >> >>      }
> >> >> +    destroy_physical_ctx(&p_ctx);
> >> >>      return true;
> >> >>  }
> >> >>
> >> >> @@ -4715,11 +4756,13 @@ pflow_output_sb_port_binding_handler(struct
> engine_node *node,
> >> >>          bool removed = sbrec_port_binding_is_deleted(pb);
> >> >>          if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
> >> >>                                               &pfo->flow_table)) {
> >> >> +            destroy_physical_ctx(&p_ctx);
> >> >>              return false;
> >> >>          }
> >> >>      }
> >> >>
> >> >>      engine_set_node_state(node, EN_UPDATED);
> >> >> +    destroy_physical_ctx(&p_ctx);
> >> >>      return true;
> >> >>  }
> >> >>
> >> >> @@ -4739,6 +4782,7 @@ pflow_output_sb_multicast_group_handler(struct
> engine_node *node, void *data)
> >> >>      physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table);
> >> >>
> >> >>      engine_set_node_state(node, EN_UPDATED);
> >> >> +    destroy_physical_ctx(&p_ctx);
> >> >>      return true;
> >> >>  }
> >> >>
> >> >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct
> engine_node *node, void *data)
> >> >>          if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) {
> >> >>              /* Fall back to full recompute when a local datapath
> >> >>               * is added or deleted. */
> >> >> +            destroy_physical_ctx(&p_ctx);
> >> >>              return false;
> >> >>          }
> >> >>
> >> >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct
> engine_node *node, void *data)
> >> >>                  lport->tracked_type == TRACKED_RESOURCE_REMOVED ?
> true: false;
> >> >>              if (!physical_handle_flows_for_lport(lport->pb,
> removed, &p_ctx,
> >> >>                                                   &pfo->flow_table))
> {
> >> >> +                destroy_physical_ctx(&p_ctx);
> >> >>                  return false;
> >> >>              }
> >> >>          }
> >> >>      }
> >> >>
> >> >>      engine_set_node_state(node, EN_UPDATED);
> >> >> +    destroy_physical_ctx(&p_ctx);
> >> >>      return true;
> >> >>  }
> >> >>
> >> >> @@ -4843,12 +4890,14 @@ pflow_output_activated_ports_handler(struct
> engine_node *node, void *data)
> >> >>          if (pb) {
> >> >>              if (!physical_handle_flows_for_lport(pb, false, &p_ctx,
> >> >>                                                   &pfo->flow_table))
> {
> >> >> +                destroy_physical_ctx(&p_ctx);
> >> >>                  return false;
> >> >>              }
> >> >>              tag_port_as_activated_in_engine(pp);
> >> >>          }
> >> >>      }
> >> >>      engine_set_node_state(node, EN_UPDATED);
> >> >> +    destroy_physical_ctx(&p_ctx);
> >> >>      return true;
> >> >>  }
> >> >>
> >> >> diff --git a/controller/physical.c b/controller/physical.c
> >> >> index e93bfbd2cffb..c192aed751d5 100644
> >> >> --- a/controller/physical.c
> >> >> +++ b/controller/physical.c
> >> >> @@ -71,6 +71,8 @@ struct tunnel {
> >> >>  static void
> >> >>  load_logical_ingress_metadata(const struct sbrec_port_binding
> *binding,
> >> >>                                const struct zone_ids *zone_ids,
> >> >> +                              size_t n_encap_ips,
> >> >> +                              const char **encap_ips,
> >> >>                                struct ofpbuf *ofpacts_p);
> >> >>  static int64_t get_vxlan_port_key(int64_t port_key);
> >> >>
> >> >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf
> *ofpacts)
> >> >>   */
> >> >>  static struct chassis_tunnel *
> >> >>  get_port_binding_tun(const struct sbrec_encap *remote_encap,
> >> >> -                     const struct sbrec_encap *local_encap,
> >> >>                       const struct sbrec_chassis *chassis,
> >> >> -                     const struct hmap *chassis_tunnels)
> >> >> +                     const struct hmap *chassis_tunnels,
> >> >> +                     const char *local_encap_ip)
> >> >>  {
> >> >>      struct chassis_tunnel *tun = NULL;
> >> >>      tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
> >> >>                                remote_encap ? remote_encap->ip :
> NULL,
> >> >> -                              local_encap ? local_encap->ip : NULL);
> >> >> +                              local_encap_ip);
> >> >>
> >> >>      if (!tun) {
> >> >>          tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
> NULL, NULL);
> >> >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct
> sbrec_port_binding *pb,
> >> >>  static struct ovs_list *
> >> >>  get_remote_tunnels(const struct sbrec_port_binding *binding,
> >> >>                     const struct sbrec_chassis *chassis,
> >> >> -                   const struct hmap *chassis_tunnels)
> >> >> +                   const struct hmap *chassis_tunnels,
> >> >> +                   const char *local_encap_ip)
> >> >>  {
> >> >>      const struct chassis_tunnel *tun;
> >> >>
> >> >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct
> sbrec_port_binding *binding,
> >> >>      ovs_list_init(tunnels);
> >> >>
> >> >>      if (binding->chassis && binding->chassis != chassis) {
> >> >> -        tun = get_port_binding_tun(binding->encap, NULL,
> binding->chassis,
> >> >> -                                   chassis_tunnels);
> >> >> +        tun = get_port_binding_tun(binding->encap, binding->chassis,
> >> >> +                chassis_tunnels, local_encap_ip);
> >> >>          if (!tun) {
> >> >>              static struct vlog_rate_limit rl =
> VLOG_RATE_LIMIT_INIT(5, 1);
> >> >>              VLOG_WARN_RL(
> >> >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct
> sbrec_port_binding *binding,
> >> >>          }
> >> >>          const struct sbrec_encap *additional_encap;
> >> >>          additional_encap =
> find_additional_encap_for_chassis(binding, chassis);
> >> >> -        tun = get_port_binding_tun(additional_encap, NULL,
> >> >> +        tun = get_port_binding_tun(additional_encap,
> >> >>                                     binding->additional_chassis[i],
> >> >> -                                   chassis_tunnels);
> >> >> +                                   chassis_tunnels, local_encap_ip);
> >> >>          if (!tun) {
> >> >>              static struct vlog_rate_limit rl =
> VLOG_RATE_LIMIT_INIT(5, 1);
> >> >>              VLOG_WARN_RL(
> >> >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const struct
> sbrec_port_binding *binding,
> >> >>                                   struct ofpbuf *ofpacts_p,
> >> >>                                   const struct sbrec_chassis
> *chassis,
> >> >>                                   const struct hmap *chassis_tunnels,
> >> >> +                                 size_t n_encap_ips,
> >> >> +                                 const char **encap_ips,
> >> >>                                   struct ovn_desired_flow_table
> *flow_table)
> >> >>  {
> >> >>      /* Setup encapsulation */
> >> >> -    struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> >> >> -                                               chassis_tunnels);
> >> >> -    if (!ovs_list_is_empty(tuns)) {
> >> >> -        bool is_vtep_port = !strcmp(binding->type, "vtep");
> >> >> -        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check
> for ARP/ND
> >> >> -         * responder in L3 networks. */
> >> >> -        if (is_vtep_port) {
> >> >> -            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
> ofpacts_p);
> >> >> -        }
> >> >> +    for (size_t i = 0; i < n_encap_ips; i++) {
> >> >> +        struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p);
> >> >>
> >> >> +
> >> >> +        match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i
> << 16,
> >> >> +                             (uint32_t) 0xFFFF << 16);
> >> >> +        struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> >> >> +                                                   chassis_tunnels,
> >> >> +                                                   encap_ips[i]);
> >> >> +        if (!ovs_list_is_empty(tuns)) {
> >> >> +            bool is_vtep_port = !strcmp(binding->type, "vtep");
> >> >> +            /* rewrite MFF_IN_PORT to bypass OpenFlow loopback
> check for ARP/ND
> >> >> +             * responder in L3 networks. */
> >> >> +            if (is_vtep_port) {
> >> >> +                put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
> >> >> +                         ofpacts_clone);
> >> >> +            }
> >> >>
> >> >> -        struct tunnel *tun;
> >> >> -        LIST_FOR_EACH (tun, list_node, tuns) {
> >> >> -            put_encapsulation(mff_ovn_geneve, tun->tun,
> >> >> -                              binding->datapath, port_key,
> is_vtep_port,
> >> >> -                              ofpacts_p);
> >> >> -            ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport;
> >> >> +            struct tunnel *tun;
> >> >> +            LIST_FOR_EACH (tun, list_node, tuns) {
> >> >> +                put_encapsulation(mff_ovn_geneve, tun->tun,
> >> >> +                                  binding->datapath, port_key,
> is_vtep_port,
> >> >> +                                  ofpacts_clone);
> >> >> +                ofpact_put_OUTPUT(ofpacts_clone)->port =
> tun->tun->ofport;
> >> >> +            }
> >> >> +            put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone);
> >> >> +            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
> >> >> +                            binding->header_.uuid.parts[0], match,
> >> >> +                            ofpacts_clone, &binding->header_.uuid);
> >> >>          }
> >> >> -        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
> >> >> -        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
> >> >> -                        binding->header_.uuid.parts[0], match,
> ofpacts_p,
> >> >> -                        &binding->header_.uuid);
> >> >> -    }
> >> >> -    struct tunnel *tun_elem;
> >> >> -    LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
> >> >> -        free(tun_elem);
> >> >> +        struct tunnel *tun_elem;
> >> >> +        LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
> >> >> +            free(tun_elem);
> >> >> +        }
> >> >> +        free(tuns);
> >> >> +
> >> >> +        ofpbuf_delete(ofpacts_clone);
> >> >>      }
> >> >> -    free(tuns);
> >> >>  }
> >> >>
> >> >>  static void
> >> >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct simap
> *ct_zones,
> >> >>          if (tag) {
> >> >>              ofpact_put_STRIP_VLAN(ofpacts_p);
> >> >>          }
> >> >> -        load_logical_ingress_metadata(localnet_port, &zone_ids,
> ofpacts_p);
> >> >> +        load_logical_ingress_metadata(localnet_port, &zone_ids, 0,
> NULL,
> >> >> +                                      ofpacts_p);
> >> >>          replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p);
> >> >>          replace_mac->mac = router_port_mac;
> >> >>
> >> >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids
> *zone_ids, struct ofpbuf *ofpacts_p)
> >> >>  {
> >> >>      if (zone_ids) {
> >> >>          if (zone_ids->ct) {
> >> >> -            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32,
> ofpacts_p);
> >> >> +            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16,
> ofpacts_p);
> >> >>          }
> >> >>          if (zone_ids->dnat) {
> >> >>              put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32,
> ofpacts_p);
> >> >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key,
> >> >>      }
> >> >>  }
> >> >>
> >> >> +static size_t
> >> >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips,
> >> >> +               const char *ip)
> >> >> +{
> >> >> +    for (size_t i = 0; i < n_encap_ips; i++) {
> >> >> +        if (!strcmp(ip, encap_ips[i])) {
> >> >> +            return i;
> >> >> +        }
> >> >> +    }
> >> >> +    return 0;
> >> >> +}
> >> >> +
> >> >>  static void
> >> >>  load_logical_ingress_metadata(const struct sbrec_port_binding
> *binding,
> >> >>                                const struct zone_ids *zone_ids,
> >> >> +                              size_t n_encap_ips,
> >> >> +                              const char **encap_ips,
> >> >>                                struct ofpbuf *ofpacts_p)
> >> >>  {
> >> >>      put_zones_ofpacts(zone_ids, ofpacts_p);
> >> >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct
> sbrec_port_binding *binding,
> >> >>      uint32_t port_key = binding->tunnel_key;
> >> >>      put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p);
> >> >>      put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p);
> >> >> +
> >> >> +    /* Default encap_id 0. */
> >> >> +    size_t encap_id = 0;
> >> >> +    if (encap_ips && binding->encap) {
> >> >> +        encap_id = encap_ip_to_id(n_encap_ips, encap_ips,
> >> >> +                                  binding->encap->ip);
> >> >> +    }
> >> >> +    put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p);
> >> >>  }
> >> >>
> >> >>  static const struct sbrec_port_binding *
> >> >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct
> sbrec_port_binding *binding,
> >> >>      match_set_dl_type(&match, htons(ETH_TYPE_RARP));
> >> >>      match_set_in_port(&match, ofport);
> >> >>
> >> >> -    load_logical_ingress_metadata(binding, zone_ids, &ofpacts);
> >> >> +    load_logical_ingress_metadata(binding, zone_ids, 0, NULL,
> &ofpacts);
> >> >>
> >> >>      encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP,
> >> >>                           NX_CTLR_NO_METER, &ofpacts);
> >> >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports(
> >> >>      }
> >> >>
> >> >>      struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
> >> >> -                                               chassis_tunnels);
> >> >> +                                               chassis_tunnels,
> NULL);
> >> >>      if (ovs_list_is_empty(tuns)) {
> >> >>          free(tuns);
> >> >>          return;
> >> >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >> >>                        const struct sbrec_chassis *chassis,
> >> >>                        const struct physical_debug *debug,
> >> >>                        const struct if_status_mgr *if_mgr,
> >> >> +                      size_t n_encap_ips,
> >> >> +                      const char **encap_ips,
> >> >>                        struct ovn_desired_flow_table *flow_table,
> >> >>                        struct ofpbuf *ofpacts_p)
> >> >>  {
> >> >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >> >>          ofpact_put_CT_CLEAR(ofpacts_p);
> >> >>          put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
> >> >>          put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
> >> >> -        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
> >> >> +        put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
> >> >>          struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
> >> >> -        load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
> >> >> +        load_logical_ingress_metadata(peer, &peer_zones,
> n_encap_ips,
> >> >> +                                      encap_ips, ofpacts_p);
> >> >>          put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
> >> >>          put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
> >> >>          for (int i = 0; i < MFF_N_LOG_REGS; i++) {
> >> >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >> >>           * as we're going to remove this with ofpbuf_pull() later.
> */
> >> >>          uint32_t ofpacts_orig_size = ofpacts_p->size;
> >> >>
> >> >> -        load_logical_ingress_metadata(binding, &zone_ids,
> ofpacts_p);
> >> >> +        load_logical_ingress_metadata(binding, &zone_ids,
> n_encap_ips,
> >> >> +                                      encap_ips, ofpacts_p);
> >> >>
> >> >>          if (!strcmp(binding->type, "localport")) {
> >> >>              /* mark the packet as incoming from a localport */
> >> >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >> >>      } else {
> >> >>          put_remote_port_redirect_overlay(
> >> >>              binding, mff_ovn_geneve, port_key, &match, ofpacts_p,
> >> >> -            chassis, chassis_tunnels, flow_table);
> >> >> +            chassis, chassis_tunnels, n_encap_ips, encap_ips,
> flow_table);
> >> >>      }
> >> >>  out:
> >> >>      if (ha_ch_ordered) {
> >> >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> >> >>
> >> >>          int zone_id = simap_get(ct_zones, port->logical_port);
> >> >>          if (zone_id) {
> >> >> -            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts);
> >> >> +            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts);
> >> >>          }
> >> >>
> >> >>          const char *lport_name = (port->parent_port &&
> *port->parent_port) ?
> >> >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct
> physical_ctx *p_ctx,
> >> >>                            p_ctx->patch_ofports,
> >> >>                            p_ctx->chassis_tunnels,
> >> >>                            pb, p_ctx->chassis, &p_ctx->debug,
> >> >> -                          p_ctx->if_mgr, flow_table, &ofpacts);
> >> >> +                          p_ctx->if_mgr,
> >> >> +                          p_ctx->n_encap_ips,
> >> >> +                          p_ctx->encap_ips,
> >> >> +                          flow_table, &ofpacts);
> >> >>      ofpbuf_uninit(&ofpacts);
> >> >>  }
> >> >>
> >> >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx,
> >> >>                                p_ctx->patch_ofports,
> >> >>                                p_ctx->chassis_tunnels, binding,
> >> >>                                p_ctx->chassis, &p_ctx->debug,
> >> >> -                              p_ctx->if_mgr, flow_table, &ofpacts);
> >> >> +                              p_ctx->if_mgr,
> >> >> +                              p_ctx->n_encap_ips,
> >> >> +                              p_ctx->encap_ips,
> >> >> +                              flow_table, &ofpacts);
> >> >>      }
> >> >>
> >> >>      /* Handle output to multicast groups, in tables 40 and 41. */
> >> >> diff --git a/controller/physical.h b/controller/physical.h
> >> >> index 1f1ed55efa16..7fe8ee3c18ed 100644
> >> >> --- a/controller/physical.h
> >> >> +++ b/controller/physical.h
> >> >> @@ -66,6 +66,8 @@ struct physical_ctx {
> >> >>      struct shash *local_bindings;
> >> >>      struct simap *patch_ofports;
> >> >>      struct hmap *chassis_tunnels;
> >> >> +    size_t n_encap_ips;
> >> >> +    const char **encap_ips;
> >> >>      struct physical_debug debug;
> >> >>  };
> >> >>
> >> >> diff --git a/include/ovn/logical-fields.h
> b/include/ovn/logical-fields.h
> >> >> index 272277ec4fa0..d3455a462a0d 100644
> >> >> --- a/include/ovn/logical-fields.h
> >> >> +++ b/include/ovn/logical-fields.h
> >> >> @@ -37,7 +37,9 @@ enum ovn_controller_event {
> >> >>  #define MFF_LOG_SNAT_ZONE  MFF_REG12  /* conntrack snat zone for
> gateway router
> >> >>                                         * (32 bits). */
> >> >>  #define MFF_LOG_CT_ZONE    MFF_REG13  /* Logical conntrack zone for
> lports
> >> >> -                                       * (32 bits). */
> >> >> +                                       * (0..15 of the 32 bits). */
> >> >> +#define MFF_LOG_ENCAP_ID   MFF_REG13  /* Encap ID for lports
> >> >> +                                       * (16..31 of the 32 bits). */
> >> >>  #define MFF_LOG_INPORT     MFF_REG14  /* Logical input port (32
> bits). */
> >> >>  #define MFF_LOG_OUTPORT    MFF_REG15  /* Logical output port (32
> bits). */
> >> >>
> >> >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
> >> >> index 96294fe2b795..bfd8680cedfc 100644
> >> >> --- a/ovn-architecture.7.xml
> >> >> +++ b/ovn-architecture.7.xml
> >> >> @@ -1247,8 +1247,8 @@
> >> >>        chassis.  This is initialized to 0 at the beginning of the
> logical
> >> >>          <!-- Keep the following in sync with MFF_LOG_CT_ZONE in
> >> >>               ovn/lib/logical-fields.h. -->
> >> >> -      ingress pipeline.  OVN stores this in Open vSwitch extension
> register
> >> >> -      number 13.
> >> >> +      ingress pipeline.  OVN stores this in the lower 16 bits of
> the Open
> >> >> +      vSwitch extension register number 13.
> >> >>      </dd>
> >> >>
> >> >>      <dt>conntrack zone fields for routers</dt>
> >> >> @@ -1263,6 +1263,20 @@
> >> >>        traffic (for SNATing) in Open vSwitch extension register
> number 12.
> >> >>      </dd>
> >> >>
> >> >> +    <dt>Encap ID for logical ports</dt>
> >> >> +    <dd>
> >> >> +      A field that records an ID that indicates which encapsulation
> IP should
> >> >> +      be used when sending packets to a remote chassis, according
> to the
> >> >> +      original input logical port. This is useful when there are
> multiple IPs
> >> >> +      available for encapsulation. The value only has local
> significance and is
> >> >> +      not meaningful between chassis. This is initialized to 0 at
> the beginning
> >> >> +      of the logical
> >> >> +        <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in
> >> >> +             ovn/lib/logical-fields.h. -->
> >> >> +      ingress pipeline.  OVN stores this in the higher 16 bits of
> the Open
> >> >> +      vSwitch extension register number 13.
> >> >> +    </dd>
> >> >> +
> >> >>      <dt>logical flow flags</dt>
> >> >>      <dd>
> >> >>        The logical flags are intended to handle keeping context
> between
> >> >> diff --git a/tests/ovn.at b/tests/ovn.at
> >> >> index 243fe0b8246c..e7f0c9681f60 100644
> >> >> --- a/tests/ovn.at
> >> >> +++ b/tests/ovn.at
> >> >> @@ -30335,19 +30335,33 @@ AT_CLEANUP
> >> >>
> >> >>
> >> >>  OVN_FOR_EACH_NORTHD([
> >> >> -AT_SETUP([multiple encap ips tunnel creation])
> >> >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip])
> >> >> +AT_SKIP_IF([test $HAVE_SCAPY = no])
> >> >>  ovn_start
> >> >>  net_add n1
> >> >>
> >> >> +ovn-nbctl ls-add ls1
> >> >> +
> >> >>  # 2 HVs, each with 2 encap-ips.
> >> >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY.
> >> >>  for i in 1 2; do
> >> >>      sim_add hv$i
> >> >>      as hv$i
> >> >>      ovs-vsctl add-br br-phys-$j
> >> >>      ovn_attach n1 br-phys-$j 192.168.0.${i}1
> >> >>      ovs-vsctl set open .
> external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2
> >> >> +
> >> >> +    for j in 1 2; do
> >> >> +        ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \
> >> >> +            external_ids:iface-id=lsp$i$j \
> >> >> +            external_ids:encap-ip=192.168.0.$i$j \
> >> >> +            options:tx_pcap=hv$i/vif$i$j-tx.pcap
> options:rxq_pcap=hv$i/vif$i$j-rx.pcap
> >> >> +        ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j
> "f0:00:00:00:00:$i$j 10.0.0.$i$j"
> >> >> +
> >> >> +    done
> >> >>  done
> >> >>
> >> >> +wait_for_ports_up
> >> >>  check ovn-nbctl --wait=hv sync
> >> >>
> >> >>  check_tunnel_port() {
> >> >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int
> hv1@192.168.0.12%192.168.0.21
> >> >>  check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22
> >> >>  check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22
> >> >>
> >> >> +vif_to_hv() {
> >> >> +    case $1 in dnl (
> >> >> +        vif[[1]]?) echo hv1 ;; dnl (
> >> >> +        vif[[2]]?) echo hv2 ;; dnl (
> >> >> +        *) AT_FAIL_IF([:]) ;;
> >> >> +    esac
> >> >> +}
> >> >> +
> >> >> +# check_packet_tunnel SRC_VIF DST_VIF
> >> >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the
> corresponding
> >> >> +# tunnel that matches the VIFs' encap_ip configurations.
> >> >> +check_packet_tunnel() {
> >> >> +    local src=$1 dst=$2
> >> >> +    local src_mac=f0:00:00:00:00:$src
> >> >> +    local dst_mac=f0:00:00:00:00:$dst
> >> >> +    local src_ip=10.0.0.$src
> >> >> +    local dst_ip=10.0.0.$dst
> >> >> +    local local_encap_ip=192.168.0.$src
> >> >> +    local remote_encap_ip=192.168.0.$dst
> >> >> +    local packet=$(fmt_pkt "Ether(dst='${dst_mac}',
> src='${src_mac}')/ \
> >> >> +                            IP(dst='${dst_ip}', src='${src_ip}')/ \
> >> >> +                            ICMP(type=8)")
> >> >> +    hv=`vif_to_hv vif$src`
> >> >> +    as $hv
> >> >> +    echo "vif$src -> vif$dst should go through tunnel
> $local_encap_ip -> $remote_encap_ip"
> >> >> +    tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface
> options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip)
> >> >> +    AT_CHECK([test $(ovs-appctl ofproto/trace br-int
> in_port=vif$src $packet | grep "output\:" | awk -F ':' '{ print $2 }') ==
> $tunnel_ofport])
> >> >> +}
> >> >> +
> >> >> +for i in 1 2; do
> >> >> +    for j in 1 2; do
> >> >> +        check_packet_tunnel 1$i 2$j
> >> >> +    done
> >> >> +done
> >> >> +
> >> >>  OVN_CLEANUP([hv1],[hv2])
> >> >>  AT_CLEANUP
> >> >>  ])
> >> >> --
> >> >> 2.38.1
> >> >>
> >> >> _______________________________________________
> >> >> dev mailing list
> >> >> dev@openvswitch.org
> >> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >> >>
> >> >
> >> > Thanks,
> >> > Ales
> >> >
> >> > --
> >> >
> >> > Ales Musil
> >> >
> >> > Senior Software Engineer - OVN Core
> >> >
> >> > Red Hat EMEA
> >> >
> >> > amusil@redhat.com
> >
> >
> >
> > --
> >
> > Ales Musil
> >
> > Senior Software Engineer - OVN Core
> >
> > Red Hat EMEA
> >
> > amusil@redhat.com
>
Han Zhou Jan. 30, 2024, 5:40 a.m. UTC | #6
On Mon, Jan 29, 2024 at 2:41 AM Ales Musil <amusil@redhat.com> wrote:
>
>
>
> On Fri, Jan 26, 2024 at 8:05 PM Han Zhou <hzhou@ovn.org> wrote:
>>
>>
>>
>> On Thu, Jan 25, 2024 at 10:54 PM Ales Musil <amusil@redhat.com> wrote:
>> >
>> >
>> >
>> > On Fri, Jan 26, 2024 at 4:07 AM Han Zhou <hzhou@ovn.org> wrote:
>> >>
>> >>
>> >>
>> >> On Tue, Jan 23, 2024 at 5:29 AM Ales Musil <amusil@redhat.com> wrote:
>> >> >
>> >> >
>> >> >
>> >> > On Wed, Jan 17, 2024 at 6:48 AM Han Zhou <hzhou@ovn.org> wrote:
>> >> >>
>> >> >> Commit dd527a283cd8 partially supported multiple encap IPs. It
supported
>> >> >> remote encap IP selection based on the destination VIF's encap_ip
>> >> >> configuration. This patch adds the support for selecting local
encap IP
>> >> >> based on the source VIF's encap_ip configuration.
>> >> >>
>> >> >> Co-authored-by: Lei Huang <leih@nvidia.com>
>> >> >> Signed-off-by: Lei Huang <leih@nvidia.com>
>> >> >> Signed-off-by: Han Zhou <hzhou@ovn.org>
>> >> >> ---
>> >> >
>> >> >
>> >> > Hi Han and Lei,
>> >> >
>> >> > thank you for the patch, I have a couple of comments/questions down
below.
>> >>
>> >>
>> >> Thanks Ales.
>> >>
>> >> >
>> >> >
>> >> >>  NEWS                            |   3 +
>> >> >>  controller/chassis.c            |   2 +-
>> >> >>  controller/local_data.c         |   2 +-
>> >> >>  controller/local_data.h         |   2 +-
>> >> >>  controller/ovn-controller.8.xml |  30 ++++++-
>> >> >>  controller/ovn-controller.c     |  49 ++++++++++++
>> >> >>  controller/physical.c           | 134
++++++++++++++++++++++----------
>> >> >>  controller/physical.h           |   2 +
>> >> >>  include/ovn/logical-fields.h    |   4 +-
>> >> >>  ovn-architecture.7.xml          |  18 ++++-
>> >> >>  tests/ovn.at                    |  51 +++++++++++-
>> >> >>  11 files changed, 243 insertions(+), 54 deletions(-)
>> >> >>
>> >> >> diff --git a/NEWS b/NEWS
>> >> >> index 5f267b4c64cc..5a3eed608617 100644
>> >> >> --- a/NEWS
>> >> >> +++ b/NEWS
>> >> >> @@ -14,6 +14,9 @@ Post v23.09.0
>> >> >>    - ovn-northd-ddlog has been removed.
>> >> >>    - A new LSP option "enable_router_port_acl" has been added to
enable
>> >> >>      conntrack for the router port whose peer is l3dgw_port if set
it true.
>> >> >> +  - Support selecting encapsulation IP based on the
source/destination VIF's
> >> >> +    setting. See ovn-controller(8) 'external_ids:ovn-encap-ip'
for more
>> >> >> +    details.
>> >> >>
>> >> >>  OVN v23.09.0 - 15 Sep 2023
>> >> >>  --------------------------
>> >> >> diff --git a/controller/chassis.c b/controller/chassis.c
>> >> >> index a6f13ccc42d5..55f2beb37674 100644
>> >> >> --- a/controller/chassis.c
>> >> >> +++ b/controller/chassis.c
>> >> >> @@ -61,7 +61,7 @@ struct ovs_chassis_cfg {
>> >> >>
>> >> >>      /* Set of encap types parsed from the 'ovn-encap-type'
external-id. */
>> >> >>      struct sset encap_type_set;
>> >> >> -    /* Set of encap IPs parsed from the 'ovn-encap-type'
external-id. */
>> >> >> +    /* Set of encap IPs parsed from the 'ovn-encap-ip'
external-id. */
>> >> >>      struct sset encap_ip_set;
>> >> >>      /* Interface type list formatted in the OVN-SB Chassis
required format. */
>> >> >>      struct ds iface_types;
>> >> >> diff --git a/controller/local_data.c b/controller/local_data.c
>> >> >> index a9092783958f..8606414f8728 100644
>> >> >> --- a/controller/local_data.c
>> >> >> +++ b/controller/local_data.c
>> >> >> @@ -514,7 +514,7 @@ chassis_tunnels_destroy(struct hmap
*chassis_tunnels)
>> >> >>   */
>> >> >>  struct chassis_tunnel *
>> >> >>  chassis_tunnel_find(const struct hmap *chassis_tunnels, const
char *chassis_id,
>> >> >> -                    char *remote_encap_ip, char *local_encap_ip)
>> >> >> +                    char *remote_encap_ip, const char
*local_encap_ip)
>> >> >
>> >> >
>> >> > nit: Unrelated change.
>> >>
>> >>
>> >> Ack
>>
>> Hi Ales, sorry that I just realized this change is related, which is
because of the const char * array introduced in this patch that stores the
parsed encap_ips, and it makes sense to use const char * since we should
never expect this string to be modified in the function.
>>
>> >>
>> >> >
>> >> >
>> >> >>  {
>> >> >>      /*
>> >> >>       * If the specific encap_ip is given, look for the
chassisid_ip entry,
>> >> >> diff --git a/controller/local_data.h b/controller/local_data.h
>> >> >> index bab95bcc3824..ca3905bd20e6 100644
>> >> >> --- a/controller/local_data.h
>> >> >> +++ b/controller/local_data.h
>> >> >> @@ -150,7 +150,7 @@ bool
local_nonvif_data_handle_ovs_iface_changes(
>> >> >>  struct chassis_tunnel *chassis_tunnel_find(const struct hmap
*chassis_tunnels,
>> >> >>                                             const char *chassis_id,
>> >> >>                                             char *remote_encap_ip,
>> >> >> -                                           char *local_encap_ip);
>> >> >> +                                           const char
*local_encap_ip);
>> >> >
>> >> >
>> >> > Same as above.
>> >>
>> >>
>> >> Ack
>> >>
>> >> >
>> >> >
>> >> >>
>> >> >>  bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels,
>> >> >>                                 const char *chassis_name,
>> >> >> diff --git a/controller/ovn-controller.8.xml
b/controller/ovn-controller.8.xml
>> >> >> index efa65e3fd927..5ebef048d721 100644
>> >> >> --- a/controller/ovn-controller.8.xml
>> >> >> +++ b/controller/ovn-controller.8.xml
>> >> >> @@ -176,10 +176,32 @@
>> >> >>
>> >> >>        <dt><code>external_ids:ovn-encap-ip</code></dt>
>> >> >>        <dd>
>> >> >> -        The IP address that a chassis should use to connect to
this node
>> >> >> -        using encapsulation types specified by
>> >> >> -        <code>external_ids:ovn-encap-type</code>. Multiple
encapsulation IPs
>> >> >> -        may be specified with a comma-separated list.
>> >> >> +        <p>
>> >> >> +          The IP address that a chassis should use to connect to
this node
>> >> >> +          using encapsulation types specified by
>> >> >> +          <code>external_ids:ovn-encap-type</code>. Multiple
encapsulation IPs
>> >> >> +          may be specified with a comma-separated list.
>> >> >> +        </p>
>> >> >> +        <p>
>> >> >> +          In scenarios where multiple encapsulation IPs are
present, distinct
>> >> >> +          tunnels are established for each remote chassis. These
tunnels are
>> >> >> +          differentiated by setting unique
<code>options:local_ip</code> and
>> >> >> +          <code>options:remote_ip</code> values in the tunnel
interface. When
>> >> >> +          transmitting a packet to a remote chassis, the
selection of local_ip
>> >> >> +          is guided by the
<code>Interface:external_ids:encap-ip</code> from
>> >> >> +          the local OVSDB, corresponding to the VIF originating
the packet, if
>> >> >> +          specified. The
<code>Interface:external_ids:encap-ip</code> setting
>> >> >> +          of the VIF is also populated to the
<code>Port_Binding</code>
>> >> >> +          table in the OVN SB database via the <code>encap</code>
column.
>> >> >> +          Consequently, when a remote chassis needs to send a
packet to a
>> >> >> +          port-binding associated with this VIF, it utilizes the
tunnel with
>> >> >> +          the appropriate <code>options:remote_ip</code> that
matches the
>> >> >> +          <code>ip</code> in <code>Port_Binding:encap</code>.
This mechanism
>> >> >> +          is particularly beneficial for chassis with multiple
physical
>> >> >> +          interfaces designated for tunneling, where each
interface is
>> >> >> +          optimized for handling specific traffic associated with
particular
>> >> >> +          VIFs.
>> >> >> +        </p>
>> >> >>        </dd>
>> >> >>
>> >> >>        <dt><code>external_ids:ovn-encap-df_default</code></dt>
>> >> >> diff --git a/controller/ovn-controller.c
b/controller/ovn-controller.c
>> >> >> index 856e5e270822..4ab57fe970fe 100644
>> >> >> --- a/controller/ovn-controller.c
>> >> >> +++ b/controller/ovn-controller.c
>> >> >> @@ -4521,6 +4521,31 @@ struct ed_type_pflow_output {
>> >> >>      struct physical_debug debug;
>> >> >>  };
>> >> >>
>> >> >> +static void
>> >> >> +parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table,
>> >> >> +                size_t *n_encap_ips, const char * **encap_ips)
>> >> >> +{
>> >> >> +    const struct ovsrec_open_vswitch *cfg =
>> >> >> +        ovsrec_open_vswitch_table_first(ovs_table);
>> >> >> +    const char *chassis_id = get_ovs_chassis_id(ovs_table);
>> >> >> +    const char *encap_ips_str =
>> >> >> +        get_chassis_external_id_value(&cfg->external_ids,
chassis_id,
>> >> >> +                                      "ovn-encap-ip", NULL);
>> >> >> +    struct sset encap_ip_set;
>> >> >> +    sset_init(&encap_ip_set);
>> >> >> +    sset_from_delimited_string(&encap_ip_set, encap_ips_str,
 ",");
>> >> >> +
>> >> >> +    /* Sort the ips so that their index is deterministic. */
>> >> >> +    *encap_ips = sset_sort(&encap_ip_set);
>> >> >> +
>> >> >> +    /* Copy the strings so that we can destroy the sset. */
>> >> >> +    for (size_t i = 0; (*encap_ips)[i]; i++) {
>> >> >> +        (*encap_ips)[i] = xstrdup((*encap_ips)[i]);
>> >> >> +    }
>> >> >> +    *n_encap_ips = sset_count(&encap_ip_set);
>> >> >> +    sset_destroy(&encap_ip_set);
>> >> >> +}
>> >> >> +
>> >> >
>> >> >
>> >> > I wonder, could we store this array or maybe preferably svec in
"en_non_vif_data" I-P node? This way it would be handled in the node and we
could avoid the destroy calls after any pflow processing WDYT?
>> >>
>> >>
>> >> Yes we could do the same in en_non_vif_data, but I think it is not
really necessary and it seems more straightforward just parsing it here,
because:
>> >> 1. We don't need I-P for this ovn-encap-ip configuration change, so
we don't have to persist this data.
>> >> 2. It should be very rare to have more than 5 (or even 3) encap IPs
per node, so the list should be very small and the cost of this parsing
(and sset construct/destroy) is negligible.
>> >> Using svec is also a valid option, but the sset (and array) is used
here just to reuse the sset_from_delimited_string and sset_sort for
convenience. I noticed that the sset_init() call can in fact be removed
because sset_from_delimited_string already does that. I can remove it.
>> >> Does this sound reasonable to you?
>> >
>> >
>> > It makes sense, the main thing it would help with is the need to
destroy the ctx in kinda unexpected places which might be slightly error
prone. However, it being functionally the same it's fine either way.
>> >
>>
>> Thanks Ales. So I will add the below small change. Would you give your
Ack if it looks good to you?
>> -----------------------
>> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
>> index 4ab57fe970fe..6873c02288c6 100644
>> --- a/controller/ovn-controller.c
>> +++ b/controller/ovn-controller.c
>> @@ -4532,7 +4532,6 @@ parse_encap_ips(const struct
ovsrec_open_vswitch_table *ovs_table,
>>          get_chassis_external_id_value(&cfg->external_ids, chassis_id,
>>                                        "ovn-encap-ip", NULL);
>>      struct sset encap_ip_set;
>> -    sset_init(&encap_ip_set);
>>      sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");
>>
>>      /* Sort the ips so that their index is deterministic. */
>> @@ -4615,8 +4614,6 @@ static void init_physical_ctx(struct engine_node
*node,
>>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
>>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
>>
>> -
>> -
>>      struct controller_engine_ctx *ctrl_ctx =
engine_get_context()->client_ctx;
>>      p_ctx->if_mgr = ctrl_ctx->if_mgr;
>> -------------------------
>>
>> Thanks,
>> Han
>
>
>
> Yeah the diff looks ok.
>
> Acked-by: Ales Musil <amusil@redhat.com>
>
> Thanks,
> Ales
>

Thanks Ales, I applied the series with the above diff.

Han

>>
>>
>> > Thanks,
>> > Ales
>> >
>> >>
>> >>
>> >> Thanks,
>> >> Han
>> >>
>> >> >
>> >> >>
>> >> >>  static void init_physical_ctx(struct engine_node *node,
>> >> >>                                struct ed_type_runtime_data
*rt_data,
>> >> >>                                struct ed_type_non_vif_data
*non_vif_data,
>> >> >> @@ -4572,6 +4597,7 @@ static void init_physical_ctx(struct
engine_node *node,
>> >> >>          engine_get_input_data("ct_zones", node);
>> >> >>      struct simap *ct_zones = &ct_zones_data->current;
>> >> >>
>> >> >> +    parse_encap_ips(ovs_table, &p_ctx->n_encap_ips,
&p_ctx->encap_ips);
>> >> >>      p_ctx->sbrec_port_binding_by_name =
sbrec_port_binding_by_name;
>> >> >>      p_ctx->sbrec_port_binding_by_datapath =
sbrec_port_binding_by_datapath;
>> >> >>      p_ctx->port_binding_table = port_binding_table;
>> >> >> @@ -4589,12 +4615,23 @@ static void init_physical_ctx(struct
engine_node *node,
>> >> >>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
>> >> >>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
>> >> >>
>> >> >> +
>> >> >> +
>> >> >>
>> >> >
>> >> > nit: Unrelated change.
>> >>
>> >>
>> >> Ack
>> >> >
>> >> >
>> >> >>
>> >> >>      struct controller_engine_ctx *ctrl_ctx =
engine_get_context()->client_ctx;
>> >> >>      p_ctx->if_mgr = ctrl_ctx->if_mgr;
>> >> >>
>> >> >>      pflow_output_get_debug(node, &p_ctx->debug);
>> >> >>  }
>> >> >>
>> >> >> +static void
>> >> >> +destroy_physical_ctx(struct physical_ctx *p_ctx)
>> >> >> +{
>> >> >> +    for (size_t i = 0; i < p_ctx->n_encap_ips; i++) {
>> >> >> +        free((char *)(p_ctx->encap_ips[i]));
>> >> >> +    }
>> >> >> +    free(p_ctx->encap_ips);
>> >> >> +}
>> >> >> +
>> >> >>  static void *
>> >> >>  en_pflow_output_init(struct engine_node *node OVS_UNUSED,
>> >> >>                               struct engine_arg *arg OVS_UNUSED)
>> >> >> @@ -4631,6 +4668,7 @@ en_pflow_output_run(struct engine_node
*node, void *data)
>> >> >>      struct physical_ctx p_ctx;
>> >> >>      init_physical_ctx(node, rt_data, non_vif_data, &p_ctx);
>> >> >>      physical_run(&p_ctx, pflow_table);
>> >> >> +    destroy_physical_ctx(&p_ctx);
>> >> >>
>> >> >>      engine_set_node_state(node, EN_UPDATED);
>> >> >>  }
>> >> >> @@ -4675,6 +4713,7 @@ pflow_output_if_status_mgr_handler(struct
engine_node *node,
>> >> >>                  bool removed =
sbrec_port_binding_is_deleted(binding);
>> >> >>                  if (!physical_handle_flows_for_lport(binding,
removed, &p_ctx,
>> >> >>
&pfo->flow_table)) {
>> >> >> +                    destroy_physical_ctx(&p_ctx);
>> >> >>                      return false;
>> >> >>                  }
>> >> >>              }
>> >> >> @@ -4684,11 +4723,13 @@ pflow_output_if_status_mgr_handler(struct
engine_node *node,
>> >> >>              bool removed = sbrec_port_binding_is_deleted(pb);
>> >> >>              if (!physical_handle_flows_for_lport(pb, removed,
&p_ctx,
>> >> >>
&pfo->flow_table)) {
>> >> >> +                destroy_physical_ctx(&p_ctx);
>> >> >>                  return false;
>> >> >>              }
>> >> >>          }
>> >> >>          engine_set_node_state(node, EN_UPDATED);
>> >> >>      }
>> >> >> +    destroy_physical_ctx(&p_ctx);
>> >> >>      return true;
>> >> >>  }
>> >> >>
>> >> >> @@ -4715,11 +4756,13 @@
pflow_output_sb_port_binding_handler(struct engine_node *node,
>> >> >>          bool removed = sbrec_port_binding_is_deleted(pb);
>> >> >>          if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
>> >> >>                                               &pfo->flow_table)) {
>> >> >> +            destroy_physical_ctx(&p_ctx);
>> >> >>              return false;
>> >> >>          }
>> >> >>      }
>> >> >>
>> >> >>      engine_set_node_state(node, EN_UPDATED);
>> >> >> +    destroy_physical_ctx(&p_ctx);
>> >> >>      return true;
>> >> >>  }
>> >> >>
>> >> >> @@ -4739,6 +4782,7 @@
pflow_output_sb_multicast_group_handler(struct engine_node *node, void
*data)
>> >> >>      physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table);
>> >> >>
>> >> >>      engine_set_node_state(node, EN_UPDATED);
>> >> >> +    destroy_physical_ctx(&p_ctx);
>> >> >>      return true;
>> >> >>  }
>> >> >>
>> >> >> @@ -4771,6 +4815,7 @@ pflow_output_runtime_data_handler(struct
engine_node *node, void *data)
>> >> >>          if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) {
>> >> >>              /* Fall back to full recompute when a local datapath
>> >> >>               * is added or deleted. */
>> >> >> +            destroy_physical_ctx(&p_ctx);
>> >> >>              return false;
>> >> >>          }
>> >> >>
>> >> >> @@ -4781,12 +4826,14 @@ pflow_output_runtime_data_handler(struct
engine_node *node, void *data)
>> >> >>                  lport->tracked_type == TRACKED_RESOURCE_REMOVED ?
true: false;
>> >> >>              if (!physical_handle_flows_for_lport(lport->pb,
removed, &p_ctx,
>> >> >>
&pfo->flow_table)) {
>> >> >> +                destroy_physical_ctx(&p_ctx);
>> >> >>                  return false;
>> >> >>              }
>> >> >>          }
>> >> >>      }
>> >> >>
>> >> >>      engine_set_node_state(node, EN_UPDATED);
>> >> >> +    destroy_physical_ctx(&p_ctx);
>> >> >>      return true;
>> >> >>  }
>> >> >>
>> >> >> @@ -4843,12 +4890,14 @@
pflow_output_activated_ports_handler(struct engine_node *node, void *data)
>> >> >>          if (pb) {
>> >> >>              if (!physical_handle_flows_for_lport(pb, false,
&p_ctx,
>> >> >>
&pfo->flow_table)) {
>> >> >> +                destroy_physical_ctx(&p_ctx);
>> >> >>                  return false;
>> >> >>              }
>> >> >>              tag_port_as_activated_in_engine(pp);
>> >> >>          }
>> >> >>      }
>> >> >>      engine_set_node_state(node, EN_UPDATED);
>> >> >> +    destroy_physical_ctx(&p_ctx);
>> >> >>      return true;
>> >> >>  }
>> >> >>
>> >> >> diff --git a/controller/physical.c b/controller/physical.c
>> >> >> index e93bfbd2cffb..c192aed751d5 100644
>> >> >> --- a/controller/physical.c
>> >> >> +++ b/controller/physical.c
>> >> >> @@ -71,6 +71,8 @@ struct tunnel {
>> >> >>  static void
>> >> >>  load_logical_ingress_metadata(const struct sbrec_port_binding
*binding,
>> >> >>                                const struct zone_ids *zone_ids,
>> >> >> +                              size_t n_encap_ips,
>> >> >> +                              const char **encap_ips,
>> >> >>                                struct ofpbuf *ofpacts_p);
>> >> >>  static int64_t get_vxlan_port_key(int64_t port_key);
>> >> >>
>> >> >> @@ -138,14 +140,14 @@ put_resubmit(uint8_t table_id, struct ofpbuf
*ofpacts)
>> >> >>   */
>> >> >>  static struct chassis_tunnel *
>> >> >>  get_port_binding_tun(const struct sbrec_encap *remote_encap,
>> >> >> -                     const struct sbrec_encap *local_encap,
>> >> >>                       const struct sbrec_chassis *chassis,
>> >> >> -                     const struct hmap *chassis_tunnels)
>> >> >> +                     const struct hmap *chassis_tunnels,
>> >> >> +                     const char *local_encap_ip)
>> >> >>  {
>> >> >>      struct chassis_tunnel *tun = NULL;
>> >> >>      tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
>> >> >>                                remote_encap ? remote_encap->ip :
NULL,
>> >> >> -                              local_encap ? local_encap->ip :
NULL);
>> >> >> +                              local_encap_ip);
>> >> >>
>> >> >>      if (!tun) {
>> >> >>          tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
NULL, NULL);
>> >> >> @@ -329,7 +331,8 @@ find_additional_encap_for_chassis(const struct
sbrec_port_binding *pb,
>> >> >>  static struct ovs_list *
>> >> >>  get_remote_tunnels(const struct sbrec_port_binding *binding,
>> >> >>                     const struct sbrec_chassis *chassis,
>> >> >> -                   const struct hmap *chassis_tunnels)
>> >> >> +                   const struct hmap *chassis_tunnels,
>> >> >> +                   const char *local_encap_ip)
>> >> >>  {
>> >> >>      const struct chassis_tunnel *tun;
>> >> >>
>> >> >> @@ -337,8 +340,8 @@ get_remote_tunnels(const struct
sbrec_port_binding *binding,
>> >> >>      ovs_list_init(tunnels);
>> >> >>
>> >> >>      if (binding->chassis && binding->chassis != chassis) {
>> >> >> -        tun = get_port_binding_tun(binding->encap, NULL,
binding->chassis,
>> >> >> -                                   chassis_tunnels);
>> >> >> +        tun = get_port_binding_tun(binding->encap,
binding->chassis,
>> >> >> +                chassis_tunnels, local_encap_ip);
>> >> >>          if (!tun) {
>> >> >>              static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
>> >> >>              VLOG_WARN_RL(
>> >> >> @@ -358,9 +361,9 @@ get_remote_tunnels(const struct
sbrec_port_binding *binding,
>> >> >>          }
>> >> >>          const struct sbrec_encap *additional_encap;
>> >> >>          additional_encap =
find_additional_encap_for_chassis(binding, chassis);
>> >> >> -        tun = get_port_binding_tun(additional_encap, NULL,
>> >> >> +        tun = get_port_binding_tun(additional_encap,
>> >> >>                                     binding->additional_chassis[i],
>> >> >> -                                   chassis_tunnels);
>> >> >> +                                   chassis_tunnels,
local_encap_ip);
>> >> >>          if (!tun) {
>> >> >>              static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 1);
>> >> >>              VLOG_WARN_RL(
>> >> >> @@ -384,36 +387,48 @@ put_remote_port_redirect_overlay(const
struct sbrec_port_binding *binding,
>> >> >>                                   struct ofpbuf *ofpacts_p,
>> >> >>                                   const struct sbrec_chassis
*chassis,
>> >> >>                                   const struct hmap
*chassis_tunnels,
>> >> >> +                                 size_t n_encap_ips,
>> >> >> +                                 const char **encap_ips,
>> >> >>                                   struct ovn_desired_flow_table
*flow_table)
>> >> >>  {
>> >> >>      /* Setup encapsulation */
>> >> >> -    struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
>> >> >> -                                               chassis_tunnels);
>> >> >> -    if (!ovs_list_is_empty(tuns)) {
>> >> >> -        bool is_vtep_port = !strcmp(binding->type, "vtep");
>> >> >> -        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check
for ARP/ND
>> >> >> -         * responder in L3 networks. */
>> >> >> -        if (is_vtep_port) {
>> >> >> -            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
ofpacts_p);
>> >> >> -        }
>> >> >> +    for (size_t i = 0; i < n_encap_ips; i++) {
>> >> >> +        struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p);
>> >> >>
>> >> >> +
>> >> >> +        match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0,
i << 16,
>> >> >> +                             (uint32_t) 0xFFFF << 16);
>> >> >> +        struct ovs_list *tuns = get_remote_tunnels(binding,
chassis,
>> >> >> +
chassis_tunnels,
>> >> >> +                                                   encap_ips[i]);
>> >> >> +        if (!ovs_list_is_empty(tuns)) {
>> >> >> +            bool is_vtep_port = !strcmp(binding->type, "vtep");
>> >> >> +            /* rewrite MFF_IN_PORT to bypass OpenFlow loopback
check for ARP/ND
>> >> >> +             * responder in L3 networks. */
>> >> >> +            if (is_vtep_port) {
>> >> >> +                put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0,
16,
>> >> >> +                         ofpacts_clone);
>> >> >> +            }
>> >> >>
>> >> >> -        struct tunnel *tun;
>> >> >> -        LIST_FOR_EACH (tun, list_node, tuns) {
>> >> >> -            put_encapsulation(mff_ovn_geneve, tun->tun,
>> >> >> -                              binding->datapath, port_key,
is_vtep_port,
>> >> >> -                              ofpacts_p);
>> >> >> -            ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport;
>> >> >> +            struct tunnel *tun;
>> >> >> +            LIST_FOR_EACH (tun, list_node, tuns) {
>> >> >> +                put_encapsulation(mff_ovn_geneve, tun->tun,
>> >> >> +                                  binding->datapath, port_key,
is_vtep_port,
>> >> >> +                                  ofpacts_clone);
>> >> >> +                ofpact_put_OUTPUT(ofpacts_clone)->port =
tun->tun->ofport;
>> >> >> +            }
>> >> >> +            put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone);
>> >> >> +            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT,
100,
>> >> >> +                            binding->header_.uuid.parts[0], match,
>> >> >> +                            ofpacts_clone,
&binding->header_.uuid);
>> >> >>          }
>> >> >> -        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
>> >> >> -        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
>> >> >> -                        binding->header_.uuid.parts[0], match,
ofpacts_p,
>> >> >> -                        &binding->header_.uuid);
>> >> >> -    }
>> >> >> -    struct tunnel *tun_elem;
>> >> >> -    LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
>> >> >> -        free(tun_elem);
>> >> >> +        struct tunnel *tun_elem;
>> >> >> +        LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
>> >> >> +            free(tun_elem);
>> >> >> +        }
>> >> >> +        free(tuns);
>> >> >> +
>> >> >> +        ofpbuf_delete(ofpacts_clone);
>> >> >>      }
>> >> >> -    free(tuns);
>> >> >>  }
>> >> >>
>> >> >>  static void
>> >> >> @@ -673,7 +688,8 @@ put_replace_chassis_mac_flows(const struct
simap *ct_zones,
>> >> >>          if (tag) {
>> >> >>              ofpact_put_STRIP_VLAN(ofpacts_p);
>> >> >>          }
>> >> >> -        load_logical_ingress_metadata(localnet_port, &zone_ids,
ofpacts_p);
>> >> >> +        load_logical_ingress_metadata(localnet_port, &zone_ids,
0, NULL,
>> >> >> +                                      ofpacts_p);
>> >> >>          replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p);
>> >> >>          replace_mac->mac = router_port_mac;
>> >> >>
>> >> >> @@ -853,7 +869,7 @@ put_zones_ofpacts(const struct zone_ids
*zone_ids, struct ofpbuf *ofpacts_p)
>> >> >>  {
>> >> >>      if (zone_ids) {
>> >> >>          if (zone_ids->ct) {
>> >> >> -            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32,
ofpacts_p);
>> >> >> +            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16,
ofpacts_p);
>> >> >>          }
>> >> >>          if (zone_ids->dnat) {
>> >> >>              put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32,
ofpacts_p);
>> >> >> @@ -1014,9 +1030,23 @@ put_local_common_flows(uint32_t dp_key,
>> >> >>      }
>> >> >>  }
>> >> >>
>> >> >> +static size_t
>> >> >> +encap_ip_to_id(size_t n_encap_ips, const char **encap_ips,
>> >> >> +               const char *ip)
>> >> >> +{
>> >> >> +    for (size_t i = 0; i < n_encap_ips; i++) {
>> >> >> +        if (!strcmp(ip, encap_ips[i])) {
>> >> >> +            return i;
>> >> >> +        }
>> >> >> +    }
>> >> >> +    return 0;
>> >> >> +}
>> >> >> +
>> >> >>  static void
>> >> >>  load_logical_ingress_metadata(const struct sbrec_port_binding
*binding,
>> >> >>                                const struct zone_ids *zone_ids,
>> >> >> +                              size_t n_encap_ips,
>> >> >> +                              const char **encap_ips,
>> >> >>                                struct ofpbuf *ofpacts_p)
>> >> >>  {
>> >> >>      put_zones_ofpacts(zone_ids, ofpacts_p);
>> >> >> @@ -1026,6 +1056,14 @@ load_logical_ingress_metadata(const struct
sbrec_port_binding *binding,
>> >> >>      uint32_t port_key = binding->tunnel_key;
>> >> >>      put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p);
>> >> >>      put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p);
>> >> >> +
>> >> >> +    /* Default encap_id 0. */
>> >> >> +    size_t encap_id = 0;
>> >> >> +    if (encap_ips && binding->encap) {
>> >> >> +        encap_id = encap_ip_to_id(n_encap_ips, encap_ips,
>> >> >> +                                  binding->encap->ip);
>> >> >> +    }
>> >> >> +    put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p);
>> >> >>  }
>> >> >>
>> >> >>  static const struct sbrec_port_binding *
>> >> >> @@ -1070,7 +1108,7 @@ setup_rarp_activation_strategy(const struct
sbrec_port_binding *binding,
>> >> >>      match_set_dl_type(&match, htons(ETH_TYPE_RARP));
>> >> >>      match_set_in_port(&match, ofport);
>> >> >>
>> >> >> -    load_logical_ingress_metadata(binding, zone_ids, &ofpacts);
>> >> >> +    load_logical_ingress_metadata(binding, zone_ids, 0, NULL,
&ofpacts);
>> >> >>
>> >> >>      encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP,
>> >> >>                           NX_CTLR_NO_METER, &ofpacts);
>> >> >> @@ -1384,7 +1422,7 @@ enforce_tunneling_for_multichassis_ports(
>> >> >>      }
>> >> >>
>> >> >>      struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
>> >> >> -                                               chassis_tunnels);
>> >> >> +                                               chassis_tunnels,
NULL);
>> >> >>      if (ovs_list_is_empty(tuns)) {
>> >> >>          free(tuns);
>> >> >>          return;
>> >> >> @@ -1446,6 +1484,8 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >> >>                        const struct sbrec_chassis *chassis,
>> >> >>                        const struct physical_debug *debug,
>> >> >>                        const struct if_status_mgr *if_mgr,
>> >> >> +                      size_t n_encap_ips,
>> >> >> +                      const char **encap_ips,
>> >> >>                        struct ovn_desired_flow_table *flow_table,
>> >> >>                        struct ofpbuf *ofpacts_p)
>> >> >>  {
>> >> >> @@ -1479,9 +1519,10 @@ consider_port_binding(struct
ovsdb_idl_index *sbrec_port_binding_by_name,
>> >> >>          ofpact_put_CT_CLEAR(ofpacts_p);
>> >> >>          put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
>> >> >>          put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
>> >> >> -        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
>> >> >> +        put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
>> >> >>          struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
>> >> >> -        load_logical_ingress_metadata(peer, &peer_zones,
ofpacts_p);
>> >> >> +        load_logical_ingress_metadata(peer, &peer_zones,
n_encap_ips,
>> >> >> +                                      encap_ips, ofpacts_p);
>> >> >>          put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
>> >> >>          put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
>> >> >>          for (int i = 0; i < MFF_N_LOG_REGS; i++) {
>> >> >> @@ -1697,7 +1738,8 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >> >>           * as we're going to remove this with ofpbuf_pull()
later. */
>> >> >>          uint32_t ofpacts_orig_size = ofpacts_p->size;
>> >> >>
>> >> >> -        load_logical_ingress_metadata(binding, &zone_ids,
ofpacts_p);
>> >> >> +        load_logical_ingress_metadata(binding, &zone_ids,
n_encap_ips,
>> >> >> +                                      encap_ips, ofpacts_p);
>> >> >>
>> >> >>          if (!strcmp(binding->type, "localport")) {
>> >> >>              /* mark the packet as incoming from a localport */
>> >> >> @@ -1904,7 +1946,7 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >> >>      } else {
>> >> >>          put_remote_port_redirect_overlay(
>> >> >>              binding, mff_ovn_geneve, port_key, &match, ofpacts_p,
>> >> >> -            chassis, chassis_tunnels, flow_table);
>> >> >> +            chassis, chassis_tunnels, n_encap_ips, encap_ips,
flow_table);
>> >> >>      }
>> >> >>  out:
>> >> >>      if (ha_ch_ordered) {
>> >> >> @@ -2102,7 +2144,7 @@ consider_mc_group(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
>> >> >>
>> >> >>          int zone_id = simap_get(ct_zones, port->logical_port);
>> >> >>          if (zone_id) {
>> >> >> -            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts);
>> >> >> +            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts);
>> >> >>          }
>> >> >>
>> >> >>          const char *lport_name = (port->parent_port &&
*port->parent_port) ?
>> >> >> @@ -2278,7 +2320,10 @@ physical_eval_port_binding(struct
physical_ctx *p_ctx,
>> >> >>                            p_ctx->patch_ofports,
>> >> >>                            p_ctx->chassis_tunnels,
>> >> >>                            pb, p_ctx->chassis, &p_ctx->debug,
>> >> >> -                          p_ctx->if_mgr, flow_table, &ofpacts);
>> >> >> +                          p_ctx->if_mgr,
>> >> >> +                          p_ctx->n_encap_ips,
>> >> >> +                          p_ctx->encap_ips,
>> >> >> +                          flow_table, &ofpacts);
>> >> >>      ofpbuf_uninit(&ofpacts);
>> >> >>  }
>> >> >>
>> >> >> @@ -2402,7 +2447,10 @@ physical_run(struct physical_ctx *p_ctx,
>> >> >>                                p_ctx->patch_ofports,
>> >> >>                                p_ctx->chassis_tunnels, binding,
>> >> >>                                p_ctx->chassis, &p_ctx->debug,
>> >> >> -                              p_ctx->if_mgr, flow_table,
&ofpacts);
>> >> >> +                              p_ctx->if_mgr,
>> >> >> +                              p_ctx->n_encap_ips,
>> >> >> +                              p_ctx->encap_ips,
>> >> >> +                              flow_table, &ofpacts);
>> >> >>      }
>> >> >>
>> >> >>      /* Handle output to multicast groups, in tables 40 and 41. */
>> >> >> diff --git a/controller/physical.h b/controller/physical.h
>> >> >> index 1f1ed55efa16..7fe8ee3c18ed 100644
>> >> >> --- a/controller/physical.h
>> >> >> +++ b/controller/physical.h
>> >> >> @@ -66,6 +66,8 @@ struct physical_ctx {
>> >> >>      struct shash *local_bindings;
>> >> >>      struct simap *patch_ofports;
>> >> >>      struct hmap *chassis_tunnels;
>> >> >> +    size_t n_encap_ips;
>> >> >> +    const char **encap_ips;
>> >> >>      struct physical_debug debug;
>> >> >>  };
>> >> >>
>> >> >> diff --git a/include/ovn/logical-fields.h
b/include/ovn/logical-fields.h
>> >> >> index 272277ec4fa0..d3455a462a0d 100644
>> >> >> --- a/include/ovn/logical-fields.h
>> >> >> +++ b/include/ovn/logical-fields.h
>> >> >> @@ -37,7 +37,9 @@ enum ovn_controller_event {
>> >> >>  #define MFF_LOG_SNAT_ZONE  MFF_REG12  /* conntrack snat zone for
gateway router
>> >> >>                                         * (32 bits). */
>> >> >>  #define MFF_LOG_CT_ZONE    MFF_REG13  /* Logical conntrack zone
for lports
>> >> >> -                                       * (32 bits). */
>> >> >> +                                       * (0..15 of the 32 bits).
*/
>> >> >> +#define MFF_LOG_ENCAP_ID   MFF_REG13  /* Encap ID for lports
>> >> >> +                                       * (16..31 of the 32 bits).
*/
>> >> >>  #define MFF_LOG_INPORT     MFF_REG14  /* Logical input port (32
bits). */
>> >> >>  #define MFF_LOG_OUTPORT    MFF_REG15  /* Logical output port (32
bits). */
>> >> >>
>> >> >> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
>> >> >> index 96294fe2b795..bfd8680cedfc 100644
>> >> >> --- a/ovn-architecture.7.xml
>> >> >> +++ b/ovn-architecture.7.xml
>> >> >> @@ -1247,8 +1247,8 @@
>> >> >>        chassis.  This is initialized to 0 at the beginning of the
logical
>> >> >>          <!-- Keep the following in sync with MFF_LOG_CT_ZONE in
>> >> >>               ovn/lib/logical-fields.h. -->
>> >> >> -      ingress pipeline.  OVN stores this in Open vSwitch
extension register
>> >> >> -      number 13.
>> >> >> +      ingress pipeline.  OVN stores this in the lower 16 bits of
the Open
>> >> >> +      vSwitch extension register number 13.
>> >> >>      </dd>
>> >> >>
>> >> >>      <dt>conntrack zone fields for routers</dt>
>> >> >> @@ -1263,6 +1263,20 @@
>> >> >>        traffic (for SNATing) in Open vSwitch extension register
number 12.
>> >> >>      </dd>
>> >> >>
>> >> >> +    <dt>Encap ID for logical ports</dt>
>> >> >> +    <dd>
>> >> >> +      A field that records an ID that indicates which
encapsulation IP should
>> >> >> +      be used when sending packets to a remote chassis, according
to the
>> >> >> +      original input logical port. This is useful when there are
multiple IPs
>> >> >> +      available for encapsulation. The value only has local
significance and is
>> >> >> +      not meaningful between chassis. This is initialized to 0 at
the beginning
>> >> >> +      of the logical
>> >> >> +        <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in
>> >> >> +             ovn/lib/logical-fields.h. -->
>> >> >> +      ingress pipeline.  OVN stores this in the higher 16 bits of
the Open
>> >> >> +      vSwitch extension register number 13.
>> >> >> +    </dd>
>> >> >> +
>> >> >>      <dt>logical flow flags</dt>
>> >> >>      <dd>
>> >> >>        The logical flags are intended to handle keeping context
between
>> >> >> diff --git a/tests/ovn.at b/tests/ovn.at
>> >> >> index 243fe0b8246c..e7f0c9681f60 100644
>> >> >> --- a/tests/ovn.at
>> >> >> +++ b/tests/ovn.at
>> >> >> @@ -30335,19 +30335,33 @@ AT_CLEANUP
>> >> >>
>> >> >>
>> >> >>  OVN_FOR_EACH_NORTHD([
>> >> >> -AT_SETUP([multiple encap ips tunnel creation])
>> >> >> +AT_SETUP([multiple encap ips selection based on VIF's encap_ip])
>> >> >> +AT_SKIP_IF([test $HAVE_SCAPY = no])
>> >> >>  ovn_start
>> >> >>  net_add n1
>> >> >>
>> >> >> +ovn-nbctl ls-add ls1
>> >> >> +
>> >> >>  # 2 HVs, each with 2 encap-ips.
>> >> >> +# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY.
>> >> >>  for i in 1 2; do
>> >> >>      sim_add hv$i
>> >> >>      as hv$i
>> >> >>      ovs-vsctl add-br br-phys-$j
>> >> >>      ovn_attach n1 br-phys-$j 192.168.0.${i}1
>> >> >>      ovs-vsctl set open .
external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2
>> >> >> +
>> >> >> +    for j in 1 2; do
>> >> >> +        ovs-vsctl add-port br-int vif$i$j -- set Interface
vif$i$j \
>> >> >> +            external_ids:iface-id=lsp$i$j \
>> >> >> +            external_ids:encap-ip=192.168.0.$i$j \
>> >> >> +            options:tx_pcap=hv$i/vif$i$j-tx.pcap
options:rxq_pcap=hv$i/vif$i$j-rx.pcap
>> >> >> +        ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses
lsp$i$j "f0:00:00:00:00:$i$j 10.0.0.$i$j"
>> >> >> +
>> >> >> +    done
>> >> >>  done
>> >> >>
>> >> >> +wait_for_ports_up
>> >> >>  check ovn-nbctl --wait=hv sync
>> >> >>
>> >> >>  check_tunnel_port() {
>> >> >> @@ -30376,6 +30390,41 @@ check_tunnel_port hv2 br-int
hv1@192.168.0.12%192.168.0.21
>> >> >>  check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22
>> >> >>  check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22
>> >> >>
>> >> >> +vif_to_hv() {
>> >> >> +    case $1 in dnl (
>> >> >> +        vif[[1]]?) echo hv1 ;; dnl (
>> >> >> +        vif[[2]]?) echo hv2 ;; dnl (
>> >> >> +        *) AT_FAIL_IF([:]) ;;
>> >> >> +    esac
>> >> >> +}
>> >> >> +
>> >> >> +# check_packet_tunnel SRC_VIF DST_VIF
>> >> >> +# Verify that a packet from SRC_VIF to DST_VIF goes through the
corresponding
>> >> >> +# tunnel that matches the VIFs' encap_ip configurations.
>> >> >> +check_packet_tunnel() {
>> >> >> +    local src=$1 dst=$2
>> >> >> +    local src_mac=f0:00:00:00:00:$src
>> >> >> +    local dst_mac=f0:00:00:00:00:$dst
>> >> >> +    local src_ip=10.0.0.$src
>> >> >> +    local dst_ip=10.0.0.$dst
>> >> >> +    local local_encap_ip=192.168.0.$src
>> >> >> +    local remote_encap_ip=192.168.0.$dst
>> >> >> +    local packet=$(fmt_pkt "Ether(dst='${dst_mac}',
src='${src_mac}')/ \
>> >> >> +                            IP(dst='${dst_ip}', src='${src_ip}')/
\
>> >> >> +                            ICMP(type=8)")
>> >> >> +    hv=`vif_to_hv vif$src`
>> >> >> +    as $hv
>> >> >> +    echo "vif$src -> vif$dst should go through tunnel
$local_encap_ip -> $remote_encap_ip"
>> >> >> +    tunnel_ofport=$(ovs-vsctl --bare --column=ofport find
interface options:local_ip=$local_encap_ip
options:remote_ip=$remote_encap_ip)
>> >> >> +    AT_CHECK([test $(ovs-appctl ofproto/trace br-int
in_port=vif$src $packet | grep "output\:" | awk -F ':' '{ print $2 }') ==
$tunnel_ofport])
>> >> >> +}
>> >> >> +
>> >> >> +for i in 1 2; do
>> >> >> +    for j in 1 2; do
>> >> >> +        check_packet_tunnel 1$i 2$j
>> >> >> +    done
>> >> >> +done
>> >> >> +
>> >> >>  OVN_CLEANUP([hv1],[hv2])
>> >> >>  AT_CLEANUP
>> >> >>  ])
>> >> >> --
>> >> >> 2.38.1
>> >> >>
>> >> >> _______________________________________________
>> >> >> dev mailing list
>> >> >> dev@openvswitch.org
>> >> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>> >> >>
>> >> >
>> >> > Thanks,
>> >> > Ales
>> >> >
>> >> > --
>> >> >
>> >> > Ales Musil
>> >> >
>> >> > Senior Software Engineer - OVN Core
>> >> >
>> >> > Red Hat EMEA
>> >> >
>> >> > amusil@redhat.com
>> >
>> >
>> >
>> > --
>> >
>> > Ales Musil
>> >
>> > Senior Software Engineer - OVN Core
>> >
>> > Red Hat EMEA
>> >
>> > amusil@redhat.com
>
>
>
> --
>
> Ales Musil
>
> Senior Software Engineer - OVN Core
>
> Red Hat EMEA
>
> amusil@redhat.com
diff mbox series

Patch

diff --git a/NEWS b/NEWS
index 5f267b4c64cc..5a3eed608617 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,9 @@  Post v23.09.0
   - ovn-northd-ddlog has been removed.
   - A new LSP option "enable_router_port_acl" has been added to enable
     conntrack for the router port whose peer is l3dgw_port if set it true.
+  - Support selecting encapsulation IP based on the source/destination VIF's
+    setting. See ovn-controller(8) 'external_ids:ovn-encap-ip' for more
+    details.
 
 OVN v23.09.0 - 15 Sep 2023
 --------------------------
diff --git a/controller/chassis.c b/controller/chassis.c
index a6f13ccc42d5..55f2beb37674 100644
--- a/controller/chassis.c
+++ b/controller/chassis.c
@@ -61,7 +61,7 @@  struct ovs_chassis_cfg {
 
     /* Set of encap types parsed from the 'ovn-encap-type' external-id. */
     struct sset encap_type_set;
-    /* Set of encap IPs parsed from the 'ovn-encap-type' external-id. */
+    /* Set of encap IPs parsed from the 'ovn-encap-ip' external-id. */
     struct sset encap_ip_set;
     /* Interface type list formatted in the OVN-SB Chassis required format. */
     struct ds iface_types;
diff --git a/controller/local_data.c b/controller/local_data.c
index a9092783958f..8606414f8728 100644
--- a/controller/local_data.c
+++ b/controller/local_data.c
@@ -514,7 +514,7 @@  chassis_tunnels_destroy(struct hmap *chassis_tunnels)
  */
 struct chassis_tunnel *
 chassis_tunnel_find(const struct hmap *chassis_tunnels, const char *chassis_id,
-                    char *remote_encap_ip, char *local_encap_ip)
+                    char *remote_encap_ip, const char *local_encap_ip)
 {
     /*
      * If the specific encap_ip is given, look for the chassisid_ip entry,
diff --git a/controller/local_data.h b/controller/local_data.h
index bab95bcc3824..ca3905bd20e6 100644
--- a/controller/local_data.h
+++ b/controller/local_data.h
@@ -150,7 +150,7 @@  bool local_nonvif_data_handle_ovs_iface_changes(
 struct chassis_tunnel *chassis_tunnel_find(const struct hmap *chassis_tunnels,
                                            const char *chassis_id,
                                            char *remote_encap_ip,
-                                           char *local_encap_ip);
+                                           const char *local_encap_ip);
 
 bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels,
                                const char *chassis_name,
diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
index efa65e3fd927..5ebef048d721 100644
--- a/controller/ovn-controller.8.xml
+++ b/controller/ovn-controller.8.xml
@@ -176,10 +176,32 @@ 
 
       <dt><code>external_ids:ovn-encap-ip</code></dt>
       <dd>
-        The IP address that a chassis should use to connect to this node
-        using encapsulation types specified by
-        <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs
-        may be specified with a comma-separated list.
+        <p>
+          The IP address that a chassis should use to connect to this node
+          using encapsulation types specified by
+          <code>external_ids:ovn-encap-type</code>. Multiple encapsulation IPs
+          may be specified with a comma-separated list.
+        </p>
+        <p>
+          In scenarios where multiple encapsulation IPs are present, distinct
+          tunnels are established for each remote chassis. These tunnels are
+          differentiated by setting unique <code>options:local_ip</code> and
+          <code>options:remote_ip</code> values in the tunnel interface. When
+          transmitting a packet to a remote chassis, the selection of local_ip
+          is guided by the <code>Interface:external_ids:encap-ip</code> from
+          the local OVSDB, corresponding to the VIF originating the packet, if
+          specified. The <code>Interface:external_ids:encap-ip</code> setting
+          of the VIF is also propagated to the <code>Port_Binding</code>
+          table in the OVN SB database via the <code>encap</code> column.
+          Consequently, when a remote chassis needs to send a packet to a
+          port-binding associated with this VIF, it utilizes the tunnel with
+          the appropriate <code>options:remote_ip</code> that matches the
+          <code>ip</code> in <code>Port_Binding:encap</code>. This mechanism
+          is particularly beneficial for chassis with multiple physical
+          interfaces designated for tunneling, where each interface is
+          optimized for handling specific traffic associated with particular
+          VIFs.
+        </p>
       </dd>
 
       <dt><code>external_ids:ovn-encap-df_default</code></dt>
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 856e5e270822..4ab57fe970fe 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -4521,6 +4521,31 @@  struct ed_type_pflow_output {
     struct physical_debug debug;
 };
 
+static void
+parse_encap_ips(const struct ovsrec_open_vswitch_table *ovs_table,
+                size_t *n_encap_ips, const char * **encap_ips)
+{
+    const struct ovsrec_open_vswitch *cfg =
+        ovsrec_open_vswitch_table_first(ovs_table);
+    const char *chassis_id = get_ovs_chassis_id(ovs_table);
+    const char *encap_ips_str =
+        get_chassis_external_id_value(&cfg->external_ids, chassis_id,
+                                      "ovn-encap-ip", NULL);
+    struct sset encap_ip_set;
+    sset_init(&encap_ip_set);
+    sset_from_delimited_string(&encap_ip_set, encap_ips_str,  ",");
+
+    /* Sort the ips so that their index is deterministic. */
+    *encap_ips = sset_sort(&encap_ip_set);
+
+    /* Copy the strings so that we can destroy the sset. */
+    for (size_t i = 0; (*encap_ips)[i]; i++) {
+        (*encap_ips)[i] = xstrdup((*encap_ips)[i]);
+    }
+    *n_encap_ips = sset_count(&encap_ip_set);
+    sset_destroy(&encap_ip_set);
+}
+
 static void init_physical_ctx(struct engine_node *node,
                               struct ed_type_runtime_data *rt_data,
                               struct ed_type_non_vif_data *non_vif_data,
@@ -4572,6 +4597,7 @@  static void init_physical_ctx(struct engine_node *node,
         engine_get_input_data("ct_zones", node);
     struct simap *ct_zones = &ct_zones_data->current;
 
+    parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips);
     p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name;
     p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath;
     p_ctx->port_binding_table = port_binding_table;
@@ -4589,12 +4615,23 @@  static void init_physical_ctx(struct engine_node *node,
     p_ctx->patch_ofports = &non_vif_data->patch_ofports;
     p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
 
+
+
     struct controller_engine_ctx *ctrl_ctx = engine_get_context()->client_ctx;
     p_ctx->if_mgr = ctrl_ctx->if_mgr;
 
     pflow_output_get_debug(node, &p_ctx->debug);
 }
 
+static void
+destroy_physical_ctx(struct physical_ctx *p_ctx)
+{
+    for (size_t i = 0; i < p_ctx->n_encap_ips; i++) {
+        free((char *)(p_ctx->encap_ips[i]));
+    }
+    free(p_ctx->encap_ips);
+}
+
 static void *
 en_pflow_output_init(struct engine_node *node OVS_UNUSED,
                              struct engine_arg *arg OVS_UNUSED)
@@ -4631,6 +4668,7 @@  en_pflow_output_run(struct engine_node *node, void *data)
     struct physical_ctx p_ctx;
     init_physical_ctx(node, rt_data, non_vif_data, &p_ctx);
     physical_run(&p_ctx, pflow_table);
+    destroy_physical_ctx(&p_ctx);
 
     engine_set_node_state(node, EN_UPDATED);
 }
@@ -4675,6 +4713,7 @@  pflow_output_if_status_mgr_handler(struct engine_node *node,
                 bool removed = sbrec_port_binding_is_deleted(binding);
                 if (!physical_handle_flows_for_lport(binding, removed, &p_ctx,
                                                      &pfo->flow_table)) {
+                    destroy_physical_ctx(&p_ctx);
                     return false;
                 }
             }
@@ -4684,11 +4723,13 @@  pflow_output_if_status_mgr_handler(struct engine_node *node,
             bool removed = sbrec_port_binding_is_deleted(pb);
             if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
                                                  &pfo->flow_table)) {
+                destroy_physical_ctx(&p_ctx);
                 return false;
             }
         }
         engine_set_node_state(node, EN_UPDATED);
     }
+    destroy_physical_ctx(&p_ctx);
     return true;
 }
 
@@ -4715,11 +4756,13 @@  pflow_output_sb_port_binding_handler(struct engine_node *node,
         bool removed = sbrec_port_binding_is_deleted(pb);
         if (!physical_handle_flows_for_lport(pb, removed, &p_ctx,
                                              &pfo->flow_table)) {
+            destroy_physical_ctx(&p_ctx);
             return false;
         }
     }
 
     engine_set_node_state(node, EN_UPDATED);
+    destroy_physical_ctx(&p_ctx);
     return true;
 }
 
@@ -4739,6 +4782,7 @@  pflow_output_sb_multicast_group_handler(struct engine_node *node, void *data)
     physical_handle_mc_group_changes(&p_ctx, &pfo->flow_table);
 
     engine_set_node_state(node, EN_UPDATED);
+    destroy_physical_ctx(&p_ctx);
     return true;
 }
 
@@ -4771,6 +4815,7 @@  pflow_output_runtime_data_handler(struct engine_node *node, void *data)
         if (tdp->tracked_type != TRACKED_RESOURCE_UPDATED) {
             /* Fall back to full recompute when a local datapath
              * is added or deleted. */
+            destroy_physical_ctx(&p_ctx);
             return false;
         }
 
@@ -4781,12 +4826,14 @@  pflow_output_runtime_data_handler(struct engine_node *node, void *data)
                 lport->tracked_type == TRACKED_RESOURCE_REMOVED ? true: false;
             if (!physical_handle_flows_for_lport(lport->pb, removed, &p_ctx,
                                                  &pfo->flow_table)) {
+                destroy_physical_ctx(&p_ctx);
                 return false;
             }
         }
     }
 
     engine_set_node_state(node, EN_UPDATED);
+    destroy_physical_ctx(&p_ctx);
     return true;
 }
 
@@ -4843,12 +4890,14 @@  pflow_output_activated_ports_handler(struct engine_node *node, void *data)
         if (pb) {
             if (!physical_handle_flows_for_lport(pb, false, &p_ctx,
                                                  &pfo->flow_table)) {
+                destroy_physical_ctx(&p_ctx);
                 return false;
             }
             tag_port_as_activated_in_engine(pp);
         }
     }
     engine_set_node_state(node, EN_UPDATED);
+    destroy_physical_ctx(&p_ctx);
     return true;
 }
 
diff --git a/controller/physical.c b/controller/physical.c
index e93bfbd2cffb..c192aed751d5 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -71,6 +71,8 @@  struct tunnel {
 static void
 load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
                               const struct zone_ids *zone_ids,
+                              size_t n_encap_ips,
+                              const char **encap_ips,
                               struct ofpbuf *ofpacts_p);
 static int64_t get_vxlan_port_key(int64_t port_key);
 
@@ -138,14 +140,14 @@  put_resubmit(uint8_t table_id, struct ofpbuf *ofpacts)
  */
 static struct chassis_tunnel *
 get_port_binding_tun(const struct sbrec_encap *remote_encap,
-                     const struct sbrec_encap *local_encap,
                      const struct sbrec_chassis *chassis,
-                     const struct hmap *chassis_tunnels)
+                     const struct hmap *chassis_tunnels,
+                     const char *local_encap_ip)
 {
     struct chassis_tunnel *tun = NULL;
     tun = chassis_tunnel_find(chassis_tunnels, chassis->name,
                               remote_encap ? remote_encap->ip : NULL,
-                              local_encap ? local_encap->ip : NULL);
+                              local_encap_ip);
 
     if (!tun) {
         tun = chassis_tunnel_find(chassis_tunnels, chassis->name, NULL, NULL);
@@ -329,7 +331,8 @@  find_additional_encap_for_chassis(const struct sbrec_port_binding *pb,
 static struct ovs_list *
 get_remote_tunnels(const struct sbrec_port_binding *binding,
                    const struct sbrec_chassis *chassis,
-                   const struct hmap *chassis_tunnels)
+                   const struct hmap *chassis_tunnels,
+                   const char *local_encap_ip)
 {
     const struct chassis_tunnel *tun;
 
@@ -337,8 +340,8 @@  get_remote_tunnels(const struct sbrec_port_binding *binding,
     ovs_list_init(tunnels);
 
     if (binding->chassis && binding->chassis != chassis) {
-        tun = get_port_binding_tun(binding->encap, NULL, binding->chassis,
-                                   chassis_tunnels);
+        tun = get_port_binding_tun(binding->encap, binding->chassis,
+                chassis_tunnels, local_encap_ip);
         if (!tun) {
             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
             VLOG_WARN_RL(
@@ -358,9 +361,9 @@  get_remote_tunnels(const struct sbrec_port_binding *binding,
         }
         const struct sbrec_encap *additional_encap;
         additional_encap = find_additional_encap_for_chassis(binding, chassis);
-        tun = get_port_binding_tun(additional_encap, NULL,
+        tun = get_port_binding_tun(additional_encap,
                                    binding->additional_chassis[i],
-                                   chassis_tunnels);
+                                   chassis_tunnels, local_encap_ip);
         if (!tun) {
             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
             VLOG_WARN_RL(
@@ -384,36 +387,48 @@  put_remote_port_redirect_overlay(const struct sbrec_port_binding *binding,
                                  struct ofpbuf *ofpacts_p,
                                  const struct sbrec_chassis *chassis,
                                  const struct hmap *chassis_tunnels,
+                                 size_t n_encap_ips,
+                                 const char **encap_ips,
                                  struct ovn_desired_flow_table *flow_table)
 {
     /* Setup encapsulation */
-    struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
-                                               chassis_tunnels);
-    if (!ovs_list_is_empty(tuns)) {
-        bool is_vtep_port = !strcmp(binding->type, "vtep");
-        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND
-         * responder in L3 networks. */
-        if (is_vtep_port) {
-            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_p);
-        }
+    for (size_t i = 0; i < n_encap_ips; i++) {
+        struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p);
+
+        match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16,
+                             (uint32_t) 0xFFFF << 16);
+        struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
+                                                   chassis_tunnels,
+                                                   encap_ips[i]);
+        if (!ovs_list_is_empty(tuns)) {
+            bool is_vtep_port = !strcmp(binding->type, "vtep");
+            /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND
+             * responder in L3 networks. */
+            if (is_vtep_port) {
+                put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16,
+                         ofpacts_clone);
+            }
 
-        struct tunnel *tun;
-        LIST_FOR_EACH (tun, list_node, tuns) {
-            put_encapsulation(mff_ovn_geneve, tun->tun,
-                              binding->datapath, port_key, is_vtep_port,
-                              ofpacts_p);
-            ofpact_put_OUTPUT(ofpacts_p)->port = tun->tun->ofport;
+            struct tunnel *tun;
+            LIST_FOR_EACH (tun, list_node, tuns) {
+                put_encapsulation(mff_ovn_geneve, tun->tun,
+                                  binding->datapath, port_key, is_vtep_port,
+                                  ofpacts_clone);
+                ofpact_put_OUTPUT(ofpacts_clone)->port = tun->tun->ofport;
+            }
+            put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone);
+            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
+                            binding->header_.uuid.parts[0], match,
+                            ofpacts_clone, &binding->header_.uuid);
         }
-        put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_p);
-        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
-                        binding->header_.uuid.parts[0], match, ofpacts_p,
-                        &binding->header_.uuid);
-    }
-    struct tunnel *tun_elem;
-    LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
-        free(tun_elem);
+        struct tunnel *tun_elem;
+        LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
+            free(tun_elem);
+        }
+        free(tuns);
+
+        ofpbuf_delete(ofpacts_clone);
     }
-    free(tuns);
 }
 
 static void
@@ -673,7 +688,8 @@  put_replace_chassis_mac_flows(const struct simap *ct_zones,
         if (tag) {
             ofpact_put_STRIP_VLAN(ofpacts_p);
         }
-        load_logical_ingress_metadata(localnet_port, &zone_ids, ofpacts_p);
+        load_logical_ingress_metadata(localnet_port, &zone_ids, 0, NULL,
+                                      ofpacts_p);
         replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p);
         replace_mac->mac = router_port_mac;
 
@@ -853,7 +869,7 @@  put_zones_ofpacts(const struct zone_ids *zone_ids, struct ofpbuf *ofpacts_p)
 {
     if (zone_ids) {
         if (zone_ids->ct) {
-            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
+            put_load(zone_ids->ct, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
         }
         if (zone_ids->dnat) {
             put_load(zone_ids->dnat, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
@@ -1014,9 +1030,23 @@  put_local_common_flows(uint32_t dp_key,
     }
 }
 
+static size_t
+encap_ip_to_id(size_t n_encap_ips, const char **encap_ips,
+               const char *ip)
+{
+    for (size_t i = 0; i < n_encap_ips; i++) {
+        if (!strcmp(ip, encap_ips[i])) {
+            return i;
+        }
+    }
+    return 0;
+}
+
 static void
 load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
                               const struct zone_ids *zone_ids,
+                              size_t n_encap_ips,
+                              const char **encap_ips,
                               struct ofpbuf *ofpacts_p)
 {
     put_zones_ofpacts(zone_ids, ofpacts_p);
@@ -1026,6 +1056,14 @@  load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
     uint32_t port_key = binding->tunnel_key;
     put_load(dp_key, MFF_LOG_DATAPATH, 0, 64, ofpacts_p);
     put_load(port_key, MFF_LOG_INPORT, 0, 32, ofpacts_p);
+
+    /* Default encap_id 0. */
+    size_t encap_id = 0;
+    if (encap_ips && binding->encap) {
+        encap_id = encap_ip_to_id(n_encap_ips, encap_ips,
+                                  binding->encap->ip);
+    }
+    put_load(encap_id, MFF_LOG_ENCAP_ID, 16, 16, ofpacts_p);
 }
 
 static const struct sbrec_port_binding *
@@ -1070,7 +1108,7 @@  setup_rarp_activation_strategy(const struct sbrec_port_binding *binding,
     match_set_dl_type(&match, htons(ETH_TYPE_RARP));
     match_set_in_port(&match, ofport);
 
-    load_logical_ingress_metadata(binding, zone_ids, &ofpacts);
+    load_logical_ingress_metadata(binding, zone_ids, 0, NULL, &ofpacts);
 
     encode_controller_op(ACTION_OPCODE_ACTIVATION_STRATEGY_RARP,
                          NX_CTLR_NO_METER, &ofpacts);
@@ -1384,7 +1422,7 @@  enforce_tunneling_for_multichassis_ports(
     }
 
     struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
-                                               chassis_tunnels);
+                                               chassis_tunnels, NULL);
     if (ovs_list_is_empty(tuns)) {
         free(tuns);
         return;
@@ -1446,6 +1484,8 @@  consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                       const struct sbrec_chassis *chassis,
                       const struct physical_debug *debug,
                       const struct if_status_mgr *if_mgr,
+                      size_t n_encap_ips,
+                      const char **encap_ips,
                       struct ovn_desired_flow_table *flow_table,
                       struct ofpbuf *ofpacts_p)
 {
@@ -1479,9 +1519,10 @@  consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
         ofpact_put_CT_CLEAR(ofpacts_p);
         put_load(0, MFF_LOG_DNAT_ZONE, 0, 32, ofpacts_p);
         put_load(0, MFF_LOG_SNAT_ZONE, 0, 32, ofpacts_p);
-        put_load(0, MFF_LOG_CT_ZONE, 0, 32, ofpacts_p);
+        put_load(0, MFF_LOG_CT_ZONE, 0, 16, ofpacts_p);
         struct zone_ids peer_zones = get_zone_ids(peer, ct_zones);
-        load_logical_ingress_metadata(peer, &peer_zones, ofpacts_p);
+        load_logical_ingress_metadata(peer, &peer_zones, n_encap_ips,
+                                      encap_ips, ofpacts_p);
         put_load(0, MFF_LOG_FLAGS, 0, 32, ofpacts_p);
         put_load(0, MFF_LOG_OUTPORT, 0, 32, ofpacts_p);
         for (int i = 0; i < MFF_N_LOG_REGS; i++) {
@@ -1697,7 +1738,8 @@  consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
          * as we're going to remove this with ofpbuf_pull() later. */
         uint32_t ofpacts_orig_size = ofpacts_p->size;
 
-        load_logical_ingress_metadata(binding, &zone_ids, ofpacts_p);
+        load_logical_ingress_metadata(binding, &zone_ids, n_encap_ips,
+                                      encap_ips, ofpacts_p);
 
         if (!strcmp(binding->type, "localport")) {
             /* mark the packet as incoming from a localport */
@@ -1904,7 +1946,7 @@  consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
     } else {
         put_remote_port_redirect_overlay(
             binding, mff_ovn_geneve, port_key, &match, ofpacts_p,
-            chassis, chassis_tunnels, flow_table);
+            chassis, chassis_tunnels, n_encap_ips, encap_ips, flow_table);
     }
 out:
     if (ha_ch_ordered) {
@@ -2102,7 +2144,7 @@  consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
 
         int zone_id = simap_get(ct_zones, port->logical_port);
         if (zone_id) {
-            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 32, &ofpacts);
+            put_load(zone_id, MFF_LOG_CT_ZONE, 0, 16, &ofpacts);
         }
 
         const char *lport_name = (port->parent_port && *port->parent_port) ?
@@ -2278,7 +2320,10 @@  physical_eval_port_binding(struct physical_ctx *p_ctx,
                           p_ctx->patch_ofports,
                           p_ctx->chassis_tunnels,
                           pb, p_ctx->chassis, &p_ctx->debug,
-                          p_ctx->if_mgr, flow_table, &ofpacts);
+                          p_ctx->if_mgr,
+                          p_ctx->n_encap_ips,
+                          p_ctx->encap_ips,
+                          flow_table, &ofpacts);
     ofpbuf_uninit(&ofpacts);
 }
 
@@ -2402,7 +2447,10 @@  physical_run(struct physical_ctx *p_ctx,
                               p_ctx->patch_ofports,
                               p_ctx->chassis_tunnels, binding,
                               p_ctx->chassis, &p_ctx->debug,
-                              p_ctx->if_mgr, flow_table, &ofpacts);
+                              p_ctx->if_mgr,
+                              p_ctx->n_encap_ips,
+                              p_ctx->encap_ips,
+                              flow_table, &ofpacts);
     }
 
     /* Handle output to multicast groups, in tables 40 and 41. */
diff --git a/controller/physical.h b/controller/physical.h
index 1f1ed55efa16..7fe8ee3c18ed 100644
--- a/controller/physical.h
+++ b/controller/physical.h
@@ -66,6 +66,8 @@  struct physical_ctx {
     struct shash *local_bindings;
     struct simap *patch_ofports;
     struct hmap *chassis_tunnels;
+    size_t n_encap_ips;
+    const char **encap_ips;
     struct physical_debug debug;
 };
 
diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h
index 272277ec4fa0..d3455a462a0d 100644
--- a/include/ovn/logical-fields.h
+++ b/include/ovn/logical-fields.h
@@ -37,7 +37,9 @@  enum ovn_controller_event {
 #define MFF_LOG_SNAT_ZONE  MFF_REG12  /* conntrack snat zone for gateway router
                                        * (32 bits). */
 #define MFF_LOG_CT_ZONE    MFF_REG13  /* Logical conntrack zone for lports
-                                       * (32 bits). */
+                                       * (0..15 of the 32 bits). */
+#define MFF_LOG_ENCAP_ID   MFF_REG13  /* Encap ID for lports
+                                       * (16..31 of the 32 bits). */
 #define MFF_LOG_INPORT     MFF_REG14  /* Logical input port (32 bits). */
 #define MFF_LOG_OUTPORT    MFF_REG15  /* Logical output port (32 bits). */
 
diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
index 96294fe2b795..bfd8680cedfc 100644
--- a/ovn-architecture.7.xml
+++ b/ovn-architecture.7.xml
@@ -1247,8 +1247,8 @@ 
       chassis.  This is initialized to 0 at the beginning of the logical
         <!-- Keep the following in sync with MFF_LOG_CT_ZONE in
              ovn/lib/logical-fields.h. -->
-      ingress pipeline.  OVN stores this in Open vSwitch extension register
-      number 13.
+      ingress pipeline.  OVN stores this in the lower 16 bits of the Open
+      vSwitch extension register number 13.
     </dd>
 
     <dt>conntrack zone fields for routers</dt>
@@ -1263,6 +1263,20 @@ 
       traffic (for SNATing) in Open vSwitch extension register number 12.
     </dd>
 
+    <dt>Encap ID for logical ports</dt>
+    <dd>
+      A field that records an ID that indicates which encapsulation IP should
+      be used when sending packets to a remote chassis, according to the
+      original input logical port. This is useful when there are multiple IPs
+      available for encapsulation. The value only has local significance and is
+      not meaningful between chassis. This is initialized to 0 at the beginning
+      of the logical
+        <!-- Keep the following in sync with MFF_LOG_ENCAP_ID in
+             ovn/lib/logical-fields.h. -->
+      ingress pipeline.  OVN stores this in the higher 16 bits of the Open
+      vSwitch extension register number 13.
+    </dd>
+
     <dt>logical flow flags</dt>
     <dd>
       The logical flags are intended to handle keeping context between
diff --git a/tests/ovn.at b/tests/ovn.at
index 243fe0b8246c..e7f0c9681f60 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -30335,19 +30335,33 @@  AT_CLEANUP
 
 
 OVN_FOR_EACH_NORTHD([
-AT_SETUP([multiple encap ips tunnel creation])
+AT_SETUP([multiple encap ips selection based on VIF's encap_ip])
+AT_SKIP_IF([test $HAVE_SCAPY = no])
 ovn_start
 net_add n1
 
+ovn-nbctl ls-add ls1
+
 # 2 HVs, each with 2 encap-ips.
+# lspXY is the Yth LSP on hvX, with encap-ip 192.168.0.XY.
 for i in 1 2; do
     sim_add hv$i
     as hv$i
     ovs-vsctl add-br br-phys-$j
     ovn_attach n1 br-phys-$j 192.168.0.${i}1
     ovs-vsctl set open . external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2
+
+    for j in 1 2; do
+        ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \
+            external_ids:iface-id=lsp$i$j \
+            external_ids:encap-ip=192.168.0.$i$j \
+            options:tx_pcap=hv$i/vif$i$j-tx.pcap options:rxq_pcap=hv$i/vif$i$j-rx.pcap
+        ovn-nbctl lsp-add ls1 lsp$i$j -- lsp-set-addresses lsp$i$j "f0:00:00:00:00:$i$j 10.0.0.$i$j"
+
+    done
 done
 
+wait_for_ports_up
 check ovn-nbctl --wait=hv sync
 
 check_tunnel_port() {
@@ -30376,6 +30390,41 @@  check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.21
 check_tunnel_port hv2 br-int hv1@192.168.0.11%192.168.0.22
 check_tunnel_port hv2 br-int hv1@192.168.0.12%192.168.0.22
 
+vif_to_hv() {
+    case $1 in dnl (
+        vif[[1]]?) echo hv1 ;; dnl (
+        vif[[2]]?) echo hv2 ;; dnl (
+        *) AT_FAIL_IF([:]) ;;
+    esac
+}
+
+# check_packet_tunnel SRC_VIF DST_VIF
+# Verify that a packet from SRC_VIF to DST_VIF goes through the corresponding
+# tunnel that matches the VIFs' encap_ip configurations.
+check_packet_tunnel() {
+    local src=$1 dst=$2
+    local src_mac=f0:00:00:00:00:$src
+    local dst_mac=f0:00:00:00:00:$dst
+    local src_ip=10.0.0.$src
+    local dst_ip=10.0.0.$dst
+    local local_encap_ip=192.168.0.$src
+    local remote_encap_ip=192.168.0.$dst
+    local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \
+                            IP(dst='${dst_ip}', src='${src_ip}')/ \
+                            ICMP(type=8)")
+    hv=`vif_to_hv vif$src`
+    as $hv
+    echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> $remote_encap_ip"
+    tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip)
+    AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet | grep "output\:" | awk -F ':' '{ print $2 }') == $tunnel_ofport])
+}
+
+for i in 1 2; do
+    for j in 1 2; do
+        check_packet_tunnel 1$i 2$j
+    done
+done
+
 OVN_CLEANUP([hv1],[hv2])
 AT_CLEANUP
 ])